Updated embedding model (text-embedding-3-small) and OpenAI API access
updated deprecated model name

replaced ivfflat with StreamingDiskANN

final fixes

Updated embedding model (text-embedding-3-small), LLM model (gpt-4o), and OpenAI API access
Hervé Ishimwe authored and committed Oct 2, 2024
1 parent 84a156e commit 73cbaf7
Showing 4 changed files with 197 additions and 187 deletions.
258 changes: 129 additions & 129 deletions openai_pgvector_helloworld/blog_data_and_embeddings.csv

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion openai_pgvector_helloworld/blog_data_and_embeddings.json

Large diffs are not rendered by default.

122 changes: 66 additions & 56 deletions openai_pgvector_helloworld/openai_pgvector_helloworld.ipynb
@@ -37,16 +37,21 @@
"- Signup for an OpenAI Developer Account and create an API Key. See [OpenAI's developer platform](https://platform.openai.com/overview).\n",
"- Install Python\n",
"- Install and configure a python virtual environment. We recommend [Pyenv](https://github.com/pyenv/pyenv)\n",
"- Install the requirements for this notebook using the following command:\n",
"\n",
"```\n",
"pip install -r requirements.txt\n",
"```"
"- Install the requirements for this notebook using the following command:"
]
},
{
"cell_type": "code",
"execution_count": 188,
"metadata": {},
"outputs": [],
"source": [
"%pip install -r requirements.txt"
]
},
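For context on the API-access part of this change: the setup cells that follow are collapsed in this diff. Below is a minimal sketch of how the key can be loaded, assuming a local `.env` file containing `OPENAI_API_KEY` (python-dotenv is already in requirements.txt, and `OPENAI_API_KEY` is the variable the v1 `openai.OpenAI()` client reads by default):

```python
# Sketch only: load the OpenAI key from a local .env file.
# Assumes a .env file containing a line like OPENAI_API_KEY=sk-...
import os
from dotenv import load_dotenv
import openai

load_dotenv()  # reads .env into the process environment
# openai.OpenAI() picks up OPENAI_API_KEY from the environment automatically;
# passing it explicitly just makes the dependency visible.
client = openai.OpenAI(api_key=os.environ["OPENAI_API_KEY"])
```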
{
"cell_type": "code",
"execution_count": null,
"execution_count": 189,
"metadata": {},
"outputs": [],
"source": [
@@ -66,7 +71,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 190,
"metadata": {},
"outputs": [],
"source": [
@@ -89,7 +94,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 191,
"metadata": {},
"outputs": [],
"source": [
@@ -112,7 +117,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 192,
"metadata": {},
"outputs": [],
"source": [
@@ -137,7 +142,7 @@
"# Assumes we're using the text-embedding-ada-002 model\n",
"# See https://openai.com/pricing\n",
"def get_embedding_cost(num_tokens):\n",
" return num_tokens/1000*0.0001\n",
" return num_tokens/1000*0.00002\n",
"\n",
"# Helper function: calculate total cost of embedding all content in the dataframe\n",
"def get_total_embeddings_cost():\n",
@@ -147,21 +152,12 @@
" token_len = num_tokens_from_string(text)\n",
" total_tokens = total_tokens + token_len\n",
" total_cost = get_embedding_cost(total_tokens)\n",
" return total_cost\n",
"\n",
"# Helper function: get embeddings for a text\n",
"def get_embeddings(text):\n",
" response = openai.Embedding.create(\n",
" model=\"text-embedding-ada-002\",\n",
" input = text.replace(\"\\n\",\" \")\n",
" )\n",
" embedding = response['data'][0]['embedding']\n",
" return embedding"
" return total_cost"
]
},
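As a quick sanity check on the updated rate (0.00002 per 1,000 tokens corresponds to text-embedding-3-small's $0.020 per million tokens at the time of this commit), the helper above works out as follows; the token count is just an illustrative figure:

```python
# Mirrors the get_embedding_cost helper above; 250,000 tokens is an illustrative figure.
def get_embedding_cost(num_tokens):
    return num_tokens / 1000 * 0.00002

print(get_embedding_cost(250_000))  # 0.005 -> roughly half a cent
```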
{
"cell_type": "code",
"execution_count": null,
"execution_count": 193,
"metadata": {},
"outputs": [],
"source": [
@@ -189,14 +185,14 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 194,
"metadata": {},
"outputs": [],
"source": [
"###############################################################################\n",
"# Create new list with small content chunks to not hit max token limits\n",
"# Note: the maximum number of tokens for a single request is 8191\n",
"# https://openai.com/docs/api-reference/requests\n",
"# https://platform.openai.com/docs/guides/embeddings/embedding-models\n",
"###############################################################################\n",
"# list for chunked content and embeddings\n",
"new_list = []\n",
@@ -241,7 +237,24 @@
},
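The chunking cell above is largely collapsed in this diff. For reference, a token-aware splitter along the same lines might look like the sketch below (not the notebook's exact logic; `cl100k_base` is the tiktoken encoding used by the OpenAI embedding models, and the 512-token chunk size is an arbitrary example):

```python
import tiktoken

# Sketch: split text into chunks of at most max_tokens tokens so a single
# embeddings request stays under the model's input limit.
def split_into_chunks(text, max_tokens=512, encoding_name="cl100k_base"):
    enc = tiktoken.get_encoding(encoding_name)
    tokens = enc.encode(text)
    return [enc.decode(tokens[i:i + max_tokens])
            for i in range(0, len(tokens), max_tokens)]
```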
{
"cell_type": "code",
"execution_count": null,
"execution_count": 195,
"metadata": {},
"outputs": [],
"source": [
"openai_client = openai.OpenAI()\n",
"\n",
"# Helper function: get embeddings for a text\n",
"def get_embeddings(text):\n",
" response = openai_client.embeddings.create(\n",
" model=\"text-embedding-3-small\",\n",
" input = text.replace(\"\\n\",\" \")\n",
" )\n",
" return response.data[0].embedding"
]
},
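A quick way to exercise the rewritten helper and confirm the vector size (text-embedding-3-small returns 1536-dimensional embeddings at its default settings; the query string is just an example):

```python
# Assumes the openai_client and get_embeddings defined in the cell above
vec = get_embeddings("How do I run a similarity search with pgvector?")
print(len(vec))  # 1536 for text-embedding-3-small at the default dimensionality
```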
{
"cell_type": "code",
"execution_count": 196,
"metadata": {},
"outputs": [],
"source": [
@@ -258,14 +271,14 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 197,
"metadata": {},
"outputs": [],
"source": [
"# Save the dataframe with embeddings as a CSV file\n",
"df_new.to_csv('blog_data_and_embeddings.csv', index=False)\n",
"# It may also be useful to save as a json file, but we won't use this in the tutorial\n",
"#df_new.to_json('blog_data_and_embeddings.json')"
"#df_new.to_json('blog_data_and_embeddings.json') "
]
},
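One note on the saved CSV: pandas serializes the embedding column as a string, so it has to be parsed back into a list of floats when the file is reloaded. A sketch, assuming the column is named `embeddings` (adjust to the actual column name the notebook writes):

```python
import ast
import pandas as pd

# Sketch: reload the CSV and parse the stringified embedding column back into lists of floats.
# The column name 'embeddings' is an assumption; use whatever name the notebook actually writes.
df_loaded = pd.read_csv('blog_data_and_embeddings.csv')
df_loaded['embeddings'] = df_loaded['embeddings'].apply(ast.literal_eval)
```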
{
@@ -291,7 +304,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 198,
"metadata": {},
"outputs": [],
"source": [
@@ -304,7 +317,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 199,
"metadata": {},
"outputs": [],
"source": [
@@ -313,7 +326,11 @@
"cur = conn.cursor()\n",
"\n",
"#install pgvector \n",
"cur.execute(\"CREATE EXTENSION IF NOT EXISTS vector\");\n",
"cur.execute(\"CREATE EXTENSION IF NOT EXISTS vector;\")\n",
"conn.commit()\n",
"\n",
"#install pgvectorscale \n",
"cur.execute(\"CREATE EXTENSION IF NOT EXISTS vectorscale CASCADE;\")\n",
"conn.commit()\n",
"\n",
"# Register the vector type with psycopg2\n",
@@ -346,7 +363,7 @@
},
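Because the pgvectorscale line uses `CASCADE`, it should also pull in the `vector` extension if it were missing. A quick confirmation that both extensions are installed, using the same cursor as above (a standard catalog query, shown here only as a sketch):

```python
# Sketch: confirm that pgvector and pgvectorscale are installed in the database
cur.execute(
    "SELECT extname, extversion FROM pg_extension "
    "WHERE extname IN ('vector', 'vectorscale');"
)
for name, version in cur.fetchall():
    print(name, version)
```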
{
"cell_type": "code",
"execution_count": null,
"execution_count": 200,
"metadata": {},
"outputs": [],
"source": [
@@ -381,7 +398,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 201,
"metadata": {},
"outputs": [],
"source": [
@@ -391,7 +408,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 202,
"metadata": {},
"outputs": [],
"source": [
@@ -409,7 +426,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 203,
"metadata": {},
"outputs": [],
"source": [
@@ -433,7 +450,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 204,
"metadata": {},
"outputs": [],
"source": [
@@ -445,7 +462,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 205,
"metadata": {},
"outputs": [],
"source": [
@@ -465,24 +482,17 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 206,
"metadata": {},
"outputs": [],
"source": [
"# Create an index on the data for faster retrieval\n",
"# this isn't really needed for 129 vectors, but it shows the usage for larger datasets\n",
"# Note: always create this type of index after you have data already inserted into the DB\n",
"\n",
"#calculate the index parameters according to best practices\n",
"num_lists = num_records / 1000\n",
"if num_lists < 10:\n",
" num_lists = 10\n",
"if num_records > 1000000:\n",
" num_lists = math.sqrt(num_records)\n",
"\n",
"#use the cosine distance measure, which is what we'll later use for querying\n",
"cur.execute(f'CREATE INDEX ON embeddings USING ivfflat (embedding vector_cosine_ops) WITH (lists = {num_lists});')\n",
"conn.commit() "
"# for different tuning suggestions check this: https://github.com/timescale/pgvectorscale?tab=readme-ov-file#tuning\n",
"cur.execute('CREATE INDEX embedding_idx ON embeddings USING diskann (embedding);')\n",
"conn.commit()"
]
},
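The StreamingDiskANN index accelerates the cosine-distance (`<=>`) lookups used later in the notebook. A minimal sketch of such a query, assuming the `embeddings` table from the cells above has a text column named `content` and that `register_vector(conn)` has already been called:

```python
import numpy as np

# Sketch: top-3 nearest neighbours by cosine distance; 'content' is an assumed column name
query_vec = np.array(get_embeddings("What is Timescale used for?"))
cur.execute(
    "SELECT content FROM embeddings ORDER BY embedding <=> %s LIMIT 3;",
    (query_vec,),
)
for (content,) in cur.fetchall():
    print(content[:80])
```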
{
@@ -499,26 +509,26 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 207,
"metadata": {},
"outputs": [],
"source": [
"# Helper function: get text completion from OpenAI API\n",
"# Note max tokens is 4097\n",
"# Note we're using the latest gpt-3.5-turbo-0613 model\n",
"def get_completion_from_messages(messages, model=\"gpt-3.5-turbo-0613\", temperature=0, max_tokens=1000):\n",
" response = openai.ChatCompletion.create(\n",
"def get_completion_from_messages(messages, model=\"gpt-4o\", temperature=0, max_tokens=1000):\n",
" response = openai_client.chat.completions.create(\n",
" model=model,\n",
" messages=messages,\n",
" temperature=temperature, \n",
" max_tokens=max_tokens, \n",
" )\n",
" return response.choices[0].message[\"content\"]"
" return response.choices[0].message.content"
]
},
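For reference, a minimal call to the updated completion helper, using the standard chat-completions message format (the prompt text is only an illustration):

```python
# Example usage of get_completion_from_messages defined above
messages = [
    {"role": "system", "content": "You are a helpful assistant that answers questions about Timescale."},
    {"role": "user", "content": "What is pgvectorscale and when should I use it?"},
]
print(get_completion_from_messages(messages))
```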
{
"cell_type": "code",
"execution_count": null,
"execution_count": 208,
"metadata": {},
"outputs": [],
"source": [
@@ -547,7 +557,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 209,
"metadata": {},
"outputs": [],
"source": [
@@ -557,7 +567,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 210,
"metadata": {},
"outputs": [],
"source": [
@@ -590,7 +600,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 211,
"metadata": {},
"outputs": [],
"source": [
@@ -601,7 +611,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 212,
"metadata": {},
"outputs": [],
"source": [
@@ -629,7 +639,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.16"
"version": "3.9.6"
}
},
"nbformat": 4,
2 changes: 1 addition & 1 deletion openai_pgvector_helloworld/requirements.txt
@@ -2,6 +2,6 @@ openai
pandas
numpy
tiktoken
psycopg2
psycopg2-binary
pgvector
python-dotenv
