Updated embedding model (text-embedding-3-small) and OpenAI API access
updated deprecated model name

replaced ivfflat with StreamingDiskANN

final fixes

Updated embedding model (text-embedding-3-small), LLM model (gpt-4o), and OpenAI API access
Hervé Ishimwe authored and committed Oct 2, 2024
1 parent 84a156e commit 73cbaf7
Showing 4 changed files with 197 additions and 187 deletions.
258 changes: 129 additions & 129 deletions openai_pgvector_helloworld/blog_data_and_embeddings.csv

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion openai_pgvector_helloworld/blog_data_and_embeddings.json

Large diffs are not rendered by default.

122 changes: 66 additions & 56 deletions openai_pgvector_helloworld/openai_pgvector_helloworld.ipynb
@@ -37,16 +37,21 @@
"- Signup for an OpenAI Developer Account and create an API Key. See [OpenAI's developer platform](https://platform.openai.com/overview).\n",
"- Install Python\n",
"- Install and configure a python virtual environment. We recommend [Pyenv](https://github.com/pyenv/pyenv)\n",
"- Install the requirements for this notebook using the following command:\n",
"\n",
"```\n",
"pip install -r requirements.txt\n",
"```"
"- Install the requirements for this notebook using the following command:"
]
},
{
"cell_type": "code",
"execution_count": 188,
"metadata": {},
"outputs": [],
"source": [
"%pip install -r requirements.txt"
]
},
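For context on the API-access part of this change: the setup cells that follow are collapsed in this diff. Below is a minimal sketch of how the key can be loaded, assuming a local `.env` file containing `OPENAI_API_KEY` (python-dotenv is already in requirements.txt, and `OPENAI_API_KEY` is the variable the v1 `openai.OpenAI()` client reads by default):

```python
# Sketch only: load the OpenAI key from a local .env file.
# Assumes a .env file containing a line like OPENAI_API_KEY=sk-...
import os
from dotenv import load_dotenv
import openai

load_dotenv()  # reads .env into the process environment
# openai.OpenAI() picks up OPENAI_API_KEY from the environment automatically;
# passing it explicitly just makes the dependency visible.
client = openai.OpenAI(api_key=os.environ["OPENAI_API_KEY"])
```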
{
"cell_type": "code",
"execution_count": null,
"execution_count": 189,
"metadata": {},
"outputs": [],
"source": [
@@ -66,7 +71,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 190,
"metadata": {},
"outputs": [],
"source": [
@@ -89,7 +94,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 191,
"metadata": {},
"outputs": [],
"source": [
@@ -112,7 +117,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 192,
"metadata": {},
"outputs": [],
"source": [
@@ -137,7 +142,7 @@
"# Assumes we're using the text-embedding-ada-002 model\n",
"# See https://openai.com/pricing\n",
"def get_embedding_cost(num_tokens):\n",
" return num_tokens/1000*0.0001\n",
" return num_tokens/1000*0.00002\n",
"\n",
"# Helper function: calculate total cost of embedding all content in the dataframe\n",
"def get_total_embeddings_cost():\n",
@@ -147,21 +152,12 @@
" token_len = num_tokens_from_string(text)\n",
" total_tokens = total_tokens + token_len\n",
" total_cost = get_embedding_cost(total_tokens)\n",
" return total_cost\n",
"\n",
"# Helper function: get embeddings for a text\n",
"def get_embeddings(text):\n",
" response = openai.Embedding.create(\n",
" model=\"text-embedding-ada-002\",\n",
" input = text.replace(\"\\n\",\" \")\n",
" )\n",
" embedding = response['data'][0]['embedding']\n",
" return embedding"
" return total_cost"
]
},
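As a quick sanity check on the updated rate (0.00002 per 1,000 tokens corresponds to text-embedding-3-small's $0.020 per million tokens at the time of this commit), the helper above works out as follows; the token count is just an illustrative figure:

```python
# Mirrors the get_embedding_cost helper above; 250,000 tokens is an illustrative figure.
def get_embedding_cost(num_tokens):
    return num_tokens / 1000 * 0.00002

print(get_embedding_cost(250_000))  # 0.005 -> roughly half a cent
```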
{
"cell_type": "code",
"execution_count": null,
"execution_count": 193,
"metadata": {},
"outputs": [],
"source": [
@@ -189,14 +185,14 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 194,
"metadata": {},
"outputs": [],
"source": [
"###############################################################################\n",
"# Create new list with small content chunks to not hit max token limits\n",
"# Note: the maximum number of tokens for a single request is 8191\n",
"# https://openai.com/docs/api-reference/requests\n",
"# https://platform.openai.com/docs/guides/embeddings/embedding-models\n",
"###############################################################################\n",
"# list for chunked content and embeddings\n",
"new_list = []\n",
@@ -241,7 +237,24 @@
},
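The chunking cell above is largely collapsed in this diff. For reference, a token-aware splitter along the same lines might look like the sketch below (not the notebook's exact logic; `cl100k_base` is the tiktoken encoding used by the OpenAI embedding models, and the 512-token chunk size is an arbitrary example):

```python
import tiktoken

# Sketch: split text into chunks of at most max_tokens tokens so a single
# embeddings request stays under the model's input limit.
def split_into_chunks(text, max_tokens=512, encoding_name="cl100k_base"):
    enc = tiktoken.get_encoding(encoding_name)
    tokens = enc.encode(text)
    return [enc.decode(tokens[i:i + max_tokens])
            for i in range(0, len(tokens), max_tokens)]
```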
{
"cell_type": "code",
"execution_count": null,
"execution_count": 195,
"metadata": {},
"outputs": [],
"source": [
"openai_client = openai.OpenAI()\n",
"\n",
"# Helper function: get embeddings for a text\n",
"def get_embeddings(text):\n",
" response = openai_client.embeddings.create(\n",
" model=\"text-embedding-3-small\",\n",
" input = text.replace(\"\\n\",\" \")\n",
" )\n",
" return response.data[0].embedding"
]
},
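A quick way to exercise the rewritten helper and confirm the vector size (text-embedding-3-small returns 1536-dimensional embeddings at its default settings; the query string is just an example):

```python
# Assumes the openai_client and get_embeddings defined in the cell above
vec = get_embeddings("How do I run a similarity search with pgvector?")
print(len(vec))  # 1536 for text-embedding-3-small at the default dimensionality
```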
{
"cell_type": "code",
"execution_count": 196,
"metadata": {},
"outputs": [],
"source": [
@@ -258,14 +271,14 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 197,
"metadata": {},
"outputs": [],
"source": [
"# Save the dataframe with embeddings as a CSV file\n",
"df_new.to_csv('blog_data_and_embeddings.csv', index=False)\n",
"# It may also be useful to save as a json file, but we won't use this in the tutorial\n",
"#df_new.to_json('blog_data_and_embeddings.json')"
"#df_new.to_json('blog_data_and_embeddings.json') "
]
},
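One note on the saved CSV: pandas serializes the embedding column as a string, so it has to be parsed back into a list of floats when the file is reloaded. A sketch, assuming the column is named `embeddings` (adjust to the actual column name the notebook writes):

```python
import ast
import pandas as pd

# Sketch: reload the CSV and parse the stringified embedding column back into lists of floats.
# The column name 'embeddings' is an assumption; use whatever name the notebook actually writes.
df_loaded = pd.read_csv('blog_data_and_embeddings.csv')
df_loaded['embeddings'] = df_loaded['embeddings'].apply(ast.literal_eval)
```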
{
@@ -291,7 +304,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 198,
"metadata": {},
"outputs": [],
"source": [
@@ -304,7 +317,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 199,
"metadata": {},
"outputs": [],
"source": [
@@ -313,7 +326,11 @@
"cur = conn.cursor()\n",
"\n",
"#install pgvector \n",
"cur.execute(\"CREATE EXTENSION IF NOT EXISTS vector\");\n",
"cur.execute(\"CREATE EXTENSION IF NOT EXISTS vector;\")\n",
"conn.commit()\n",
"\n",
"#install pgvectorscale \n",
"cur.execute(\"CREATE EXTENSION IF NOT EXISTS vectorscale CASCADE;\")\n",
"conn.commit()\n",
"\n",
"# Register the vector type with psycopg2\n",
@@ -346,7 +363,7 @@
},
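Because the pgvectorscale line uses `CASCADE`, it should also pull in the `vector` extension if it were missing. A quick confirmation that both extensions are installed, using the same cursor as above (a standard catalog query, shown here only as a sketch):

```python
# Sketch: confirm that pgvector and pgvectorscale are installed in the database
cur.execute(
    "SELECT extname, extversion FROM pg_extension "
    "WHERE extname IN ('vector', 'vectorscale');"
)
for name, version in cur.fetchall():
    print(name, version)
```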
{
"cell_type": "code",
"execution_count": null,
"execution_count": 200,
"metadata": {},
"outputs": [],
"source": [
@@ -381,7 +398,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 201,
"metadata": {},
"outputs": [],
"source": [
@@ -391,7 +408,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 202,
"metadata": {},
"outputs": [],
"source": [
@@ -409,7 +426,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 203,
"metadata": {},
"outputs": [],
"source": [
@@ -433,7 +450,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 204,
"metadata": {},
"outputs": [],
"source": [
@@ -445,7 +462,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 205,
"metadata": {},
"outputs": [],
"source": [
@@ -465,24 +482,17 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 206,
"metadata": {},
"outputs": [],
"source": [
"# Create an index on the data for faster retrieval\n",
"# this isn't really needed for 129 vectors, but it shows the usage for larger datasets\n",
"# Note: always create this type of index after you have data already inserted into the DB\n",
"\n",
"#calculate the index parameters according to best practices\n",
"num_lists = num_records / 1000\n",
"if num_lists < 10:\n",
" num_lists = 10\n",
"if num_records > 1000000:\n",
" num_lists = math.sqrt(num_records)\n",
"\n",
"#use the cosine distance measure, which is what we'll later use for querying\n",
"cur.execute(f'CREATE INDEX ON embeddings USING ivfflat (embedding vector_cosine_ops) WITH (lists = {num_lists});')\n",
"conn.commit() "
"# for different tuning suggestions check this: https://github.com/timescale/pgvectorscale?tab=readme-ov-file#tuning\n",
"cur.execute('CREATE INDEX embedding_idx ON embeddings USING diskann (embedding);')\n",
"conn.commit()"
]
},
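The StreamingDiskANN index accelerates the cosine-distance (`<=>`) lookups used later in the notebook. A minimal sketch of such a query, assuming the `embeddings` table from the cells above has a text column named `content` and that `register_vector(conn)` has already been called:

```python
import numpy as np

# Sketch: top-3 nearest neighbours by cosine distance; 'content' is an assumed column name
query_vec = np.array(get_embeddings("What is Timescale used for?"))
cur.execute(
    "SELECT content FROM embeddings ORDER BY embedding <=> %s LIMIT 3;",
    (query_vec,),
)
for (content,) in cur.fetchall():
    print(content[:80])
```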
{
@@ -499,26 +509,26 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 207,
"metadata": {},
"outputs": [],
"source": [
"# Helper function: get text completion from OpenAI API\n",
"# Note max tokens is 4097\n",
"# Note we're using the latest gpt-3.5-turbo-0613 model\n",
"def get_completion_from_messages(messages, model=\"gpt-3.5-turbo-0613\", temperature=0, max_tokens=1000):\n",
" response = openai.ChatCompletion.create(\n",
"def get_completion_from_messages(messages, model=\"gpt-4o\", temperature=0, max_tokens=1000):\n",
" response = openai_client.chat.completions.create(\n",
" model=model,\n",
" messages=messages,\n",
" temperature=temperature, \n",
" max_tokens=max_tokens, \n",
" )\n",
" return response.choices[0].message[\"content\"]"
" return response.choices[0].message.content"
]
},
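For reference, a minimal call to the updated completion helper, using the standard chat-completions message format (the prompt text is only an illustration):

```python
# Example usage of get_completion_from_messages defined above
messages = [
    {"role": "system", "content": "You are a helpful assistant that answers questions about Timescale."},
    {"role": "user", "content": "What is pgvectorscale and when should I use it?"},
]
print(get_completion_from_messages(messages))
```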
{
"cell_type": "code",
"execution_count": null,
"execution_count": 208,
"metadata": {},
"outputs": [],
"source": [
@@ -547,7 +557,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 209,
"metadata": {},
"outputs": [],
"source": [
@@ -557,7 +567,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 210,
"metadata": {},
"outputs": [],
"source": [
@@ -590,7 +600,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 211,
"metadata": {},
"outputs": [],
"source": [
@@ -601,7 +611,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 212,
"metadata": {},
"outputs": [],
"source": [
@@ -629,7 +639,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.16"
"version": "3.9.6"
}
},
"nbformat": 4,
2 changes: 1 addition & 1 deletion openai_pgvector_helloworld/requirements.txt
@@ -2,6 +2,6 @@ openai
pandas
numpy
tiktoken
psycopg2
psycopg2-binary
pgvector
python-dotenv
