diff --git a/ailab-llamaindex-search/README.md b/ailab-llamaindex-search/README.md index cd918fd..f6a762f 100644 --- a/ailab-llamaindex-search/README.md +++ b/ailab-llamaindex-search/README.md @@ -2,7 +2,8 @@ ## Overview -The `ailab-llamaindex-search` package facilitates querying our custom index built using LlamaIndex and PostgresSQL. +The `ailab-llamaindex-search` package facilitates querying our custom index +built using LlamaIndex and PostgresSQL. ## Installation @@ -46,7 +47,7 @@ trans_paths = { } index = create_index_object(embed_model_params, vector_store_params) -search_results = search("your query", index, trans_paths=trans_paths) +search_results = search("your query", index, similarity_top_k=10, trans_paths=trans_paths) for result in search_results: print(result) diff --git a/ailab-llamaindex-search/ailab_llamaindex_search/__init__.py b/ailab-llamaindex-search/ailab_llamaindex_search/__init__.py index 9ffb7bd..7afc3e7 100644 --- a/ailab-llamaindex-search/ailab_llamaindex_search/__init__.py +++ b/ailab-llamaindex-search/ailab_llamaindex_search/__init__.py @@ -2,6 +2,7 @@ import dpath from llama_index.core import VectorStoreIndex +from llama_index.core.schema import NodeWithScore from llama_index.embeddings.azure_openai import AzureOpenAIEmbedding from llama_index.vector_stores.postgres import PGVectorStore @@ -10,6 +11,15 @@ class AilabLlamaIndexSearchError(Exception): """Generic Ailab LlamaIndex search error.""" +def select_highest_scored_nodes_by_url(nodes: list[NodeWithScore]): + best_nodes: dict[str, NodeWithScore] = {} + for node in nodes: + url: str = node.metadata["url"] + if url not in best_nodes or best_nodes[url].score < node.score: + best_nodes[url] = node + return list(best_nodes.values()) + + def transform(node_dict: dict, paths: dict): if not paths: return node_dict @@ -20,16 +30,16 @@ def transform(node_dict: dict, paths: dict): def search( query: str, index: VectorStoreIndex, - search_params: dict = {}, + similarity_top_k: int = 10, 
trans_paths: dict = {}, ): if not query: logging.error("Empty search query received") raise AilabLlamaIndexSearchError("search query cannot be empty.") - - retriever = index.as_retriever(**search_params) + retriever = index.as_retriever(similarity_top_k=similarity_top_k * 2) nodes = retriever.retrieve(query) - return [transform(n.dict(), trans_paths) for n in nodes] + best_nodes = select_highest_scored_nodes_by_url(nodes) + return [transform(node.dict(), trans_paths) for node in best_nodes] def create_index_object(embed_model_params: dict, vector_store_params: dict): diff --git a/ailab-llamaindex-search/tests/test_ailab_llamaindex_search.py b/ailab-llamaindex-search/tests/test_ailab_llamaindex_search.py index d0b79e5..a205794 100644 --- a/ailab-llamaindex-search/tests/test_ailab_llamaindex_search.py +++ b/ailab-llamaindex-search/tests/test_ailab_llamaindex_search.py @@ -1,13 +1,15 @@ import unittest -from unittest.mock import MagicMock, patch +from unittest.mock import MagicMock, call, patch from ailab_llamaindex_search import ( AilabLlamaIndexSearchError, VectorStoreIndex, create_index_object, search, + select_highest_scored_nodes_by_url, transform, ) +from llama_index.core.schema import NodeWithScore, TextNode class TestAilabLlamaTransform(unittest.TestCase): @@ -45,35 +47,99 @@ class TestAilabLlamaSearch(unittest.TestCase): def setUp(self): self.mock_index = MagicMock(spec=VectorStoreIndex) self.mock_retriever = MagicMock() - self.mock_retriever.retrieve.return_value = [MagicMock(dict=MagicMock(return_value={'id': 1, 'name': 'Test Node'}))] self.mock_index.as_retriever.return_value = self.mock_retriever - + def test_search_with_empty_query_error(self): with self.assertRaises(AilabLlamaIndexSearchError): search("", self.mock_index) - @patch('ailab_llamaindex_search.transform') - def test_search_calls_transform_on_results(self, mock_transform): - mock_transform.return_value = {'id': 1, 'name': 'Transformed Node'} + @patch("ailab_llamaindex_search.transform") + 
@patch("ailab_llamaindex_search.select_highest_scored_nodes_by_url") + def test_search_calls_the_right_functions(self, mock_select, mock_transform): + d1 = {"id_": "1", "metadata": {"url": "https://example.com"}} + d2 = {"id_": "2", "metadata": {"url": "https://example.com"}} + node1 = NodeWithScore(node=TextNode.from_dict(d1), score=0.8) + node2 = NodeWithScore(node=TextNode.from_dict(d2), score=0.9) + nodes = [node1, node2] + selected_nodes = [node2] + transformed_nodes = node2.dict() + self.mock_retriever.retrieve.return_value = nodes + mock_select.return_value = selected_nodes + mock_transform.side_effect = lambda node_dict, _: node_dict + results = search("test query", self.mock_index) - self.assertTrue(mock_transform.called) - self.assertEqual(len(results), 1) - self.assertEqual(results[0], {'id': 1, 'name': 'Transformed Node'}) - - @patch('ailab_llamaindex_search.AzureOpenAIEmbedding') - @patch('ailab_llamaindex_search.PGVectorStore.from_params') - @patch('ailab_llamaindex_search.VectorStoreIndex.from_vector_store') - def test_create_index_object_initializes_correctly(self, mock_from_vector_store, mock_from_params, mock_azure_openai_embedding): + mock_select.assert_called_once_with(nodes) + calls = [call(node.dict(), {}) for node in selected_nodes] + mock_transform.assert_has_calls(calls, any_order=True) + self.assertTrue(results[0] == transformed_nodes) + + @patch("ailab_llamaindex_search.select_highest_scored_nodes_by_url") + @patch("ailab_llamaindex_search.transform") + def test_retriever_similarity_top_k_parameter(self, mock_transform, mock_select): + self.mock_index.as_retriever = MagicMock() + similarity_top_k = 10 + search("valid query", self.mock_index, similarity_top_k=similarity_top_k) + self.mock_index.as_retriever.assert_called_once_with( + similarity_top_k=similarity_top_k * 2 + ) + + @patch("ailab_llamaindex_search.AzureOpenAIEmbedding") + @patch("ailab_llamaindex_search.PGVectorStore.from_params") + 
@patch("ailab_llamaindex_search.VectorStoreIndex.from_vector_store") + def test_create_index_object_initializes_correctly( + self, mock_from_vector_store, mock_from_params, mock_azure_openai_embedding + ): mock_embed_model = MagicMock() mock_azure_openai_embedding.return_value = mock_embed_model mock_vector_store = MagicMock() mock_from_params.return_value = mock_vector_store mock_index_object = MagicMock() mock_from_vector_store.return_value = mock_index_object - embed_model_params = {'param1': 'value1'} - vector_store_params = {'param2': 'value2'} + embed_model_params = {"param1": "value1"} + vector_store_params = {"param2": "value2"} result = create_index_object(embed_model_params, vector_store_params) mock_azure_openai_embedding.assert_called_once_with(**embed_model_params) mock_from_params.assert_called_once_with(**vector_store_params) - mock_from_vector_store.assert_called_once_with(mock_vector_store, mock_embed_model) + mock_from_vector_store.assert_called_once_with( + mock_vector_store, mock_embed_model + ) self.assertEqual(result, mock_index_object) + + +class TestSelectHighestScoredNodesByURL(unittest.TestCase): + + def test_empty_input(self): + self.assertEqual(select_highest_scored_nodes_by_url([]), []) + + def test_single_node(self): + node_data = {"id_": "1", "metadata": {"url": "https://example.com"}} + node = NodeWithScore(node=TextNode.from_dict(node_data), score=1.0) + self.assertEqual(select_highest_scored_nodes_by_url([node]), [node]) + + def test_multiple_nodes_one_url(self): + node_data1 = {"id_": "1", "metadata": {"url": "https://example.com"}} + node_data2 = {"id_": "2", "metadata": {"url": "https://example.com"}} + node1 = NodeWithScore(node=TextNode.from_dict(node_data1), score=1.0) + node2 = NodeWithScore(node=TextNode.from_dict(node_data2), score=2.0) + self.assertEqual(select_highest_scored_nodes_by_url([node1, node2]), [node2]) + + def test_multiple_nodes_multiple_urls(self): + node_data1 = {"id_": "1", "metadata": {"url": 
"https://example.com"}} + node_data2 = {"id_": "2", "metadata": {"url": "https://example.com"}} + node_data3 = {"id_": "3", "metadata": {"url": "https://example2.com"}} + node1 = NodeWithScore(node=TextNode.from_dict(node_data1), score=1.0) + node2 = NodeWithScore(node=TextNode.from_dict(node_data2), score=2.0) + node3 = NodeWithScore(node=TextNode.from_dict(node_data3), score=3.0) + result = select_highest_scored_nodes_by_url([node1, node2, node3]) + self.assertIn(node2, result) + self.assertIn(node3, result) + self.assertEqual(len(result), 2) + + def test_nodes_with_same_score(self): + node_data1 = {"id_": "1", "metadata": {"url": "https://example.com"}} + node_data2 = {"id_": "2", "metadata": {"url": "https://example.com"}} + node1 = NodeWithScore(node=TextNode.from_dict(node_data1), score=1.0) + node2 = NodeWithScore(node=TextNode.from_dict(node_data2), score=1.0) + result = select_highest_scored_nodes_by_url([node1, node2]) + self.assertIn(node1, result) + self.assertEqual(len(result), 1) diff --git a/ailab-llamaindex-search/tests/test_integration.py b/ailab-llamaindex-search/tests/test_integration.py index 818110f..992d138 100644 --- a/ailab-llamaindex-search/tests/test_integration.py +++ b/ailab-llamaindex-search/tests/test_integration.py @@ -17,16 +17,13 @@ def setUp(self): os.getenv("LLAMAINDEX_DB_VECTOR_STORE_PARAMS") ) self.trans_paths = json.loads(os.getenv("LLAMAINDEX_DB_TRANS_PATHS")) - self.search_params = {"similarity_top_k": 5} self.index = create_index_object( self.embed_model_params, self.vector_store_params ) def test_search(self): query = "steps and considerations of the sampling procedures for food safety" - results = search(query, self.index, self.search_params, self.trans_paths) - n = self.search_params["similarity_top_k"] - self.assertEqual(len(results), n) + results = search(query, self.index, 10, self.trans_paths) for result in results: for key in self.trans_paths.keys(): self.assertIn(key, result) diff --git 
a/docs/img/pagination_caching_sequence.png b/docs/img/pagination_caching_sequence.png new file mode 100644 index 0000000..e3f7c93 Binary files /dev/null and b/docs/img/pagination_caching_sequence.png differ diff --git a/docs/puml/pagination_caching_sequence.puml b/docs/puml/pagination_caching_sequence.puml new file mode 100644 index 0000000..3405413 --- /dev/null +++ b/docs/puml/pagination_caching_sequence.puml @@ -0,0 +1,52 @@ +@startuml search sequence + +actor user +participant ":Flask" as app +database "cache" as cache +participant ":Config" as config +participant ":VectorStoreIndex" as index +participant ":BaseRetriever" as retriever +entity "ada:EmbedModel" as ada +database "llamaindex_db" as data + + +user -> app: POST /search/llamaindex\nparams: query, top, skip +app -> cache: get results for query +alt no cached results +activate app +app -> config: get index +app -> index: get retriever +activate index +create retriever +index -> retriever: create(similarity_top_k=high) +index --> app: retriever +deactivate index +app -> retriever: retrieve(query) +activate retriever +retriever -> ada: get embeddings for query +retriever -> data: match embeddings +activate data +return matching nodes +return nodes +app -> app: filter out\nduplicate url nodes +app -> app: transform nodes +app -> cache: set results for query +end alt +app -> user: slice results from skip to top +deactivate app + + +legend + +cfia.ai-ia.acia@inspection.gc.ca +kotchikpaguy-landry.allagbe@inspection.gc.ca +2024-03-21 +end legend +@enduml diff --git a/notebooks/issue12-duplicates.ipynb b/notebooks/issue12-duplicates.ipynb new file mode 100644 index 0000000..6403141 --- /dev/null +++ b/notebooks/issue12-duplicates.ipynb @@ -0,0 +1,813 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Issue [#12](https://github.com/ai-cfia/llamaindex-db/issues/12)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": {}, + "outputs": [ + { + "data": { + 
"text/plain": [ + "True" + ] + }, + "execution_count": 52, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from llama_index.core import VectorStoreIndex\n", + "from llama_index.vector_stores.postgres import PGVectorStore\n", + "from llama_index.embeddings.azure_openai import AzureOpenAIEmbedding\n", + "from llama_index.llms.azure_openai import AzureOpenAI\n", + "from llama_index.core import Settings\n", + "from llama_index.core.schema import NodeWithScore, TextNode\n", + "import os\n", + "from dotenv import load_dotenv\n", + "import psycopg\n", + "from psycopg.rows import dict_row\n", + "from pprint import pprint\n", + "\n", + "load_dotenv()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setup LLM and Embed Model\n" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": {}, + "outputs": [], + "source": [ + "llm = AzureOpenAI(\n", + " model=\"gpt-4\",\n", + " deployment_name=\"ailab-llm\",\n", + " api_key=os.getenv(\"API_KEY\"),\n", + " azure_endpoint=os.getenv(\"AZURE_ENDPOINT\"),\n", + " api_version=os.getenv(\"API_VERSION\"),\n", + ")\n", + "\n", + "embed_model = AzureOpenAIEmbedding(\n", + " model=\"text-embedding-ada-002\",\n", + " deployment_name=\"ada\",\n", + " api_key=os.getenv(\"API_KEY\"),\n", + " azure_endpoint=os.getenv(\"AZURE_ENDPOINT\"),\n", + " api_version=os.getenv(\"API_VERSION\"),\n", + ")\n", + "\n", + "Settings.llm = llm\n", + "Settings.embed_model = embed_model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Variables\n" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": {}, + "outputs": [], + "source": [ + "database = os.getenv(\"DB_NAME\")\n", + "host = os.getenv(\"DB_HOST\")\n", + "password = os.getenv(\"DB_PASSWORD\")\n", + "port = os.getenv(\"DB_PORT\")\n", + "user = os.getenv(\"DB_USER\")\n", + "llamaindex_db = \"llamaindex_db_legacy\"\n", + "llamaindex_schema = \"v_0_0_1\"" + ] + }, + { + "cell_type": 
"markdown", + "metadata": {}, + "source": [ + "## Observed problem\n" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": {}, + "outputs": [], + "source": [ + "vector_store = PGVectorStore.from_params(\n", + " database=llamaindex_db,\n", + " host=host,\n", + " password=password,\n", + " port=port,\n", + " user=user,\n", + " embed_dim=1536,\n", + ")\n", + "\n", + "index = VectorStoreIndex.from_vector_store(vector_store=vector_store)\n", + "retriever = index.as_retriever(similarity_top_k=10)" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": {}, + "outputs": [], + "source": [ + "query = \"what are the fertilizer labelling requirements?\"\n", + "nodes = retriever.retrieve(query)" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'meta': {'chunk_id': '1854fdc5-af24-41e4-81ef-a742a08c6684',\n", + " 'id': '8780a201-a628-44a3-babb-c8830b68de72',\n", + " 'last_updated': '2020-11-13',\n", + " 'score': 0.534927215363663,\n", + " 'subtitle': 'Registered Fertilizer-Pesticides Labelling',\n", + " 'title': 'T-4- 102 - Requirements for fertilizer-pesticides under '\n", + " 'the Fertilizers Act - Canadian Food Inspection Agency',\n", + " 'tokens_count': 305,\n", + " 'url': 'https://inspection.canada.ca/plant-health/fertilizers/trade-memoranda/t-4-102/eng/1307854513877/1307854674148'},\n", + " 'node_id': '8780a201-a628-44a3-babb-c8830b68de72',\n", + " 'score': 0.9085487454359314}\n", + "{'meta': {'chunk_id': 'f57c95ef-1dd7-4d03-886e-d82b7fa22563',\n", + " 'id': '8780a201-a628-44a3-babb-c8830b68de72',\n", + " 'last_updated': '2020-11-13',\n", + " 'score': 0.534927215363663,\n", + " 'subtitle': 'Exemptions from Registration;Customer Formula '\n", + " 'Fertilizer-Pesticide Labelling',\n", + " 'title': 'T-4- 102 - Requirements for fertilizer-pesticides under '\n", + " 'the Fertilizers Act - Canadian Food Inspection Agency',\n", + 
" 'tokens_count': 273,\n", + " 'url': 'https://inspection.canada.ca/plant-health/fertilizers/trade-memoranda/t-4-102/eng/1307854513877/1307854674148'},\n", + " 'node_id': '8780a201-a628-44a3-babb-c8830b68de72',\n", + " 'score': 0.9085487454359314}\n", + "{'meta': {'chunk_id': '09f04937-a60d-4a5d-a2bd-1754101d7ca8',\n", + " 'id': '8780a201-a628-44a3-babb-c8830b68de72',\n", + " 'last_updated': '2020-11-13',\n", + " 'score': 0.534927215363663,\n", + " 'subtitle': '6. Contact information',\n", + " 'title': 'T-4- 102 - Requirements for fertilizer-pesticides under '\n", + " 'the Fertilizers Act - Canadian Food Inspection Agency',\n", + " 'tokens_count': 78,\n", + " 'url': 'https://inspection.canada.ca/plant-health/fertilizers/trade-memoranda/t-4-102/eng/1307854513877/1307854674148'},\n", + " 'node_id': '8780a201-a628-44a3-babb-c8830b68de72',\n", + " 'score': 0.9085487454359314}\n", + "{'meta': {'chunk_id': '7cd497c1-dcf7-4ad7-bdb8-d0d60c997bba',\n", + " 'id': '8780a201-a628-44a3-babb-c8830b68de72',\n", + " 'last_updated': '2020-11-13',\n", + " 'score': 0.534927215363663,\n", + " 'subtitle': '4. Labelling;5. Compendium of fertilizer-use pesticides',\n", + " 'title': 'T-4- 102 - Requirements for fertilizer-pesticides under '\n", + " 'the Fertilizers Act - Canadian Food Inspection Agency',\n", + " 'tokens_count': 437,\n", + " 'url': 'https://inspection.canada.ca/plant-health/fertilizers/trade-memoranda/t-4-102/eng/1307854513877/1307854674148'},\n", + " 'node_id': '8780a201-a628-44a3-babb-c8830b68de72',\n", + " 'score': 0.9085487454359314}\n", + "{'meta': {'chunk_id': 'a5c3fa21-259b-43b9-a59f-5083a2f9f120',\n", + " 'id': '8780a201-a628-44a3-babb-c8830b68de72',\n", + " 'last_updated': '2020-11-13',\n", + " 'score': 0.534927215363663,\n", + " 'subtitle': '1. Purpose;2. Standards;3. 
Registration requirements',\n", + " 'title': 'T-4- 102 - Requirements for fertilizer-pesticides under '\n", + " 'the Fertilizers Act - Canadian Food Inspection Agency',\n", + " 'tokens_count': 358,\n", + " 'url': 'https://inspection.canada.ca/plant-health/fertilizers/trade-memoranda/t-4-102/eng/1307854513877/1307854674148'},\n", + " 'node_id': '8780a201-a628-44a3-babb-c8830b68de72',\n", + " 'score': 0.9085487454359314}\n", + "{'meta': {'chunk_id': '3fbafdcd-6a0e-44ac-8901-867aa3ec94ff',\n", + " 'id': '61cfc05e-565a-42eb-8ed0-5908a98ea816',\n", + " 'last_updated': '2022-07-15',\n", + " 'score': 0.5726020874138481,\n", + " 'subtitle': 'Label verification;Marketplace monitoring',\n", + " 'title': 'Fertilizer Program overview - Canadian Food Inspection '\n", + " 'Agency',\n", + " 'tokens_count': 233,\n", + " 'url': 'https://inspection.canada.ca/plant-health/fertilizers/program-overview/eng/1330891097865/1330891293225'},\n", + " 'node_id': '61cfc05e-565a-42eb-8ed0-5908a98ea816',\n", + " 'score': 0.886424290837645}\n", + "{'meta': {'chunk_id': '6e794a39-ad46-45e0-ae2f-48a598dfe87b',\n", + " 'id': '53d3cd70-bcd5-4b2d-b2e1-27237a0cfc07',\n", + " 'last_updated': '2022-01-06',\n", + " 'score': 0.49127928626424366,\n", + " 'subtitle': '5.2.3 Third review',\n", + " 'title': 'T-4- 122 – Service delivery standards for fertilizer and '\n", + " 'supplement registration-related submissions under the '\n", + " 'Fertilizers Act and regulations - Canadian Food Inspection '\n", + " 'Agency',\n", + " 'tokens_count': 145,\n", + " 'url': 'https://inspection.canada.ca/plant-health/fertilizers/trade-memoranda/t-4-122/eng/1305609994431/1307910971122'},\n", + " 'node_id': '53d3cd70-bcd5-4b2d-b2e1-27237a0cfc07',\n", + " 'score': 0.8705808520318067}\n", + "{'meta': {'chunk_id': '3791be04-4f09-4c83-a22d-fe7bf19908e2',\n", + " 'id': '53d3cd70-bcd5-4b2d-b2e1-27237a0cfc07',\n", + " 'last_updated': '2022-01-06',\n", + " 'score': 0.49127928626424366,\n", + " 'subtitle': '5.2.1 First review;5.2.2 
Second review',\n", + " 'title': 'T-4- 122 – Service delivery standards for fertilizer and '\n", + " 'supplement registration-related submissions under the '\n", + " 'Fertilizers Act and regulations - Canadian Food Inspection '\n", + " 'Agency',\n", + " 'tokens_count': 369,\n", + " 'url': 'https://inspection.canada.ca/plant-health/fertilizers/trade-memoranda/t-4-122/eng/1305609994431/1307910971122'},\n", + " 'node_id': '53d3cd70-bcd5-4b2d-b2e1-27237a0cfc07',\n", + " 'score': 0.8705808520318067}\n", + "{'meta': {'chunk_id': '1af065f1-8c27-4f40-ab7d-49215bc93f49',\n", + " 'id': '53d3cd70-bcd5-4b2d-b2e1-27237a0cfc07',\n", + " 'last_updated': '2022-01-06',\n", + " 'score': 0.49127928626424366,\n", + " 'subtitle': '5.1 Pre-screening',\n", + " 'title': 'T-4- 122 – Service delivery standards for fertilizer and '\n", + " 'supplement registration-related submissions under the '\n", + " 'Fertilizers Act and regulations - Canadian Food Inspection '\n", + " 'Agency',\n", + " 'tokens_count': 399,\n", + " 'url': 'https://inspection.canada.ca/plant-health/fertilizers/trade-memoranda/t-4-122/eng/1305609994431/1307910971122'},\n", + " 'node_id': '53d3cd70-bcd5-4b2d-b2e1-27237a0cfc07',\n", + " 'score': 0.8705808520318067}\n", + "{'meta': {'chunk_id': '088fbaed-f0d8-4ab2-a614-61e556e59be3',\n", + " 'id': '53d3cd70-bcd5-4b2d-b2e1-27237a0cfc07',\n", + " 'last_updated': '2022-01-06',\n", + " 'score': 0.49127928626424366,\n", + " 'subtitle': '4.5 Ingredient Source Change Inquiry (ISCI)',\n", + " 'title': 'T-4- 122 – Service delivery standards for fertilizer and '\n", + " 'supplement registration-related submissions under the '\n", + " 'Fertilizers Act and regulations - Canadian Food Inspection '\n", + " 'Agency',\n", + " 'tokens_count': 306,\n", + " 'url': 'https://inspection.canada.ca/plant-health/fertilizers/trade-memoranda/t-4-122/eng/1305609994431/1307910971122'},\n", + " 'node_id': '53d3cd70-bcd5-4b2d-b2e1-27237a0cfc07',\n", + " 'score': 0.8705808520318067}\n" + ] + } + ], + 
"source": [ + "for n in nodes:\n", + " pprint({\"meta\": n.metadata, \"score\": n.score, \"node_id\": n.node_id})" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can observe that multiple nodes reference the same url (document).\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Root cause\n", + "\n", + "A long enough document is split into chunks (html sections in our case). A node is a chunk and all its metadata. A query's vector can simultaneously be similar to multiple nodes in the same document. For instance, the subject of `fertilizer labelling requirements` might span multiple sections in the original webpage. Even then, we should expect nodes from the same document to have different scores. That is not the case here, which suggests that there is a deeper issue.\n", + "\n", + "I just noticed: nodes from the same document have the same `node_id`. This suggests that the `node_ids` are referencing the document instead of the chunks. 
Indeed, looking at the node creation code in [pgvector_ailab_db.ipynb](./pgvector_ailab_db.ipynb), in section \"Creating nodes from louis_v005.documents\" at the moment this was written, we can see that it's the case.\n", + "\n", + "```python\n", + "query = \"\"\"\n", + " SELECT id, content, embedding, chunk_id, url, title, subtitle, tokens_count, last_updated, score\n", + " FROM louis_v005.documents\n", + "\"\"\"\n", + "nodes = []\n", + "with psycopg.connect(conn_string) as conn:\n", + " with conn.cursor(row_factory=dict_row) as cur:\n", + " results = cur.execute(query).fetchall()\n", + " for r in tqdm(results, desc=\"Processing records\"):\n", + " node = TextNode(\n", + " text=r[\"content\"],\n", + " id_=str(r[\"id\"]), # <---- Here\n", + " embedding=json.loads(r[\"embedding\"]),\n", + " )\n", + "```\n", + "\n", + "Something else I noticed: many of the nodes that reference the same documents have the same embedding, which is virtually impossible.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "8780a201-a628-44a3-babb-c8830b68de72\n" + ] + } + ], + "source": [ + "conn_string = (\n", + " f\"dbname={llamaindex_db} \"\n", + " f\"user={user} \"\n", + " f\"password={password} \"\n", + " f\"host={host} \"\n", + " f\"port={port}\"\n", + ")\n", + "\n", + "node_id = nodes[0].node_id\n", + "print(node_id)\n", + "\n", + "query = \"\"\"\n", + " SELECT node_id, embedding\n", + " FROM public.data_llamaindex\n", + " WHERE node_id = %s\n", + "\"\"\"\n", + "with psycopg.connect(conn_string) as conn:\n", + " with conn.cursor(row_factory=dict_row) as cur:\n", + " results = cur.execute(query, (node_id,)).fetchall()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pprint(results)" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + 
"output_type": "stream", + "text": [ + "results[1] and results[2] are the same: True\n" + ] + } + ], + "source": [ + "print(\"results[1] and results[2] are the same:\", results[1] == results[2])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For comparison, here are the chunks from the same document in `louis_v005`:\n" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "8780a201-a628-44a3-babb-c8830b68de72\n" + ] + } + ], + "source": [ + "conn_string = (\n", + " f\"dbname={database} \"\n", + " f\"user={user} \"\n", + " f\"password={password} \"\n", + " f\"host={host} \"\n", + " f\"port={port}\"\n", + ")\n", + "\n", + "node_id = nodes[0].node_id\n", + "print(node_id)\n", + "\n", + "query = \"\"\"\n", + " SELECT id, chunk_id, embedding\n", + " FROM louis_v005.documents\n", + " WHERE id = %s\n", + "\"\"\"\n", + "with psycopg.connect(conn_string) as conn:\n", + " with conn.cursor(row_factory=dict_row) as cur:\n", + " results = cur.execute(query, (node_id,)).fetchall()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pprint(results)" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "results[1] and results[2] are the same: False\n" + ] + } + ], + "source": [ + "print(\"results[1] and results[2] are the same:\", results[1] == results[2])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can see that the embeddings are never the same. 
So, in the node creation process, probably due to using the same (document) id for nodes, they were duplicated.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Fix wrong `node_ids`\n", + "\n", + "To fix this, we will have to modify the node creation code and rebuild the index. Fortunately, all the embeddings are already created and should not generate cost. In my estimation, the only costs would be due to db read write operations.\n", + "\n", + "New node generation code:\n", + "\n", + "```python\n", + "#...\n", + " node = TextNode(\n", + " text=r[\"content\"],\n", + " id_=str(r[\"chunk_id\"]), # changed \"id\" to \"chunk_id\"\n", + " embedding=json.loads(r[\"embedding\"]),\n", + " )\n", + "#...\n", + "```\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Testing the `node_id` fix\n" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "metadata": {}, + "outputs": [], + "source": [ + "vector_store = PGVectorStore.from_params(\n", + " database=llamaindex_db,\n", + " host=host,\n", + " password=password,\n", + " port=port,\n", + " user=user,\n", + " schema_name=llamaindex_schema,\n", + " embed_dim=1536,\n", + ")\n", + "\n", + "index = VectorStoreIndex.from_vector_store(vector_store=vector_store)\n", + "retriever = index.as_retriever(similarity_top_k=10)" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "metadata": {}, + "outputs": [], + "source": [ + "query = \"what are the fertilizer labelling requirements?\"\n", + "nodes = retriever.retrieve(query)" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'meta': {'chunk_id': '1854fdc5-af24-41e4-81ef-a742a08c6684',\n", + " 'id': '8780a201-a628-44a3-babb-c8830b68de72',\n", + " 'last_updated': '2020-11-13',\n", + " 'score': 0.534927215363663,\n", + " 'subtitle': 'Registered Fertilizer-Pesticides Labelling',\n", + " 'title': 'T-4- 
102 - Requirements for fertilizer-pesticides under '\n", + " 'the Fertilizers Act - Canadian Food Inspection Agency',\n", + " 'tokens_count': 305,\n", + " 'url': 'https://inspection.canada.ca/plant-health/fertilizers/trade-memoranda/t-4-102/eng/1307854513877/1307854674148'},\n", + " 'node_id': '1854fdc5-af24-41e4-81ef-a742a08c6684',\n", + " 'score': 0.9085487454359314}\n", + "{'meta': {'chunk_id': '8144cb04-e745-49a4-b68b-809a700dee90',\n", + " 'id': '1ca75f55-e758-4830-9226-0577f9220482',\n", + " 'last_updated': '2022-06-08',\n", + " 'score': 0.5186214394910862,\n", + " 'subtitle': 'IV. Labelling',\n", + " 'title': 'T-4- 120 – Regulation of compost under the Fertilizers Act '\n", + " 'and Regulations - Canadian Food Inspection Agency',\n", + " 'tokens_count': 66,\n", + " 'url': 'https://inspection.canada.ca/plant-health/fertilizers/trade-memoranda/t-4-120/eng/1307910204607/1307910352783'},\n", + " 'node_id': '8144cb04-e745-49a4-b68b-809a700dee90',\n", + " 'score': 0.9001759946242166}\n", + "{'meta': {'chunk_id': 'fbcefc20-e2d3-4ce6-bc4d-442790ed5b6e',\n", + " 'id': '1ca75f55-e758-4830-9226-0577f9220482',\n", + " 'last_updated': '2022-06-08',\n", + " 'score': 0.5186214394910862,\n", + " 'subtitle': '4. Labelling;5. Import and export',\n", + " 'title': 'T-4- 120 – Regulation of compost under the Fertilizers Act '\n", + " 'and Regulations - Canadian Food Inspection Agency',\n", + " 'tokens_count': 463,\n", + " 'url': 'https://inspection.canada.ca/plant-health/fertilizers/trade-memoranda/t-4-120/eng/1307910204607/1307910352783'},\n", + " 'node_id': 'fbcefc20-e2d3-4ce6-bc4d-442790ed5b6e',\n", + " 'score': 0.8976572460957771}\n", + "{'meta': {'chunk_id': 'cd8b99b1-ff30-4f9c-97fe-7411d85e9917',\n", + " 'id': 'fa6f8f8b-7242-4632-b7a3-a2881dd90d44',\n", + " 'last_updated': '2020-11-13',\n", + " 'score': 0.4413583232153546,\n", + " 'subtitle': '4. Labelling;5. 
Application to land and disposal',\n", + " 'title': 'T-4- 112 – By-products and other Waste-derived materials '\n", + " 'sold as fertilizers or supplements - Canadian Food '\n", + " 'Inspection Agency',\n", + " 'tokens_count': 301,\n", + " 'url': 'https://inspection.canada.ca/plant-health/fertilizers/trade-memoranda/t-4-112/eng/1307864536371/1320192988468'},\n", + " 'node_id': 'cd8b99b1-ff30-4f9c-97fe-7411d85e9917',\n", + " 'score': 0.8968562133655295}\n", + "{'meta': {'chunk_id': '53aa18d3-bb87-428f-92d8-c77239382110',\n", + " 'id': '087656d4-6033-411d-85b3-aaff7193df6f',\n", + " 'last_updated': '2021-05-04',\n", + " 'score': 0.5603678073399798,\n", + " 'subtitle': '4. Registration application and labelling requirements',\n", + " 'title': 'T-4- 129 – Requirements for micronutrient fertilizers - '\n", + " 'Canadian Food Inspection Agency',\n", + " 'tokens_count': 293,\n", + " 'url': 'https://inspection.canada.ca/plant-health/fertilizers/trade-memoranda/t-4-129/eng/1584933233868/1584933234227'},\n", + " 'node_id': '53aa18d3-bb87-428f-92d8-c77239382110',\n", + " 'score': 0.8958321542790785}\n", + "{'meta': {'chunk_id': '91b5eef0-89af-4eeb-8cf8-a82e77172fe7',\n", + " 'id': 'c1193ed0-caa0-4d8b-950d-8e3a7c55c6bd',\n", + " 'last_updated': '2021-12-09',\n", + " 'score': 0.501755443654494,\n", + " 'subtitle': 'On this page',\n", + " 'title': 'T-4- 130 – Labeling requirements for fertilizers and '\n", + " 'supplements - Canadian Food Inspection Agency',\n", + " 'tokens_count': 258,\n", + " 'url': 'https://inspection.canada.ca/plant-health/fertilizers/trade-memoranda/t-4-130/eng/1604424185581/1604424268008'},\n", + " 'node_id': '91b5eef0-89af-4eeb-8cf8-a82e77172fe7',\n", + " 'score': 0.8948309923024215}\n", + "{'meta': {'chunk_id': 'a7557d42-153b-4a82-94fd-06485e72a2fd',\n", + " 'id': 'c1193ed0-caa0-4d8b-950d-8e3a7c55c6bd',\n", + " 'last_updated': '2021-12-09',\n", + " 'score': 0.501755443654494,\n", + " 'subtitle': '1. Purpose;2. 
General regulatory labelling requirements',\n", + " 'title': 'T-4- 130 – Labeling requirements for fertilizers and '\n", + " 'supplements - Canadian Food Inspection Agency',\n", + " 'tokens_count': 385,\n", + " 'url': 'https://inspection.canada.ca/plant-health/fertilizers/trade-memoranda/t-4-130/eng/1604424185581/1604424268008'},\n", + " 'node_id': 'a7557d42-153b-4a82-94fd-06485e72a2fd',\n", + " 'score': 0.8939795873238195}\n", + "{'meta': {'chunk_id': '76cab52f-4319-4794-ae65-597e71f63014',\n", + " 'id': '3580764b-afd9-44e1-a112-4fa4ce82b32f',\n", + " 'last_updated': '2022-02-10',\n", + " 'score': 0.494491397467521,\n", + " 'subtitle': '4. Registration application and labelling requirements',\n", + " 'title': 'T-4- 109 – Requirements for microbial supplements - '\n", + " 'Canadian Food Inspection Agency',\n", + " 'tokens_count': 295,\n", + " 'url': 'https://inspection.canada.ca/plant-health/fertilizers/trade-memoranda/t-4-109/eng/1307863769159/1307863893593'},\n", + " 'node_id': '76cab52f-4319-4794-ae65-597e71f63014',\n", + " 'score': 0.8939686340755415}\n", + "{'meta': {'chunk_id': '614b1537-6442-45d0-9815-d0dacb85d82a',\n", + " 'id': '95d1c453-6a57-43dc-868d-d9df3a518ed3',\n", + " 'last_updated': '2022-02-10',\n", + " 'score': 0.4981329550451097,\n", + " 'subtitle': '3. Labelling requirements – Fertilizers Act',\n", + " 'title': 'T-4- 105 – Requirements for seeds treated with fertilizers '\n", + " 'or supplements - Canadian Food Inspection Agency',\n", + " 'tokens_count': 465,\n", + " 'url': 'https://inspection.canada.ca/plant-health/fertilizers/trade-memoranda/t-4-105/eng/1307856047195/1307856175577'},\n", + " 'node_id': '614b1537-6442-45d0-9815-d0dacb85d82a',\n", + " 'score': 0.8896327746352795}\n", + "{'meta': {'chunk_id': '7cd497c1-dcf7-4ad7-bdb8-d0d60c997bba',\n", + " 'id': '8780a201-a628-44a3-babb-c8830b68de72',\n", + " 'last_updated': '2020-11-13',\n", + " 'score': 0.534927215363663,\n", + " 'subtitle': '4. Labelling;5. 
Compendium of fertilizer-use pesticides',\n", + " 'title': 'T-4- 102 - Requirements for fertilizer-pesticides under '\n", + " 'the Fertilizers Act - Canadian Food Inspection Agency',\n", + " 'tokens_count': 437,\n", + " 'url': 'https://inspection.canada.ca/plant-health/fertilizers/trade-memoranda/t-4-102/eng/1307854513877/1307854674148'},\n", + " 'node_id': '7cd497c1-dcf7-4ad7-bdb8-d0d60c997bba',\n", + " 'score': 0.8895923162478899}\n" + ] + } + ], + "source": [ + "for n in nodes:\n", + " pprint({\"meta\": n.metadata, \"score\": n.score, \"node_id\": n.node_id})" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- ✅ `node_ids` are no longer duplicated\n", + "- ✅ `scores` are no longer equal\n", + "- ❌ there are still nodes referencing the same url, but a lot less\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Solution to return only the highest score node per document (url)\n", + "\n", + "A simple temporary approach is to input a high enough `similarity_top_k` and apply a filter on the nodes to keep only the highest score node per url.\n", + "\n", + "The filter function would look like:\n" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "metadata": {}, + "outputs": [], + "source": [ + "def select_highest_scored_nodes_by_url(nodes: list[NodeWithScore]):\n", + " best_nodes: dict[str, NodeWithScore] = {}\n", + " for node in nodes:\n", + " url: str = node.node.metadata[\"url\"]\n", + " if url not in best_nodes or best_nodes[url].score < node.score:\n", + " best_nodes[url] = node\n", + " return list(best_nodes.values())" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "metadata": {}, + "outputs": [], + "source": [ + "filtered_nodes = select_highest_scored_nodes_by_url(nodes)" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'meta': {'chunk_id': 
'1854fdc5-af24-41e4-81ef-a742a08c6684',\n", + " 'id': '8780a201-a628-44a3-babb-c8830b68de72',\n", + " 'last_updated': '2020-11-13',\n", + " 'score': 0.534927215363663,\n", + " 'subtitle': 'Registered Fertilizer-Pesticides Labelling',\n", + " 'title': 'T-4- 102 - Requirements for fertilizer-pesticides under '\n", + " 'the Fertilizers Act - Canadian Food Inspection Agency',\n", + " 'tokens_count': 305,\n", + " 'url': 'https://inspection.canada.ca/plant-health/fertilizers/trade-memoranda/t-4-102/eng/1307854513877/1307854674148'},\n", + " 'node_id': '1854fdc5-af24-41e4-81ef-a742a08c6684',\n", + " 'score': 0.9085487454359314}\n", + "{'meta': {'chunk_id': '8144cb04-e745-49a4-b68b-809a700dee90',\n", + " 'id': '1ca75f55-e758-4830-9226-0577f9220482',\n", + " 'last_updated': '2022-06-08',\n", + " 'score': 0.5186214394910862,\n", + " 'subtitle': 'IV. Labelling',\n", + " 'title': 'T-4- 120 – Regulation of compost under the Fertilizers Act '\n", + " 'and Regulations - Canadian Food Inspection Agency',\n", + " 'tokens_count': 66,\n", + " 'url': 'https://inspection.canada.ca/plant-health/fertilizers/trade-memoranda/t-4-120/eng/1307910204607/1307910352783'},\n", + " 'node_id': '8144cb04-e745-49a4-b68b-809a700dee90',\n", + " 'score': 0.9001759946242166}\n", + "{'meta': {'chunk_id': 'cd8b99b1-ff30-4f9c-97fe-7411d85e9917',\n", + " 'id': 'fa6f8f8b-7242-4632-b7a3-a2881dd90d44',\n", + " 'last_updated': '2020-11-13',\n", + " 'score': 0.4413583232153546,\n", + " 'subtitle': '4. Labelling;5. 
Application to land and disposal',\n", + " 'title': 'T-4- 112 – By-products and other Waste-derived materials '\n", + " 'sold as fertilizers or supplements - Canadian Food '\n", + " 'Inspection Agency',\n", + " 'tokens_count': 301,\n", + " 'url': 'https://inspection.canada.ca/plant-health/fertilizers/trade-memoranda/t-4-112/eng/1307864536371/1320192988468'},\n", + " 'node_id': 'cd8b99b1-ff30-4f9c-97fe-7411d85e9917',\n", + " 'score': 0.8968562133655295}\n", + "{'meta': {'chunk_id': '53aa18d3-bb87-428f-92d8-c77239382110',\n", + " 'id': '087656d4-6033-411d-85b3-aaff7193df6f',\n", + " 'last_updated': '2021-05-04',\n", + " 'score': 0.5603678073399798,\n", + " 'subtitle': '4. Registration application and labelling requirements',\n", + " 'title': 'T-4- 129 – Requirements for micronutrient fertilizers - '\n", + " 'Canadian Food Inspection Agency',\n", + " 'tokens_count': 293,\n", + " 'url': 'https://inspection.canada.ca/plant-health/fertilizers/trade-memoranda/t-4-129/eng/1584933233868/1584933234227'},\n", + " 'node_id': '53aa18d3-bb87-428f-92d8-c77239382110',\n", + " 'score': 0.8958321542790785}\n", + "{'meta': {'chunk_id': '91b5eef0-89af-4eeb-8cf8-a82e77172fe7',\n", + " 'id': 'c1193ed0-caa0-4d8b-950d-8e3a7c55c6bd',\n", + " 'last_updated': '2021-12-09',\n", + " 'score': 0.501755443654494,\n", + " 'subtitle': 'On this page',\n", + " 'title': 'T-4- 130 – Labeling requirements for fertilizers and '\n", + " 'supplements - Canadian Food Inspection Agency',\n", + " 'tokens_count': 258,\n", + " 'url': 'https://inspection.canada.ca/plant-health/fertilizers/trade-memoranda/t-4-130/eng/1604424185581/1604424268008'},\n", + " 'node_id': '91b5eef0-89af-4eeb-8cf8-a82e77172fe7',\n", + " 'score': 0.8948309923024215}\n", + "{'meta': {'chunk_id': '76cab52f-4319-4794-ae65-597e71f63014',\n", + " 'id': '3580764b-afd9-44e1-a112-4fa4ce82b32f',\n", + " 'last_updated': '2022-02-10',\n", + " 'score': 0.494491397467521,\n", + " 'subtitle': '4. 
Registration application and labelling requirements',\n", + " 'title': 'T-4- 109 – Requirements for microbial supplements - '\n", + " 'Canadian Food Inspection Agency',\n", + " 'tokens_count': 295,\n", + " 'url': 'https://inspection.canada.ca/plant-health/fertilizers/trade-memoranda/t-4-109/eng/1307863769159/1307863893593'},\n", + " 'node_id': '76cab52f-4319-4794-ae65-597e71f63014',\n", + " 'score': 0.8939686340755415}\n", + "{'meta': {'chunk_id': '614b1537-6442-45d0-9815-d0dacb85d82a',\n", + " 'id': '95d1c453-6a57-43dc-868d-d9df3a518ed3',\n", + " 'last_updated': '2022-02-10',\n", + " 'score': 0.4981329550451097,\n", + " 'subtitle': '3. Labelling requirements – Fertilizers Act',\n", + " 'title': 'T-4- 105 – Requirements for seeds treated with fertilizers '\n", + " 'or supplements - Canadian Food Inspection Agency',\n", + " 'tokens_count': 465,\n", + " 'url': 'https://inspection.canada.ca/plant-health/fertilizers/trade-memoranda/t-4-105/eng/1307856047195/1307856175577'},\n", + " 'node_id': '614b1537-6442-45d0-9815-d0dacb85d82a',\n", + " 'score': 0.8896327746352795}\n", + "len(nodes): 10 len(filtered_nodes): 7\n" + ] + } + ], + "source": [ + "for n in filtered_nodes:\n", + " pprint({\"meta\": n.metadata, \"score\": n.score, \"node_id\": n.node_id})\n", + "\n", + "print(\"len(nodes):\", len(nodes), \"len(filtered_nodes):\", len(filtered_nodes))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- ✅ no duplicated urls\n", + "\n", + "Almost a third of the nodes (3 out of 10) have been filtered out. 
Let's temporarily choose `similarity_top_k` to be double the number of results the user asks for, until we find a more robust solution.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Potential improvements\n", + "\n", + "- Implement pagination (with `top` and `skip` params)\n", + "- Implement a caching strategy to support pagination\n", + "\n", + "![pagination & caching](../docs/img/pagination_caching_sequence.png)\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "llamaindex-db", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.8" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/pgvector_ailab_db.ipynb b/notebooks/pgvector_ailab_db.ipynb index e1a2af6..6616ee7 100644 --- a/notebooks/pgvector_ailab_db.ipynb +++ b/notebooks/pgvector_ailab_db.ipynb @@ -7,7 +7,7 @@ "id": "bccd47fc" }, "source": [ - "\"Open" + "\"Open\n" ] }, { @@ -19,7 +19,7 @@ "source": [ "# Postgres Vector Store\n", "\n", - "This notebook shows how we can create a llI seeamaindex in PostgresSQL (PGVector) as opposed to in-memory, from data that has already been prepared for indexing (chunking, embeddings generations...) in `ailab-db`. \n", + "This notebook shows how we can create a LlamaIndex index in PostgreSQL (PGVector) as opposed to in-memory, from data that has already been prepared for indexing (chunking, embeddings generations...) in `ailab-db`.\n", "\n", "Testings on our azure pg show a disappointing `25 seconds` delay vs `<0.5 seconds` on local pg. 
It is worth investigating the configuration differences between the local pg and the azure one that could cause such a drastic jump.\n", "\n", @@ -29,8 +29,7 @@ "\n", "The delay is now `1.13 seconds` with hnsw index vs `25 seconds` without.\n", "\n", - "This is a huge improvement. We should also consider that our current azure pg instance is a development one, less powerful than the one meant for production.\n", - "\n" + "This is a huge improvement. We should also consider that our current azure pg instance is a development one, less powerful than the one meant for production.\n" ] }, { @@ -78,6 +77,7 @@ "from llama_index.storage.index_store.postgres import PostgresIndexStore\n", "from llama_index.storage.docstore.postgres import PostgresDocumentStore\n", "import psycopg\n", + "from psycopg.sql import SQL, Identifier\n", "from psycopg.rows import dict_row\n", "import json\n", "import pickle\n", @@ -107,6 +107,7 @@ " with open(filename, \"wb\") as file:\n", " pickle.dump(data, file)\n", "\n", + "\n", "def load_from_pickle(filename):\n", " with open(filename, \"rb\") as file:\n", " return pickle.load(file)" @@ -119,7 +120,7 @@ "id": "26c71b6d" }, "source": [ - "### Setup LLM and Embed Model" + "### Setup LLM and Embed Model\n" ] }, { @@ -156,7 +157,7 @@ "id": "45551f5d", "metadata": {}, "source": [ - "### Creating nodes from louis_v005.documents" + "### Creating nodes from louis_v005.documents\n" ] }, { @@ -166,28 +167,52 @@ "metadata": {}, "outputs": [], "source": [ - "database=os.getenv('DB_NAME')\n", - "host=os.getenv('DB_HOST')\n", - "password=os.getenv('DB_PASSWORD')\n", - "port=os.getenv('DB_PORT')\n", - "user=os.getenv('DB_USER')\n", - "\n", - "conn_string = (\n", - " f\"dbname={database} \"\n", - " f\"user={user} \"\n", - " f\"password={password} \"\n", - " f\"host={host} \"\n", - " f\"port={port}\"\n", - ")" + "louis_db = os.getenv(\"DB_NAME\")\n", + "host = os.getenv(\"DB_HOST\")\n", + "password = os.getenv(\"DB_PASSWORD\")\n", + "port = 
os.getenv(\"DB_PORT\")\n", + "user = os.getenv(\"DB_USER\")\n", + "llamaindex_db = \"llamaindex_db_legacy\"\n", + "admin_db = \"postgres\"\n", + "llamaindex_schema = \"v_0_0_1\"" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 5, "id": "2b126c54", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Processing records: 100%|██████████| 103836/103836 [01:03<00:00, 1644.90it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Node ID: a8fa477f-5a9e-493a-b50a-e435a15b1bc5\n", + "Text: 6.18 Enzymes Reserved for future use 6.19 Gut modifier\n", + "ingredients 6.19.1 Prebiotics 6.19.2 Viable microorganisms 6.19.3\n", + "Acidifiers 6.19.1 Prebiotics Reserved for future use 6.19.2 Viable\n", + "microorganisms Reserved for future use 6.19.3 Acidifiers Reserved for\n", + "future use 6.20 Forage additives 1-601-019 Propionic acid Is an\n", + "organic acid, generally e...\n" + ] + } + ], "source": [ + "conn_string = (\n", + " f\"dbname={louis_db} \"\n", + " f\"user={user} \"\n", + " f\"password={password} \"\n", + " f\"host={host} \"\n", + " f\"port={port}\"\n", + ")\n", + "\n", "query = \"\"\"\n", " SELECT id, content, embedding, chunk_id, url, title, subtitle, tokens_count, last_updated, score\n", " FROM louis_v005.documents\n", @@ -199,18 +224,18 @@ " for r in tqdm(results, desc=\"Processing records\"):\n", " node = TextNode(\n", " text=r[\"content\"],\n", - " id_=str(r[\"id\"]),\n", + " id_=str(r[\"chunk_id\"]),\n", " embedding=json.loads(r[\"embedding\"]),\n", " )\n", " node.metadata = {\n", " \"id\": str(r[\"id\"]),\n", - " 'chunk_id': str(r['chunk_id']),\n", - " 'url': r['url'],\n", - " 'title': r['title'],\n", - " 'subtitle': r['subtitle'],\n", - " 'tokens_count': r['tokens_count'],\n", - " 'last_updated': (r['last_updated']),\n", - " 'score': r['score']\n", + " \"chunk_id\": str(r[\"chunk_id\"]),\n", + " \"url\": r[\"url\"],\n", + " \"title\": 
r[\"title\"],\n", + " \"subtitle\": r[\"subtitle\"],\n", + " \"tokens_count\": r[\"tokens_count\"],\n", + " \"last_updated\": (r[\"last_updated\"]),\n", + " \"score\": r[\"score\"],\n", " }\n", " nodes.append(node)\n", "\n", @@ -225,27 +250,49 @@ "id": "7bd24f0a" }, "source": [ - "### Create the Database" + "### Create the Database\n" ] }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 6, "id": "e6d61e73", "metadata": { "id": "e6d61e73" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Database llamaindex_db_legacy already exists.\n" + ] + } + ], "source": [ - "connection_string=conn_string\n", + "connection_string = (\n", + " f\"dbname={admin_db} \"\n", + " f\"user={user} \"\n", + " f\"password={password} \"\n", + " f\"host={host} \"\n", + " f\"port={port}\"\n", + ")\n", "# connection_string = \"postgresql://postgres:testpwd@localhost:5432\"\n", - "new_database = \"llamaindexdb\"\n", "\n", - "with psycopg.connect(connection_string) as conn:\n", - " conn.autocommit = True\n", - " with conn.cursor() as cur:\n", - " cur.execute(f\"DROP DATABASE IF EXISTS {new_database}\")\n", - " cur.execute(f\"CREATE DATABASE {new_database}\")" + "# with psycopg.connect(connection_string) as conn:\n", + "# conn.autocommit = True\n", + "# with conn.cursor() as cur:\n", + "# cur.execute(f\"DROP DATABASE IF EXISTS {llama_database}\")\n", + "# cur.execute(f\"CREATE DATABASE {llama_database}\")\n", + "\n", + "try:\n", + " with psycopg.connect(connection_string) as conn:\n", + " conn.autocommit = True\n", + " with conn.cursor() as cur:\n", + " cur.execute(f\"CREATE DATABASE {llamaindex_db}\")\n", + " print(f\"Database {llamaindex_db} created.\")\n", + "except psycopg.errors.DuplicateDatabase:\n", + " print(f\"Database {llamaindex_db} already exists.\")" ] }, { @@ -255,12 +302,12 @@ "id": "c0232fd1" }, "source": [ - "### Create the indexes" + "### Create the tables\n" ] }, { "cell_type": "code", - "execution_count": 
15, + "execution_count": 9, "id": "8731da62", "metadata": { "colab": { @@ -275,65 +322,78 @@ "outputs": [], "source": [ "vector_store = PGVectorStore.from_params(\n", - " database=new_database,\n", + " database=llamaindex_db,\n", " host=host,\n", " password=password,\n", " port=port,\n", " user=user,\n", " embed_dim=1536,\n", + " schema_name=llamaindex_schema,\n", ")\n", "\n", - "document_store = PostgresDocumentStore.from_params( \n", - " database=new_database,\n", + "document_store = PostgresDocumentStore.from_params(\n", + " database=llamaindex_db,\n", " host=host,\n", " password=password,\n", " port=port,\n", " user=user,\n", + " schema_name=llamaindex_schema,\n", ")\n", "\n", "index_store = PostgresIndexStore.from_params(\n", - " database=new_database,\n", + " database=llamaindex_db,\n", " host=host,\n", " password=password,\n", " port=port,\n", " user=user,\n", + " schema_name=llamaindex_schema,\n", ")\n", "\n", "storage_context = StorageContext.from_defaults(\n", " docstore=document_store,\n", - " index_store=index_store, \n", - " vector_store=vector_store, \n", + " index_store=index_store,\n", + " vector_store=vector_store,\n", ")\n", "\n", "storage_context.docstore.add_documents(nodes)\n", "\n", "index = VectorStoreIndex(nodes, storage_context=storage_context)\n", "\n", - "retriever = index.as_retriever(similarity_top_k=5)\n", - "\n" + "retriever = index.as_retriever(similarity_top_k=5)" + ] + }, + { + "cell_type": "markdown", + "id": "4c4c592a", + "metadata": {}, + "source": [ + "### Create the index" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "id": "9180ded8", "metadata": {}, "outputs": [], "source": [ - "conn_string_new_db = (\n", - " f\"dbname={new_database} \"\n", + "connection_string = (\n", + " f\"dbname={llamaindex_db} \"\n", " f\"user={user} \"\n", " f\"password={password} \"\n", " f\"host={host} \"\n", " f\"port={port}\"\n", ")\n", "\n", - "with psycopg.connect(conn_string_new_db) as conn:\n", + "schema = 
Identifier(llamaindex_schema)\n", + "query = SQL(\n", + " \"CREATE INDEX ON {}.data_llamaindex USING hnsw (embedding vector_cosine_ops)\"\n", + ").format(schema)\n", + "\n", + "with psycopg.connect(connection_string) as conn:\n", " conn.autocommit = True\n", " with conn.cursor() as cur:\n", - " cur.execute(\"\"\"\n", - " CREATE INDEX ON public.data_llamaindex USING hnsw (embedding vector_cosine_ops);\n", - " \"\"\")" + " cur.execute(query)" ] }, { @@ -343,7 +403,7 @@ "source": [ "### Testing\n", "\n", - "#### Generating a question from a random url" + "#### Generating a question from a random url\n" ] }, { @@ -381,7 +441,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 11, "id": "0a2bcc07", "metadata": { "id": "0a2bcc07" @@ -391,7 +451,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 1/1 [00:08<00:00, 8.05s/it]\n" + "100%|██████████| 1/1 [00:02<00:00, 2.64s/it]\n" ] } ], @@ -400,14 +460,14 @@ "# random_url = random.choice(urls)\n", "random_url = urls[0]\n", "documents = SimpleWebPageReader(html_to_text=True).load_data([random_url])\n", - "assert len(documents)==1\n", + "assert len(documents) == 1\n", "extractor = QuestionsAnsweredExtractor(questions=1)\n", - "questions = await extractor.aextract(documents)\n" + "questions = await extractor.aextract(documents)" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 12, "id": "b280db13", "metadata": {}, "outputs": [ @@ -416,7 +476,7 @@ "output_type": "stream", "text": [ "url https://inspection.canada.ca/preventive-controls/sampling-procedures/eng/1518033335104/1528203403149\n", - "What are the steps and considerations involved in the sampling procedures for food safety according to the Canadian Food Inspection Agency?\n" + "What are the steps and considerations for collecting environmental samples for microbial testing in a food production setting, according to the Canadian Food Inspection Agency?\n" ] } ], @@ -431,23 +491,24 @@ "id": 
"2e173a78", "metadata": {}, "source": [ - "#### Checking if querying the index returns the right url" + "#### Checking if querying the index returns the right url\n" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 17, "id": "158034e3", "metadata": {}, "outputs": [], "source": [ "vector_store = PGVectorStore.from_params(\n", - " database=new_database,\n", + " database=llamaindex_db,\n", " host=host,\n", " password=password,\n", " port=port,\n", " user=user,\n", " embed_dim=1536,\n", + " schema_name=llamaindex_schema\n", ")\n", "\n", "index = VectorStoreIndex.from_vector_store(vector_store=vector_store)\n", @@ -456,23 +517,10 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 18, "id": "772f9ea7", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "start.............\n", - "get_agg_embedding_from_queries: 0.20 seconds\n", - "_build_vector_store_query: 0.00 seconds\n", - "_vector_store.query: 1.20 seconds\n", - "_retrieve: 1.39 seconds\n", - "_handle_recursive_retrieval: 0.00 seconds\n" - ] - } - ], + "outputs": [], "source": [ "# import time\n", "# start_time = time.time()\n", @@ -485,7 +533,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 19, "id": "77c4ed49", "metadata": {}, "outputs": [ @@ -493,7 +541,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "{'node': {'id_': '4916a845-5358-42e8-93ed-be8b7beddf55', 'embedding': None, 'metadata': {'id': '4916a845-5358-42e8-93ed-be8b7beddf55', 'chunk_id': 'def3245a-e853-46f4-84e4-6b0f32a04983', 'url': 'https://inspection.canada.ca/inspection-and-enforcement/guidance-for-food-inspection-activities/sample-collection/as-required-food-sample-collection/eng/1653062252765/1653062253358', 'title': 'Operational procedure: As required food sample collection - Canadian Food Inspection Agency', 'subtitle': 'On this page;1.0 Purpose;2.0 Authorities', 'tokens_count': 267, 'last_updated': '2022-06-20', 
'score': 0.5025328114227888}, 'excluded_embed_metadata_keys': [], 'excluded_llm_metadata_keys': [], 'relationships': {}, 'text': 'On this page 1.0 Purpose 2.0 Authorities 3.0 Reference documents 4.0 Definitions 5.0 Acronyms 6.0 Operational procedure 6.1 Prepare for the inspection 6.2 Conduct the inspection 6.3 Communicate the inspection results 6.4 Conduct the follow-up inspection 7.0 Appendix Annex B: DSDP data entry – CFIA sampled – sample results other than satisfactory (accessible only on the Government of Canada network – RDIMS 14996797)\\n1.0 Purpose The purpose of this document is to provide guidance to Canadian Food Inspection Agency (CFIA) inspection staff on the procedures for as required sample collection under the Food Business Line (FBL). This guidance is written with the assumption that inspection staff have been trained in the Standard Inspection Process (SIP) and the Digital Service Delivery Platform (DSDP). This document is intended to be used in conjunction with Operational guideline: Food sample collection.\\n2.0 Authorities Safe Food for Canadians Act (SFCA) Safe Food for Canadians Regulations (SFCR) Food and Drugs Act (FDA) Food and Drug Regulations (FDR) The inspection powers, control actions and enforcement actions authorized by the above legislation are identified and explained in the Operational guideline – Food regulatory response guidelines.', 'start_char_idx': None, 'end_char_idx': None, 'text_template': '{metadata_str}\\n\\n{content}', 'metadata_template': '{key}: {value}', 'metadata_seperator': '\\n', 'class_name': 'TextNode'}, 'score': 0.8909988999366824, 'class_name': 'NodeWithScore'}\n" + "{'node': {'id_': 'e456dca5-3079-4702-b89c-d469f951526f', 'embedding': None, 'metadata': {'id': '379a866f-3802-485e-afe3-85bb4c08e238', 'chunk_id': 'e456dca5-3079-4702-b89c-d469f951526f', 'url': 
'https://inspection.canada.ca/inspection-and-enforcement/guidance-for-food-inspection-activities/sample-collection/food-sample-collection/eng/1540234969218/1540235089869', 'title': 'Operational guideline: Food sample collection - Canadian Food Inspection Agency', 'subtitle': 'On this page;1.0 Purpose;2.0 Authorities', 'tokens_count': 482, 'last_updated': '2023-03-24', 'score': 0.5859392646494657}, 'excluded_embed_metadata_keys': [], 'excluded_llm_metadata_keys': [], 'relationships': {}, 'text': 'On this page 1.0 Purpose 2.0 Authorities 3.0 Reference documents 4.0 Definitions 5.0 Acronyms 6.0 Operational guideline 6.1 Prepare for the inspection 6.2 Conduct the inspection 6.3 Communicate the inspection results 6.4 Conduct the follow-up inspection 7.0 Appendix Appendix 1: Aseptic sample collection Appendix 2: Core drilling for food samples Appendix 3: Environmental sample collection Appendix 4: Random sample collection Appendix 5: Water and ice sample collection Appendix 6: Types of analyses for food samples Appendix 7: Canadian Shellfish Sanitation Program (CSSP) sample collection Appendix 8: Dairy products sample collection Appendix 9: Fish and seafood sample collection Appendix 10: Fresh fruit and vegetable sample collection Appendix 11: Honey sample collection Appendix 12: Maple sample collection Appendix 13: Meat and poultry products sample collection Appendix 14: Processed egg product sample collection Appendix 15: Processed fruit and vegetable products sample collection Appendix 16: Shell egg sample collection Appendix 17: CFIA payment for food sample collection\\n1.0 Purpose The purpose of this document is to provide guidance to Canadian Food Inspection Agency (CFIA) inspection staff on the general guidelines for food sample collection. Sample collection is a task conducted under the Standard Inspection Process (SIP) and used to assess compliance of a food with relevant legislation and to gather baseline information on food products. 
This guidance supports inspectors to take samples that are representative of the food and the food production environment and applies to samples taken to support planned, official and as required food sample collection activities. This guidance is written with the assumption that inspection staff have reviewed the Food inspection guidance: sample collection page (accessible only on the Government of Canada network) and have been properly trained in sample collection techniques. This document is intended to be used in conjunction with other guidance documents as referenced in Section 3.0.\\n2.0 Authorities Food and Drugs Act (FDA) Food and Drug Regulations (FDR) Health of Animals Act (HAA) Health of Animals Regulations (HAR) Safe Food for Canadians Act (SFCA) Safe Food for Canadians Regulations (SFCR) The inspection powers, control actions and enforcement actions authorized by the above legislation are identified and explained in the Operational guideline – Food regulatory response guidelines.', 'start_char_idx': None, 'end_char_idx': None, 'text_template': '{metadata_str}\\n\\n{content}', 'metadata_template': '{key}: {value}', 'metadata_seperator': '\\n', 'class_name': 'TextNode'}, 'score': 0.8956051406625395, 'class_name': 'NodeWithScore'}\n" ] } ], @@ -503,7 +551,7 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 20, "id": "ec2a6918", "metadata": {}, "outputs": [ @@ -511,12 +559,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Right: https://inspection.canada.ca/preventive-controls/sampling-procedures/eng/1518033335104/1528203403149\n", - "Wrong: https://inspection.canada.ca/inspection-and-enforcement/guidance-for-food-inspection-activities/sample-collection/as-required-food-sample-collection/eng/1653062252765/1653062253358\n", - "Wrong: https://inspection.canada.ca/food-safety-for-industry/food-safety-rules-for-small-business/eng/1643050798737/1643050800221\n", - "Wrong: 
https://inspection.canada.ca/food-safety-for-industry/information-for-media/eng/1528746083978/1528746084227\n", - "Wrong: https://inspection.canada.ca/importing-food-plants-or-animals/food-imports/step-by-step-guide/eng/1523979839705/1523979840095\n", - "Wrong: https://inspection.canada.ca/inspection-and-enforcement/guidance-for-food-inspection-activities/sample-collection/eng/1589914459022/1589914459318\n" + "Position: 3 Sampling procedures - Canadian Food Inspection Agency\n" ] } ],