{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "# Copyright (c) 2024 Microsoft Corporation.\n", "# Licensed under the MIT License." ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Entity df columns: Index(['id', 'human_readable_id', 'title', 'community', 'level', 'degree', 'x',\n", " 'y'],\n", " dtype='object')\n", "Entity count: 888\n", "Relationship count: 812\n", "Text unit records: 38\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idhuman_readable_idtextn_tokensdocument_idsentity_idsrelationship_idscovariate_ids
0aa55265004ced76e9050ed4b7a45c0496e10faa0eddb8a...1../\\nJACOB COLLIER: Honestly, I think mastery...1200[1e0886ae010728d10b2972f66b88608dc82b8645d3085...[9a062709-56dd-4bf2-8b41-926124b7a6f7, f8c54a6...[9af066c8-031b-4c52-b93b-b37763f6f0f7, 5b15580...[f91209d1-0939-452e-b51b-be1763e2a27d, f2274c3...
17f0fb1d3bf517dc76dffa984eec7a25e851e44ead0df82...2OMBERG: I grew up and started getting into al...1200[1e0886ae010728d10b2972f66b88608dc82b8645d3085...[9a062709-56dd-4bf2-8b41-926124b7a6f7, f8c54a6...[c2ac3612-3aaf-440c-babd-e21f474e0366, 9aab0b4...[13c74c18-439b-4419-8427-4ba826503055, 7120179...
227b739ceeddfa100f7be3cf002fd3a27aea2228f1a02c4...3, you know, and it’s a very linear pathway too...1200[1e0886ae010728d10b2972f66b88608dc82b8645d3085...[9a062709-56dd-4bf2-8b41-926124b7a6f7, f8c54a6...[9af066c8-031b-4c52-b93b-b37763f6f0f7, 5456bcb...[a67f1c21-32ab-4eaa-b063-c815e7f3ea9d]
3d97017305e234cc51554d653447d73b58441e1ff0f99e4...4you know, we started taking a lot of parts an...1200[1e0886ae010728d10b2972f66b88608dc82b8645d3085...[9a062709-56dd-4bf2-8b41-926124b7a6f7, f8c54a6...[bac3dd9b-f5c8-4966-9dc3-87f1f5976e36, 4f41be8...[695ac017-7c10-44ad-a681-3b4c1ae86a87]
42b6d29f8a74b16ea9a70423bce803a08a5b9ed4e6a946b...5only so much processing that my mind can do i...1200[1e0886ae010728d10b2972f66b88608dc82b8645d3085...[9a062709-56dd-4bf2-8b41-926124b7a6f7, f8c54a6...[d225ff7b-ca47-4fab-8d9a-4f86111526f8, 3fddb40...[d8a7ad5c-a170-430e-9f9d-902047371ee2]
\n", "
" ], "text/plain": [ " id human_readable_id \\\n", "0 aa55265004ced76e9050ed4b7a45c0496e10faa0eddb8a... 1 \n", "1 7f0fb1d3bf517dc76dffa984eec7a25e851e44ead0df82... 2 \n", "2 27b739ceeddfa100f7be3cf002fd3a27aea2228f1a02c4... 3 \n", "3 d97017305e234cc51554d653447d73b58441e1ff0f99e4... 4 \n", "4 2b6d29f8a74b16ea9a70423bce803a08a5b9ed4e6a946b... 5 \n", "\n", " text n_tokens \\\n", "0 ../\\nJACOB COLLIER: Honestly, I think mastery... 1200 \n", "1 OMBERG: I grew up and started getting into al... 1200 \n", "2 , you know, and it’s a very linear pathway too... 1200 \n", "3 you know, we started taking a lot of parts an... 1200 \n", "4 only so much processing that my mind can do i... 1200 \n", "\n", " document_ids \\\n", "0 [1e0886ae010728d10b2972f66b88608dc82b8645d3085... \n", "1 [1e0886ae010728d10b2972f66b88608dc82b8645d3085... \n", "2 [1e0886ae010728d10b2972f66b88608dc82b8645d3085... \n", "3 [1e0886ae010728d10b2972f66b88608dc82b8645d3085... \n", "4 [1e0886ae010728d10b2972f66b88608dc82b8645d3085... \n", "\n", " entity_ids \\\n", "0 [9a062709-56dd-4bf2-8b41-926124b7a6f7, f8c54a6... \n", "1 [9a062709-56dd-4bf2-8b41-926124b7a6f7, f8c54a6... \n", "2 [9a062709-56dd-4bf2-8b41-926124b7a6f7, f8c54a6... \n", "3 [9a062709-56dd-4bf2-8b41-926124b7a6f7, f8c54a6... \n", "4 [9a062709-56dd-4bf2-8b41-926124b7a6f7, f8c54a6... \n", "\n", " relationship_ids \\\n", "0 [9af066c8-031b-4c52-b93b-b37763f6f0f7, 5b15580... \n", "1 [c2ac3612-3aaf-440c-babd-e21f474e0366, 9aab0b4... \n", "2 [9af066c8-031b-4c52-b93b-b37763f6f0f7, 5456bcb... \n", "3 [bac3dd9b-f5c8-4966-9dc3-87f1f5976e36, 4f41be8... \n", "4 [d225ff7b-ca47-4fab-8d9a-4f86111526f8, 3fddb40... \n", "\n", " covariate_ids \n", "0 [f91209d1-0939-452e-b51b-be1763e2a27d, f2274c3... \n", "1 [13c74c18-439b-4419-8427-4ba826503055, 7120179... \n", "2 [a67f1c21-32ab-4eaa-b063-c815e7f3ea9d] \n", "3 [695ac017-7c10-44ad-a681-3b4c1ae86a87] \n", "4 [d8a7ad5c-a170-430e-9f9d-902047371ee2] " ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import os\n", "from pathlib import Path\n", "\n", "import pandas as pd\n", "import tiktoken\n", "\n", "from graphrag.config.models.drift_search_config import DRIFTSearchConfig\n", "from graphrag.query.indexer_adapters import (\n", " read_indexer_entities,\n", " read_indexer_relationships,\n", " read_indexer_report_embeddings,\n", " read_indexer_reports,\n", " read_indexer_text_units,\n", ")\n", "from graphrag.query.llm.oai.chat_openai import ChatOpenAI\n", "from graphrag.query.llm.oai.embedding import OpenAIEmbedding\n", "from graphrag.query.llm.oai.typing import OpenaiApiType\n", "from graphrag.query.structured_search.drift_search.drift_context import (\n", " DRIFTSearchContextBuilder,\n", ")\n", "from graphrag.query.structured_search.drift_search.search import DRIFTSearch\n", "from graphrag.vector_stores.lancedb import LanceDBVectorStore\n", "\n", "INPUT_DIR = \"./inputs/operation dulce\"\n", "LANCEDB_URI = f\"{INPUT_DIR}/lancedb\"\n", "\n", "COMMUNITY_REPORT_TABLE = \"community_reports\"\n", "COMMUNITY_TABLE = \"communities\"\n", "ENTITY_TABLE = \"entities\"\n", "RELATIONSHIP_TABLE = \"relationships\"\n", "COVARIATE_TABLE = \"covariates\"\n", "TEXT_UNIT_TABLE = \"text_units\"\n", "COMMUNITY_LEVEL = 2\n", "\n", "\n", "# read nodes table to get community and degree data\n", "entity_df = pd.read_parquet(f\"{INPUT_DIR}/{ENTITY_TABLE}.parquet\")\n", "community_df = pd.read_parquet(f\"{INPUT_DIR}/{COMMUNITY_TABLE}.parquet\")\n", "\n", "print(f\"Entity df columns: {entity_df.columns}\")\n", "\n", "entities = read_indexer_entities(entity_df, community_df, COMMUNITY_LEVEL)\n", "\n", "# load description embeddings to an in-memory lancedb vectorstore\n", "# to connect to a remote db, specify url and port values.\n", "description_embedding_store = LanceDBVectorStore(\n", " collection_name=\"default-entity-description\",\n", ")\n", "description_embedding_store.connect(db_uri=LANCEDB_URI)\n", "\n", "full_content_embedding_store = LanceDBVectorStore(\n", " collection_name=\"default-community-full_content\",\n", ")\n", "full_content_embedding_store.connect(db_uri=LANCEDB_URI)\n", "\n", "print(f\"Entity count: {len(entity_df)}\")\n", "entity_df.head()\n", "\n", "relationship_df = pd.read_parquet(f\"{INPUT_DIR}/{RELATIONSHIP_TABLE}.parquet\")\n", "relationships = read_indexer_relationships(relationship_df)\n", "\n", "print(f\"Relationship count: {len(relationship_df)}\")\n", "relationship_df.head()\n", "\n", "text_unit_df = pd.read_parquet(f\"{INPUT_DIR}/{TEXT_UNIT_TABLE}.parquet\")\n", "text_units = read_indexer_text_units(text_unit_df)\n", "\n", "print(f\"Text unit records: {len(text_unit_df)}\")\n", "text_unit_df.head()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "api_key = os.environ[\"GRAPHRAG_API_KEY\"]\n", "llm_model = os.environ[\"GRAPHRAG_LLM_MODEL\"]\n", "embedding_model = os.environ[\"GRAPHRAG_EMBEDDING_MODEL\"]\n", "\n", "chat_llm = ChatOpenAI(\n", " api_key=api_key,\n", " model=llm_model,\n", " api_type=OpenaiApiType.OpenAI, # OpenaiApiType.OpenAI or OpenaiApiType.AzureOpenAI\n", " max_retries=20,\n", ")\n", "\n", "token_encoder = tiktoken.encoding_for_model(llm_model)\n", "\n", "text_embedder = OpenAIEmbedding(\n", " api_key=api_key,\n", " api_base=None,\n", " api_type=OpenaiApiType.OpenAI,\n", " model=embedding_model,\n", " deployment_name=embedding_model,\n", " max_retries=20,\n", ")" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "def read_community_reports(\n", " input_dir: str,\n", " community_report_table: str = COMMUNITY_REPORT_TABLE,\n", "):\n", " \"\"\"Embeds the full content of the community reports and saves the DataFrame with embeddings to the output path.\"\"\"\n", " input_path = Path(input_dir) / f\"{community_report_table}.parquet\"\n", " return pd.read_parquet(input_path)\n", "\n", "\n", "report_df = read_community_reports(INPUT_DIR)\n", "reports = read_indexer_reports(\n", " report_df,\n", " community_df,\n", " COMMUNITY_LEVEL,\n", " content_embedding_col=\"full_content_embeddings\",\n", ")\n", "read_indexer_report_embeddings(reports, full_content_embedding_store)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "drift_params = DRIFTSearchConfig(\n", " temperature=0,\n", " max_tokens=12_000,\n", " primer_folds=1,\n", " drift_k_followups=3,\n", " n_depth=3,\n", " n=1,\n", ")\n", "\n", "context_builder = DRIFTSearchContextBuilder(\n", " chat_llm=chat_llm,\n", " text_embedder=text_embedder,\n", " entities=entities,\n", " relationships=relationships,\n", " reports=reports,\n", " entity_text_embeddings=description_embedding_store,\n", " text_units=text_units,\n", " token_encoder=token_encoder,\n", " config=drift_params,\n", ")\n", "\n", "search = DRIFTSearch(\n", " llm=chat_llm, context_builder=context_builder, token_encoder=token_encoder\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/alonsog/.cache/pypoetry/virtualenvs/graphrag-ta_-cxM1-py3.10/lib/python3.10/site-packages/numpy/core/fromnumeric.py:59: FutureWarning: 'DataFrame.swapaxes' is deprecated and will be removed in a future version. Please use 'DataFrame.transpose' instead.\n", " return bound(*args, **kwds)\n", " 40%|████ | 8/20 [01:12<02:55, 14.65s/it] Failed to parse search response: {\n", " \"response\": \"# Mercer's Background and Mentorship Style\\n\\n## Background and Experience\\n\\nAgent Alex Mercer is depicted as a seasoned and determined member of the Paranormal Military Squad, tasked with the critical mission of Operation: Dulce. His background is characterized by a blend of military discipline and a deep-seated curiosity for the unknown. This duality is evident in his interactions and decision-making processes throughout the mission.\\n\\nMercer's experience in the field has honed his ability to balance protocol with intuition. He is described as having a \" \n", " , \"unfailing determination\" \n", " \n", "Traceback (most recent call last):\n", " File \"/home/alonsog/workspace/graphrag-gh/graphrag/graphrag/query/structured_search/drift_search/action.py\", line 76, in asearch\n", " response = json.loads(search_result.response)\n", " File \"/usr/lib/python3.10/json/__init__.py\", line 346, in loads\n", " return _default_decoder.decode(s)\n", " File \"/usr/lib/python3.10/json/decoder.py\", line 337, in decode\n", " obj, end = self.raw_decode(s, idx=_w(s, 0).end())\n", " File \"/usr/lib/python3.10/json/decoder.py\", line 353, in raw_decode\n", " obj, end = self.scan_once(s, idx)\n", "json.decoder.JSONDecodeError: Expecting ':' delimiter: line 4 column 602 (char 1218)\n", "No answer found for query: How does Mercer's background and experience influence his mentorship style?\n", "No follow-up actions found for response: {}\n", "Failed to parse search response: {\n", " \"response\": \"# Agent Mercer's Leadership Style Compared to Other Members of the Paranormal Military Squad\\n\\n## Agent Alex Mercer's Leadership Style\\n\\nAgent Alex Mercer is depicted as a leader who balances strategic command with a deep sense of curiosity and a drive for understanding. His leadership style is characterized by a blend of military discipline and intellectual engagement. Mercer is portrayed as someone who values the input and expertise of his team members, fostering a collaborative environment where each member's strengths are utilized effectively. For instance, he often seeks the insights of Dr. Jordan Hayes and Sam Rivera, recognizing their expertise in deciphering alien signals and technological anomalies [Data: Sources (20, 16, 18, 23, 24)]. This approach not only enhances the team's problem-solving capabilities but also builds a sense of mutual respect and trust among the squad.\\n\\nMercer's leadership is also marked by his ability to remain calm and composed under pressure. He is described as having a \" , \"steely gaze\" \n", "Traceback (most recent call last):\n", " File \"/home/alonsog/workspace/graphrag-gh/graphrag/graphrag/query/structured_search/drift_search/action.py\", line 76, in asearch\n", " response = json.loads(search_result.response)\n", " File \"/usr/lib/python3.10/json/__init__.py\", line 346, in loads\n", " return _default_decoder.decode(s)\n", " File \"/usr/lib/python3.10/json/decoder.py\", line 337, in decode\n", " obj, end = self.raw_decode(s, idx=_w(s, 0).end())\n", " File \"/usr/lib/python3.10/json/decoder.py\", line 353, in raw_decode\n", " obj, end = self.scan_once(s, idx)\n", "json.decoder.JSONDecodeError: Expecting ':' delimiter: line 2 column 1655 (char 1656)\n", "No answer found for query: How does Agent Mercer's leadership style compare to that of other members of the Paranormal Military Squad?\n", "No follow-up actions found for response: {}\n", " 95%|█████████▌| 19/20 [01:19<00:02, 2.78s/it]Failed to parse search response: \n", "Traceback (most recent call last):\n", " File \"/home/alonsog/workspace/graphrag-gh/graphrag/graphrag/query/structured_search/drift_search/action.py\", line 76, in asearch\n", " response = json.loads(search_result.response)\n", " File \"/usr/lib/python3.10/json/__init__.py\", line 346, in loads\n", " return _default_decoder.decode(s)\n", " File \"/usr/lib/python3.10/json/decoder.py\", line 337, in decode\n", " obj, end = self.raw_decode(s, idx=_w(s, 0).end())\n", " File \"/usr/lib/python3.10/json/decoder.py\", line 355, in raw_decode\n", " raise JSONDecodeError(\"Expecting value\", s, err.value) from None\n", "json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)\n", "No answer found for query: What specific alien messages has Dr. Jordan Hayes decoded?\n", "No follow-up actions found for response: {}\n", "No follow-up actions for action: What specific alien messages has Dr. Jordan Hayes decoded?\n", "No follow-up actions for action: How does Agent Mercer's leadership style compare to that of other members of the Paranormal Military Squad?\n", "No follow-up actions for action: How does Mercer's background and experience influence his mentorship style?\n", " 0%| | 0/20 [00:00