{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "4b5690db12e34685",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-01-09T03:40:58.307102Z",
     "start_time": "2025-01-09T03:40:51.935233Z"
    }
   },
   "outputs": [],
   "source": [
    "import os\n",
    "import logging\n",
    "import numpy as np\n",
    "from lightrag import LightRAG, QueryParam\n",
    "from lightrag.llm.openai import openai_complete_if_cache, openai_embed\n",
    "from lightrag.utils import EmbeddingFunc\n",
    "import nest_asyncio"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "dd17956ec322b361",
   "metadata": {},
   "source": "#### split by character"
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "8c8ee7c061bf9159",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-01-09T03:41:13.961167Z",
     "start_time": "2025-01-09T03:41:13.958357Z"
    }
   },
   "outputs": [],
   "source": [
    "nest_asyncio.apply()\n",
    "WORKING_DIR = \"../../llm_rag/paper_db/R000088_test1\"\n",
    "logging.basicConfig(format=\"%(levelname)s:%(message)s\", level=logging.INFO)\n",
    "# makedirs(exist_ok=True) also creates missing parent directories and is safe to re-run.\n",
    "os.makedirs(WORKING_DIR, exist_ok=True)\n",
    "API = os.environ.get(\"DOUBAO_API_KEY\")\n",
    "if not API:\n",
    "    logging.warning(\"DOUBAO_API_KEY is not set; requests to the API may fail to authenticate\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "a5009d16e0851dca",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-01-09T03:41:16.862036Z",
     "start_time": "2025-01-09T03:41:16.859306Z"
    }
   },
   "outputs": [],
   "source": [
    "async def llm_model_func(\n",
    "    prompt, system_prompt=None, history_messages=None, keyword_extraction=False, **kwargs\n",
    ") -> str:\n",
    "    \"\"\"Request a completion for `prompt` from the chat model behind the Ark base URL.\n",
    "\n",
    "    `keyword_extraction` is accepted but not forwarded to `openai_complete_if_cache`;\n",
    "    any other keyword arguments are passed through unchanged.\n",
    "    \"\"\"\n",
    "    return await openai_complete_if_cache(\n",
    "        \"ep-20241218114828-2tlww\",\n",
    "        prompt,\n",
    "        system_prompt=system_prompt,\n",
    "        # A None default replaces the shared mutable `[]`; build a fresh list per call.\n",
    "        history_messages=[] if history_messages is None else history_messages,\n",
    "        api_key=API,\n",
    "        base_url=\"https://ark.cn-beijing.volces.com/api/v3\",\n",
    "        **kwargs,\n",
    "    )\n",
    "\n",
    "\n",
    "async def embedding_func(texts: list[str]) -> np.ndarray:\n",
    "    \"\"\"Embed `texts` with the `ep-20241231173413-pgjmk` model through `openai_embed`.\"\"\"\n",
    "    return await openai_embed(\n",
    "        texts,\n",
    "        model=\"ep-20241231173413-pgjmk\",\n",
    "        api_key=API,\n",
    "        base_url=\"https://ark.cn-beijing.volces.com/api/v3\",\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "397fcad24ce4d0ed",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-01-09T03:41:24.950307Z",
     "start_time": "2025-01-09T03:41:24.940353Z"
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:lightrag:Logger initialized for working directory: ../../llm_rag/paper_db/R000088_test1\n",
      "INFO:lightrag:Load KV llm_response_cache with 0 data\n",
      "INFO:lightrag:Load KV full_docs with 0 data\n",
      "INFO:lightrag:Load KV text_chunks with 0 data\n",
      "INFO:nano-vectordb:Init {'embedding_dim': 4096, 'metric': 'cosine', 'storage_file': '../../llm_rag/paper_db/R000088_test1/vdb_entities.json'} 0 data\n",
      "INFO:nano-vectordb:Init {'embedding_dim': 4096, 'metric': 'cosine', 'storage_file': '../../llm_rag/paper_db/R000088_test1/vdb_relationships.json'} 0 data\n",
      "INFO:nano-vectordb:Init {'embedding_dim': 4096, 'metric': 'cosine', 'storage_file': '../../llm_rag/paper_db/R000088_test1/vdb_chunks.json'} 0 data\n",
      "INFO:lightrag:Loaded document status storage with 0 records\n"
     ]
    }
   ],
   "source": [
    "# NOTE(review): embedding_dim=4096 is hard-coded; confirm it matches the value\n",
    "# returned by get_embedding_dim() (defined in a later cell).\n",
    "rag = LightRAG(\n",
    "    working_dir=WORKING_DIR,\n",
    "    llm_model_func=llm_model_func,\n",
    "    embedding_func=EmbeddingFunc(\n",
    "        embedding_dim=4096, max_token_size=8192, func=embedding_func\n",
    "    ),\n",
    "    chunk_token_size=512,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "1dc3603677f7484d",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-01-09T03:41:37.947456Z",
     "start_time": "2025-01-09T03:41:37.941901Z"
    }
   },
   "outputs": [],
   "source": [
    "with open(\n",
    "    \"../../llm_rag/example/R000088/auto/R000088_full_txt.md\", \"r\", encoding=\"utf-8\"\n",
    ") as f:\n",
    "    content = f.read()\n",
    "\n",
    "\n",
    "async def get_embedding_dim():\n",
    "    \"\"\"Return the embedding width by embedding one probe sentence.\n",
    "\n",
    "    Calls the notebook-level `embedding_func` and reads `shape[1]` of its result.\n",
    "    \"\"\"\n",
    "    test_text = [\"This is a test sentence.\"]\n",
    "    embedding = await embedding_func(test_text)\n",
    "    return embedding.shape[1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "6844202606acfbe5",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-01-09T03:41:39.608541Z",
     "start_time": "2025-01-09T03:41:39.165057Z"
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n"
     ]
    }
   ],
   "source": [
    "embedding_dimension = await get_embedding_dim()"
   ]
  },
  { "cell_type": "code", "execution_count": 8, "id": "d6273839d9681403", "metadata": { "ExecuteTime": { "end_time": "2025-01-09T03:44:34.295345Z", "start_time": "2025-01-09T03:41:48.324171Z" } }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "INFO:lightrag:Processing 1 new unique documents\n", "Processing batch 1: 0%| | 0/1 [00:00标签中,针对每个问题详细分析你的思考过程。然后在<回答>标签中给出所有问题的最终答案。\"\"\"" ] }, { "cell_type": "code", "execution_count": 10, "id": "7a6491385b050095", "metadata": { "ExecuteTime": { "end_time": "2025-01-09T03:45:40.829111Z", "start_time": "2025-01-09T03:45:13.530298Z" } }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n", "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n", "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n", "INFO:lightrag:Local query uses 5 entites, 12 relations, 3 text units\n", "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n", "INFO:lightrag:Global query uses 8 entites, 5 relations, 4 text units\n", "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n" ] }, { "name": "stdout", "output_type": "stream", 
"text": [ "<分析>\n", "1. **该文献主要研究的问题是什么?**\n", " - 思考过程:通过浏览论文内容,查找作者明确阐述研究目的的部分。文中多处提及“Our study was performed to explore whether folic acid treatment was associated with cancer outcomes and all-cause mortality after extended follow-up”,表明作者旨在探究叶酸治疗与癌症结局及全因死亡率之间的关系,尤其是在经过长期随访后。\n", "2. **该文献采用什么方法进行分析?**\n", " - 思考过程:寻找描述研究方法和数据分析过程的段落。文中提到“Survival curves were constructed using the Kaplan-Meier method and differences in survival between groups were analyzed using the log-rank test. Estimates of hazard ratios (HRs) with 95% CIs were obtained by using Cox proportional hazards regression models stratified by trial”,可以看出作者使用了Kaplan-Meier法构建生存曲线、log-rank检验分析组间生存差异以及Cox比例风险回归模型估计风险比等方法。\n", "3. **该文献的主要结论是什么?**\n", " - 思考过程:定位到论文中总结结论的部分,如“Conclusion Treatment with folic acid plus vitamin $\\mathsf{B}_{12}$ was associated with increased cancer outcomes and all-cause mortality in patients with ischemic heart disease in Norway, where there is no folic acid fortification of foods”,可知作者得出叶酸加维生素$\\mathsf{B}_{12}$治疗与癌症结局和全因死亡率增加有关的结论。\n", "<回答>\n", "1. 该文献主要研究的问题是:叶酸治疗与癌症结局及全因死亡率之间的关系,尤其是在经过长期随访后,叶酸治疗是否与癌症结局和全因死亡率相关。\n", "2. 该文献采用的分析方法包括:使用Kaplan-Meier法构建生存曲线、log-rank检验分析组间生存差异、Cox比例风险回归模型估计风险比等。\n", "3. 
该文献的主要结论是:在挪威没有叶酸强化食品的情况下,叶酸加维生素$\\mathsf{B}_{12}$治疗与缺血性心脏病患者的癌症结局和全因死亡率增加有关。\n",
      "\n",
      "**参考文献**\n",
      "- [VD] In2Norwegianhomocysteine-lowering trialsamongpatientswithischemicheart disease, there was a statistically nonsignificantincreaseincancerincidenceinthe groupsassignedtofolicacidtreatment.15,16 Our study was performed to explore whetherfolicacidtreatmentwasassociatedwithcanceroutcomesandall-cause mortality after extended follow-up.\n",
      "- [VD] Survivalcurveswereconstructedusing theKaplan-Meiermethodanddifferences insurvivalbetweengroupswereanalyzed usingthelog-ranktest.Estimatesofhazard ratios (HRs) with $95\\%$ CIs were obtainedbyusingCoxproportionalhazards regressionmodelsstratifiedbytrial.\n",
      "- [VD] Conclusion Treatment with folic acid plus vitamin $\\mathsf{B}_{12}$ was associated with increased cancer outcomes and all-cause mortality in patients with ischemic heart disease in Norway, where there is no folic acid fortification of foods.\n"
     ]
    }
   ],
   "source": [
    "resp = rag.query(prompt1, param=QueryParam(mode=\"mix\", top_k=5))\n",
    "print(resp)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4e5bfad24cb721a8",
   "metadata": {},
   "source": "#### split by character only"
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "44e2992dc95f8ce0",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-01-09T03:47:40.988796Z",
     "start_time": "2025-01-09T03:47:40.982648Z"
    }
   },
   "outputs": [],
   "source": [
    "WORKING_DIR = \"../../llm_rag/paper_db/R000088_test2\"\n",
    "# makedirs(exist_ok=True) replaces the exists()/mkdir() pair: it also creates\n",
    "# missing parent directories and does nothing when the directory already exists.\n",
    "os.makedirs(WORKING_DIR, exist_ok=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "62c63385d2d973d5",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-01-09T03:51:39.951329Z",
     "start_time": "2025-01-09T03:49:15.218976Z"
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:lightrag:Logger initialized for working directory: ../../llm_rag/paper_db/R000088_test2\n",
      "INFO:lightrag:Load KV llm_response_cache with 0 data\n",
      "INFO:lightrag:Load KV full_docs with 
0 data\n", "INFO:lightrag:Load KV text_chunks with 0 data\n", "INFO:nano-vectordb:Init {'embedding_dim': 4096, 'metric': 'cosine', 'storage_file': '../../llm_rag/paper_db/R000088_test2/vdb_entities.json'} 0 data\n", "INFO:nano-vectordb:Init {'embedding_dim': 4096, 'metric': 'cosine', 'storage_file': '../../llm_rag/paper_db/R000088_test2/vdb_relationships.json'} 0 data\n", "INFO:nano-vectordb:Init {'embedding_dim': 4096, 'metric': 'cosine', 'storage_file': '../../llm_rag/paper_db/R000088_test2/vdb_chunks.json'} 0 data\n", "INFO:lightrag:Loaded document status storage with 0 records\n", "INFO:lightrag:Processing 1 new unique documents\n", "Processing batch 1: 0%| | 0/1 [00:00\n", "- **该文献主要研究的问题是什么?**\n", " - **思考过程**:通过浏览论文的标题、摘要、引言等部分,寻找关于研究目的和问题的描述。论文标题为“Cancer Incidence and Mortality After Treatment With Folic Acid and Vitamin B12”,摘要中的“Objective”部分明确指出研究目的是“To evaluate effects of treatment with B vitamins on cancer outcomes and all-cause mortality in 2 randomized controlled trials”。因此,可以确定该文献主要研究的问题是评估B族维生素治疗对两项随机对照试验中癌症结局和全因死亡率的影响。\n", "- **该文献采用什么方法进行分析?**\n", " - **思考过程**:在论文的“METHODS”部分详细描述了研究方法。文中提到这是一个对两项随机、双盲、安慰剂对照临床试验(Norwegian Vitamin [NORVIT] trial和Western Norway B Vitamin Intervention Trial [WENBIT])数据的联合分析,并进行了观察性的试验后随访。具体包括对参与者进行分组干预(不同剂量的叶酸、维生素B12、维生素B6或安慰剂),收集临床信息和血样,分析循环B族维生素、同型半胱氨酸和可替宁等指标,并进行基因分型等,还涉及到多种统计分析方法,如计算预期癌症发生率、构建生存曲线、进行Cox比例风险回归模型分析等。\n", "- **该文献的主要结论是什么?**\n", " - **思考过程**:在论文的“Results”和“Conclusion”部分寻找主要结论。研究结果表明,在治疗期间,接受叶酸加维生素B12治疗的参与者血清叶酸浓度显著增加,且在后续随访中,该组癌症发病率、癌症死亡率和全因死亡率均有所上升,主要是肺癌发病率增加,而维生素B6治疗未显示出显著影响。结论部分明确指出“Treatment with folic acid plus vitamin $\\mathsf{B}_{12}$ was associated with increased cancer outcomes and all-cause mortality in patients with ischemic heart disease in Norway, where there is no folic acid fortification of foods”。\n", "\n", "\n", "<回答>\n", "- **主要研究问题**:评估B族维生素治疗对两项随机对照试验中癌症结局和全因死亡率的影响。\n", "- **研究方法**:采用对两项随机、双盲、安慰剂对照临床试验(Norwegian Vitamin [NORVIT] trial和Western Norway B Vitamin Intervention Trial 
[WENBIT])数据的联合分析,并进行观察性的试验后随访,涉及分组干预、多种指标检测以及多种统计分析方法。\n",
      "- **主要结论**:在挪威(食品中未添加叶酸),对于缺血性心脏病患者,叶酸加维生素B12治疗与癌症结局和全因死亡率的增加有关,而维生素B6治疗未显示出显著影响。\n",
      "\n",
      "**参考文献**\n",
      "- [VD] Cancer Incidence and Mortality After Treatment With Folic Acid and Vitamin B12\n",
      "- [VD] METHODS Study Design, Participants, and Study Intervention\n",
      "- [VD] RESULTS\n",
      "- [VD] Conclusion\n",
      "- [VD] Objective To evaluate effects of treatment with B vitamins on cancer outcomes and all-cause mortality in 2 randomized controlled trials.\n"
     ]
    }
   ],
   "source": [
    "resp = rag.query(prompt1, param=QueryParam(mode=\"mix\", top_k=5))\n",
    "print(resp)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7ba6fa79a2550d10",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}