From 2d45ece9b65111fe8729b983f3d30f455afb4ab7 Mon Sep 17 00:00:00 2001 From: Derek Worthen Date: Wed, 4 Sep 2024 11:33:44 -0700 Subject: [PATCH] fix setting base_dir to full paths when not using file system. (#1096) * fix setting base_dir to full paths when not using file system. * add general resolve_path --- .../patch-20240904173227165702.json | 4 ++ graphrag/config/__init__.py | 4 +- graphrag/config/load_config.py | 27 ++++++------ ...olve_timestamp_path.py => resolve_path.py} | 41 +++++++++++++++++- graphrag/query/cli.py | 10 ++--- tests/unit/config/test_resolve_path.py | 42 +++++++++++++++++++ .../config/test_resolve_timestamp_path.py | 33 --------------- 7 files changed, 104 insertions(+), 57 deletions(-) create mode 100644 .semversioner/next-release/patch-20240904173227165702.json rename graphrag/config/{resolve_timestamp_path.py => resolve_path.py} (72%) create mode 100644 tests/unit/config/test_resolve_path.py delete mode 100644 tests/unit/config/test_resolve_timestamp_path.py diff --git a/.semversioner/next-release/patch-20240904173227165702.json b/.semversioner/next-release/patch-20240904173227165702.json new file mode 100644 index 00000000..4010ac4f --- /dev/null +++ b/.semversioner/next-release/patch-20240904173227165702.json @@ -0,0 +1,4 @@ +{ + "type": "patch", + "description": "fix setting base_dir to full paths when not using file system." +} diff --git a/graphrag/config/__init__.py b/graphrag/config/__init__.py index c65795f0..fae2ba3a 100644 --- a/graphrag/config/__init__.py +++ b/graphrag/config/__init__.py @@ -68,7 +68,7 @@ from .models import ( UmapConfig, ) from .read_dotenv import read_dotenv -from .resolve_timestamp_path import resolve_timestamp_path +from .resolve_path import resolve_path __all__ = [ "ApiKeyMissingError", @@ -127,6 +127,6 @@ __all__ = [ "load_config", "load_config_from_file", "read_dotenv", - "resolve_timestamp_path", + "resolve_path", "search_for_config_in_root_dir", ] diff --git a/graphrag/config/load_config.py b/graphrag/config/load_config.py index 54c9271f..6c0f9fe5 100644 --- a/graphrag/config/load_config.py +++ b/graphrag/config/load_config.py @@ -7,8 +7,9 @@ from pathlib import Path from .config_file_loader import load_config_from_file, search_for_config_in_root_dir from .create_graphrag_config import create_graphrag_config +from .enums import ReportingType, StorageType from .models.graph_rag_config import GraphRagConfig -from .resolve_timestamp_path import resolve_timestamp_path +from .resolve_path import resolve_path def load_config( @@ -47,19 +48,19 @@ def load_config( else: config = create_graphrag_config(root_dir=str(root)) - if run_id: - config.storage.base_dir = str( - resolve_timestamp_path((root / config.storage.base_dir).resolve(), run_id) + config.storage.base_dir = str( + resolve_path( + config.storage.base_dir, + root if config.storage.type == StorageType.file else None, + run_id, ) - config.reporting.base_dir = str( - resolve_timestamp_path((root / config.reporting.base_dir).resolve(), run_id) - ) - else: - config.storage.base_dir = str( - resolve_timestamp_path((root / config.storage.base_dir).resolve()) - ) - config.reporting.base_dir = str( - resolve_timestamp_path((root / config.reporting.base_dir).resolve()) + ) + config.reporting.base_dir = str( + resolve_path( + config.reporting.base_dir, + root if config.reporting.type == ReportingType.file else None, + run_id, ) + ) return config diff --git a/graphrag/config/resolve_timestamp_path.py b/graphrag/config/resolve_path.py similarity index 72% rename from graphrag/config/resolve_timestamp_path.py rename to graphrag/config/resolve_path.py index 492f6201..bc4131b6 100644 --- a/graphrag/config/resolve_timestamp_path.py +++ b/graphrag/config/resolve_path.py @@ -79,9 +79,9 @@ def _resolve_timestamp_path_with_dir( return _resolve_timestamp_path_with_value(path, timestamp_dirs[0].name) -def resolve_timestamp_path( +def _resolve_timestamp_path( path: str | Path, - pattern_or_timestamp_value: re.Pattern[str] | str = re.compile(r"^\d{8}-\d{6}$"), + pattern_or_timestamp_value: re.Pattern[str] | str | None = None, ) -> Path: r"""Timestamp path resolver. @@ -110,6 +110,43 @@ def resolve_timestamp_path( If the parent directory expecting to contain timestamp directories does not exist or is not a directory. Or if no timestamp directories are found in the parent directory that match the pattern. """ + if not pattern_or_timestamp_value: + pattern_or_timestamp_value = re.compile(r"^\d{8}-\d{6}$") if isinstance(pattern_or_timestamp_value, str): return _resolve_timestamp_path_with_value(path, pattern_or_timestamp_value) return _resolve_timestamp_path_with_dir(path, pattern_or_timestamp_value) + + +def resolve_path( + path_to_resolve: Path | str, + root_dir: Path | str | None = None, + pattern_or_timestamp_value: re.Pattern[str] | str | None = None, +) -> Path: + """Resolve the path. + + Resolves any timestamp variables by either using the provided timestamp value if string or + by looking up the latest available timestamp directory that matches the given pattern. + Resolves the path against the root directory if provided. + + Parameters + ---------- + path_to_resolve : Path | str + The path to resolve. + root_dir : Path | str | None default=None + The root directory to resolve the path from, if provided. + pattern_or_timestamp_value : re.Pattern[str] | str, default=None + The pattern to use to match the timestamp directories or the timestamp value to use. + If a string is provided, the path will be resolved with the given string value. + Otherwise, the path will be resolved with the latest available timestamp directory + that matches the given pattern. + + Returns + ------- + Path + The resolved path. + """ + if root_dir: + path_to_resolve = (Path(root_dir) / path_to_resolve).resolve() + else: + path_to_resolve = Path(path_to_resolve) + return _resolve_timestamp_path(path_to_resolve, pattern_or_timestamp_value) diff --git a/graphrag/query/cli.py b/graphrag/query/cli.py index f3847c0c..3ac76f81 100644 --- a/graphrag/query/cli.py +++ b/graphrag/query/cli.py @@ -9,7 +9,7 @@ from pathlib import Path import pandas as pd -from graphrag.config import load_config, resolve_timestamp_path +from graphrag.config import load_config, resolve_path from graphrag.index.progress import PrintProgressReporter from . import api @@ -34,9 +34,7 @@ def run_global_search( config = load_config(root, config_filepath) if data_dir: - config.storage.base_dir = str( - resolve_timestamp_path((root / data_dir).resolve()) - ) + config.storage.base_dir = str(resolve_path(data_dir, root)) data_path = Path(config.storage.base_dir).resolve() @@ -112,9 +110,7 @@ def run_local_search( config = load_config(root, config_filepath) if data_dir: - config.storage.base_dir = str( - resolve_timestamp_path((root / data_dir).resolve()) - ) + config.storage.base_dir = str(resolve_path(data_dir, root)) data_path = Path(config.storage.base_dir).resolve() diff --git a/tests/unit/config/test_resolve_path.py b/tests/unit/config/test_resolve_path.py new file mode 100644 index 00000000..bd82d282 --- /dev/null +++ b/tests/unit/config/test_resolve_path.py @@ -0,0 +1,42 @@ +# Copyright (c) 2024 Microsoft Corporation. +# Licensed under the MIT License + +from pathlib import Path + +from graphrag.config.resolve_path import resolve_path + + +def test_resolve_path_no_timestamp_with_run_id(): + path = Path("path/to/data") + result = resolve_path(path, pattern_or_timestamp_value="20240812-121000") + assert result == path + + +def test_resolve_path_no_timestamp_without_run_id(): + path = Path("path/to/data") + result = resolve_path(path) + assert result == path + + +def test_resolve_path_with_timestamp_and_run_id(): + path = Path("some/path/${timestamp}/data") + expected = Path("some/path/20240812/data") + result = resolve_path(path, pattern_or_timestamp_value="20240812") + assert result == expected + + +def test_resolve_path_with_timestamp_and_inferred_directory(): + cwd = Path(__file__).parent + path = cwd / "fixtures/timestamp_dirs/${timestamp}/data" + expected = cwd / "fixtures/timestamp_dirs/20240812-120000/data" + result = resolve_path(path) + assert result == expected + + +def test_resolve_path_absolute(): + cwd = Path(__file__).parent + path = "fixtures/timestamp_dirs/${timestamp}/data" + expected = cwd / "fixtures/timestamp_dirs/20240812-120000/data" + result = resolve_path(path, cwd) + assert result == expected + assert result.is_absolute() diff --git a/tests/unit/config/test_resolve_timestamp_path.py b/tests/unit/config/test_resolve_timestamp_path.py deleted file mode 100644 index 0af0f11a..00000000 --- a/tests/unit/config/test_resolve_timestamp_path.py +++ /dev/null @@ -1,33 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. -# Licensed under the MIT License - -from pathlib import Path - -from graphrag.config.resolve_timestamp_path import resolve_timestamp_path - - -def test_resolve_timestamp_path_no_timestamp_with_run_id(): - path = Path("path/to/data") - result = resolve_timestamp_path(path, "20240812-121000") - assert result == path - - -def test_resolve_timestamp_path_no_timestamp_without_run_id(): - path = Path("path/to/data") - result = resolve_timestamp_path(path) - assert result == path - - -def test_resolve_timestamp_path_with_timestamp_and_run_id(): - path = Path("some/path/${timestamp}/data") - expected = Path("some/path/20240812/data") - result = resolve_timestamp_path(path, "20240812") - assert result == expected - - -def test_resolve_timestamp_path_with_timestamp_and_inferred_directory(): - cwd = Path(__file__).parent - path = cwd / "fixtures/timestamp_dirs/${timestamp}/data" - expected = cwd / "fixtures/timestamp_dirs/20240812-120000/data" - result = resolve_timestamp_path(path) - assert result == expected