fix setting base_dir to full paths when not using file system. (#1096)

* fix setting base_dir to full paths when not using file system.

* add general resolve_path
This commit is contained in:
Derek Worthen 2024-09-04 11:33:44 -07:00 committed by GitHub
parent ab29cc2a7e
commit 2d45ece9b6
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 104 additions and 57 deletions

View File

@ -0,0 +1,4 @@
{
"type": "patch",
"description": "fix setting base_dir to full paths when not using file system."
}

View File

@ -68,7 +68,7 @@ from .models import (
UmapConfig,
)
from .read_dotenv import read_dotenv
from .resolve_timestamp_path import resolve_timestamp_path
from .resolve_path import resolve_path
__all__ = [
"ApiKeyMissingError",
@ -127,6 +127,6 @@ __all__ = [
"load_config",
"load_config_from_file",
"read_dotenv",
"resolve_timestamp_path",
"resolve_path",
"search_for_config_in_root_dir",
]

View File

@ -7,8 +7,9 @@ from pathlib import Path
from .config_file_loader import load_config_from_file, search_for_config_in_root_dir
from .create_graphrag_config import create_graphrag_config
from .enums import ReportingType, StorageType
from .models.graph_rag_config import GraphRagConfig
from .resolve_timestamp_path import resolve_timestamp_path
from .resolve_path import resolve_path
def load_config(
@ -47,19 +48,19 @@ def load_config(
else:
config = create_graphrag_config(root_dir=str(root))
if run_id:
config.storage.base_dir = str(
resolve_timestamp_path((root / config.storage.base_dir).resolve(), run_id)
config.storage.base_dir = str(
resolve_path(
config.storage.base_dir,
root if config.storage.type == StorageType.file else None,
run_id,
)
config.reporting.base_dir = str(
resolve_timestamp_path((root / config.reporting.base_dir).resolve(), run_id)
)
else:
config.storage.base_dir = str(
resolve_timestamp_path((root / config.storage.base_dir).resolve())
)
config.reporting.base_dir = str(
resolve_timestamp_path((root / config.reporting.base_dir).resolve())
)
config.reporting.base_dir = str(
resolve_path(
config.reporting.base_dir,
root if config.reporting.type == ReportingType.file else None,
run_id,
)
)
return config

View File

@ -79,9 +79,9 @@ def _resolve_timestamp_path_with_dir(
return _resolve_timestamp_path_with_value(path, timestamp_dirs[0].name)
def resolve_timestamp_path(
def _resolve_timestamp_path(
path: str | Path,
pattern_or_timestamp_value: re.Pattern[str] | str = re.compile(r"^\d{8}-\d{6}$"),
pattern_or_timestamp_value: re.Pattern[str] | str | None = None,
) -> Path:
r"""Timestamp path resolver.
@ -110,6 +110,43 @@ def resolve_timestamp_path(
If the parent directory expected to contain timestamp directories does not exist or is not a directory.
Or if no timestamp directories are found in the parent directory that match the pattern.
"""
if not pattern_or_timestamp_value:
pattern_or_timestamp_value = re.compile(r"^\d{8}-\d{6}$")
if isinstance(pattern_or_timestamp_value, str):
return _resolve_timestamp_path_with_value(path, pattern_or_timestamp_value)
return _resolve_timestamp_path_with_dir(path, pattern_or_timestamp_value)
def resolve_path(
    path_to_resolve: Path | str,
    root_dir: Path | str | None = None,
    pattern_or_timestamp_value: re.Pattern[str] | str | None = None,
) -> Path:
    """Anchor a path to an optional root, then resolve its timestamp portion.

    When ``root_dir`` is truthy the path is joined onto it and passed through
    ``Path.resolve()``. Otherwise the path is only converted to a ``Path`` and
    is not anchored or made absolute here, which keeps non-file-system
    locations from being rewritten as local absolute paths.

    The result is then handed to the timestamp resolver together with
    ``pattern_or_timestamp_value``.

    Parameters
    ----------
    path_to_resolve : Path | str
        The path to resolve.
    root_dir : Path | str | None, default=None
        Directory to resolve ``path_to_resolve`` against. Skipped when falsy.
    pattern_or_timestamp_value : re.Pattern[str] | str | None, default=None
        A string is used directly as the timestamp value. A compiled pattern
        is used to match timestamp directories. ``None`` lets the timestamp
        resolver fall back to its default pattern.

    Returns
    -------
    Path
        The resolved path.
    """
    candidate = Path(path_to_resolve)
    if root_dir:
        # Only rooted paths are resolved against the file system.
        candidate = (Path(root_dir) / candidate).resolve()
    return _resolve_timestamp_path(candidate, pattern_or_timestamp_value)

View File

@ -9,7 +9,7 @@ from pathlib import Path
import pandas as pd
from graphrag.config import load_config, resolve_timestamp_path
from graphrag.config import load_config, resolve_path
from graphrag.index.progress import PrintProgressReporter
from . import api
@ -34,9 +34,7 @@ def run_global_search(
config = load_config(root, config_filepath)
if data_dir:
config.storage.base_dir = str(
resolve_timestamp_path((root / data_dir).resolve())
)
config.storage.base_dir = str(resolve_path(data_dir, root))
data_path = Path(config.storage.base_dir).resolve()
@ -112,9 +110,7 @@ def run_local_search(
config = load_config(root, config_filepath)
if data_dir:
config.storage.base_dir = str(
resolve_timestamp_path((root / data_dir).resolve())
)
config.storage.base_dir = str(resolve_path(data_dir, root))
data_path = Path(config.storage.base_dir).resolve()

View File

@ -0,0 +1,42 @@
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License
from pathlib import Path
from graphrag.config.resolve_path import resolve_path
def test_resolve_path_no_timestamp_with_run_id():
    """A path without a timestamp placeholder is unchanged when a run id is given."""
    plain = Path("path/to/data")
    assert resolve_path(plain, pattern_or_timestamp_value="20240812-121000") == plain
def test_resolve_path_no_timestamp_without_run_id():
    """A path without a timestamp placeholder is unchanged when no run id is given."""
    plain = Path("path/to/data")
    assert resolve_path(plain) == plain
def test_resolve_path_with_timestamp_and_run_id():
    """An explicit string value replaces the ``${timestamp}`` placeholder."""
    templated = Path("some/path/${timestamp}/data")
    resolved = resolve_path(templated, pattern_or_timestamp_value="20240812")
    assert resolved == Path("some/path/20240812/data")
def test_resolve_path_with_timestamp_and_inferred_directory():
    """With no value given, the placeholder is filled from the fixture directory on disk."""
    here = Path(__file__).parent
    templated = here / "fixtures/timestamp_dirs/${timestamp}/data"
    resolved = resolve_path(templated)
    assert resolved == here / "fixtures/timestamp_dirs/20240812-120000/data"
def test_resolve_path_absolute():
    """A relative path plus a root directory yields an absolute, timestamp-resolved path."""
    here = Path(__file__).parent
    resolved = resolve_path("fixtures/timestamp_dirs/${timestamp}/data", here)
    assert resolved == here / "fixtures/timestamp_dirs/20240812-120000/data"
    assert resolved.is_absolute()

View File

@ -1,33 +0,0 @@
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License
from pathlib import Path
from graphrag.config.resolve_timestamp_path import resolve_timestamp_path
def test_resolve_timestamp_path_no_timestamp_with_run_id():
    """A path without a timestamp placeholder is unchanged when a run id is given."""
    plain = Path("path/to/data")
    assert resolve_timestamp_path(plain, "20240812-121000") == plain
def test_resolve_timestamp_path_no_timestamp_without_run_id():
    """A path without a timestamp placeholder is unchanged when no run id is given."""
    plain = Path("path/to/data")
    assert resolve_timestamp_path(plain) == plain
def test_resolve_timestamp_path_with_timestamp_and_run_id():
    """An explicit string value replaces the ``${timestamp}`` placeholder."""
    templated = Path("some/path/${timestamp}/data")
    resolved = resolve_timestamp_path(templated, "20240812")
    assert resolved == Path("some/path/20240812/data")
def test_resolve_timestamp_path_with_timestamp_and_inferred_directory():
    """With no value given, the placeholder is filled from the fixture directory on disk."""
    here = Path(__file__).parent
    templated = here / "fixtures/timestamp_dirs/${timestamp}/data"
    resolved = resolve_timestamp_path(templated)
    assert resolved == here / "fixtures/timestamp_dirs/20240812-120000/data"