mirror of
https://github.com/microsoft/graphrag.git
synced 2025-08-15 12:11:44 +00:00
Fix base_dir being resolved to a full local path when the storage or reporting type is not the file system. (#1096)
* Fix setting base_dir to full paths when not using the file system. * Add a general resolve_path helper.
This commit is contained in:
parent
ab29cc2a7e
commit
2d45ece9b6
@ -0,0 +1,4 @@
|
||||
{
|
||||
"type": "patch",
|
||||
"description": "fix setting base_dir to full paths when not using file system."
|
||||
}
|
@ -68,7 +68,7 @@ from .models import (
|
||||
UmapConfig,
|
||||
)
|
||||
from .read_dotenv import read_dotenv
|
||||
from .resolve_timestamp_path import resolve_timestamp_path
|
||||
from .resolve_path import resolve_path
|
||||
|
||||
__all__ = [
|
||||
"ApiKeyMissingError",
|
||||
@ -127,6 +127,6 @@ __all__ = [
|
||||
"load_config",
|
||||
"load_config_from_file",
|
||||
"read_dotenv",
|
||||
"resolve_timestamp_path",
|
||||
"resolve_path",
|
||||
"search_for_config_in_root_dir",
|
||||
]
|
||||
|
@ -7,8 +7,9 @@ from pathlib import Path
|
||||
|
||||
from .config_file_loader import load_config_from_file, search_for_config_in_root_dir
|
||||
from .create_graphrag_config import create_graphrag_config
|
||||
from .enums import ReportingType, StorageType
|
||||
from .models.graph_rag_config import GraphRagConfig
|
||||
from .resolve_timestamp_path import resolve_timestamp_path
|
||||
from .resolve_path import resolve_path
|
||||
|
||||
|
||||
def load_config(
|
||||
@ -47,19 +48,19 @@ def load_config(
|
||||
else:
|
||||
config = create_graphrag_config(root_dir=str(root))
|
||||
|
||||
if run_id:
|
||||
config.storage.base_dir = str(
|
||||
resolve_timestamp_path((root / config.storage.base_dir).resolve(), run_id)
|
||||
config.storage.base_dir = str(
|
||||
resolve_path(
|
||||
config.storage.base_dir,
|
||||
root if config.storage.type == StorageType.file else None,
|
||||
run_id,
|
||||
)
|
||||
config.reporting.base_dir = str(
|
||||
resolve_timestamp_path((root / config.reporting.base_dir).resolve(), run_id)
|
||||
)
|
||||
else:
|
||||
config.storage.base_dir = str(
|
||||
resolve_timestamp_path((root / config.storage.base_dir).resolve())
|
||||
)
|
||||
config.reporting.base_dir = str(
|
||||
resolve_timestamp_path((root / config.reporting.base_dir).resolve())
|
||||
)
|
||||
config.reporting.base_dir = str(
|
||||
resolve_path(
|
||||
config.reporting.base_dir,
|
||||
root if config.reporting.type == ReportingType.file else None,
|
||||
run_id,
|
||||
)
|
||||
)
|
||||
|
||||
return config
|
||||
|
@ -79,9 +79,9 @@ def _resolve_timestamp_path_with_dir(
|
||||
return _resolve_timestamp_path_with_value(path, timestamp_dirs[0].name)
|
||||
|
||||
|
||||
def resolve_timestamp_path(
|
||||
def _resolve_timestamp_path(
|
||||
path: str | Path,
|
||||
pattern_or_timestamp_value: re.Pattern[str] | str = re.compile(r"^\d{8}-\d{6}$"),
|
||||
pattern_or_timestamp_value: re.Pattern[str] | str | None = None,
|
||||
) -> Path:
|
||||
r"""Timestamp path resolver.
|
||||
|
||||
@ -110,6 +110,43 @@ def resolve_timestamp_path(
|
||||
If the parent directory expecting to contain timestamp directories does not exist or is not a directory.
|
||||
Or if no timestamp directories are found in the parent directory that match the pattern.
|
||||
"""
|
||||
if not pattern_or_timestamp_value:
|
||||
pattern_or_timestamp_value = re.compile(r"^\d{8}-\d{6}$")
|
||||
if isinstance(pattern_or_timestamp_value, str):
|
||||
return _resolve_timestamp_path_with_value(path, pattern_or_timestamp_value)
|
||||
return _resolve_timestamp_path_with_dir(path, pattern_or_timestamp_value)
|
||||
|
||||
|
||||
def resolve_path(
    path_to_resolve: Path | str,
    root_dir: Path | str | None = None,
    pattern_or_timestamp_value: re.Pattern[str] | str | None = None,
) -> Path:
    """Resolve a path, optionally anchoring it to a root directory.

    When a root directory is given, the path is joined onto it and resolved;
    otherwise the path is only converted to a ``Path`` and left as provided.
    Timestamp variables in the path are then resolved, either with the given
    string value or with the latest available timestamp directory that matches
    the given pattern.

    Parameters
    ----------
    path_to_resolve : Path | str
        The path to resolve.
    root_dir : Path | str | None, default=None
        The root directory to resolve the path against. A falsy value
        (``None`` or an empty string) skips this step.
    pattern_or_timestamp_value : re.Pattern[str] | str | None, default=None
        Either the timestamp value to substitute (when a string) or the
        pattern used to match timestamp directories. Passed through unchanged
        to the timestamp resolver.

    Returns
    -------
    Path
        The resolved path.
    """
    # Only anchor (and resolve) when a root was supplied, so callers can pass
    # None to keep the path exactly as configured.
    anchored = (
        (Path(root_dir) / path_to_resolve).resolve()
        if root_dir
        else Path(path_to_resolve)
    )
    return _resolve_timestamp_path(anchored, pattern_or_timestamp_value)
|
@ -9,7 +9,7 @@ from pathlib import Path
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from graphrag.config import load_config, resolve_timestamp_path
|
||||
from graphrag.config import load_config, resolve_path
|
||||
from graphrag.index.progress import PrintProgressReporter
|
||||
|
||||
from . import api
|
||||
@ -34,9 +34,7 @@ def run_global_search(
|
||||
config = load_config(root, config_filepath)
|
||||
|
||||
if data_dir:
|
||||
config.storage.base_dir = str(
|
||||
resolve_timestamp_path((root / data_dir).resolve())
|
||||
)
|
||||
config.storage.base_dir = str(resolve_path(data_dir, root))
|
||||
|
||||
data_path = Path(config.storage.base_dir).resolve()
|
||||
|
||||
@ -112,9 +110,7 @@ def run_local_search(
|
||||
config = load_config(root, config_filepath)
|
||||
|
||||
if data_dir:
|
||||
config.storage.base_dir = str(
|
||||
resolve_timestamp_path((root / data_dir).resolve())
|
||||
)
|
||||
config.storage.base_dir = str(resolve_path(data_dir, root))
|
||||
|
||||
data_path = Path(config.storage.base_dir).resolve()
|
||||
|
||||
|
42
tests/unit/config/test_resolve_path.py
Normal file
42
tests/unit/config/test_resolve_path.py
Normal file
@ -0,0 +1,42 @@
|
||||
# Copyright (c) 2024 Microsoft Corporation.
|
||||
# Licensed under the MIT License
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
from graphrag.config.resolve_path import resolve_path
|
||||
|
||||
|
||||
def test_resolve_path_no_timestamp_with_run_id():
    """A path with no timestamp variable comes back unchanged when a timestamp value is given."""
    plain_path = Path("path/to/data")
    resolved = resolve_path(plain_path, pattern_or_timestamp_value="20240812-121000")
    assert resolved == plain_path
|
||||
|
||||
|
||||
def test_resolve_path_no_timestamp_without_run_id():
    """A path with no timestamp variable comes back unchanged when no timestamp value is given."""
    plain_path = Path("path/to/data")
    resolved = resolve_path(plain_path)
    assert resolved == plain_path
|
||||
|
||||
|
||||
def test_resolve_path_with_timestamp_and_run_id():
    """The ``${timestamp}`` variable is replaced by the supplied string value."""
    templated = Path("some/path/${timestamp}/data")
    resolved = resolve_path(templated, pattern_or_timestamp_value="20240812")
    assert resolved == Path("some/path/20240812/data")
|
||||
|
||||
|
||||
def test_resolve_path_with_timestamp_and_inferred_directory():
    """Without a timestamp value, ``${timestamp}`` is filled in from the fixture directories."""
    # Fixtures live next to this test module, so anchor on the module's folder.
    test_dir = Path(__file__).parent
    templated = test_dir / "fixtures/timestamp_dirs/${timestamp}/data"
    resolved = resolve_path(templated)
    assert resolved == test_dir / "fixtures/timestamp_dirs/20240812-120000/data"
|
||||
|
||||
|
||||
def test_resolve_path_absolute():
    """A relative path resolved against a root directory yields an absolute path."""
    # Fixtures live next to this test module; use that folder as the root.
    test_dir = Path(__file__).parent
    relative = "fixtures/timestamp_dirs/${timestamp}/data"
    resolved = resolve_path(relative, test_dir)
    assert resolved == test_dir / "fixtures/timestamp_dirs/20240812-120000/data"
    assert resolved.is_absolute()
|
@ -1,33 +0,0 @@
|
||||
# Copyright (c) 2024 Microsoft Corporation.
|
||||
# Licensed under the MIT License
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
from graphrag.config.resolve_timestamp_path import resolve_timestamp_path
|
||||
|
||||
|
||||
def test_resolve_timestamp_path_no_timestamp_with_run_id():
    """A path with no timestamp variable comes back unchanged when a timestamp value is given."""
    plain_path = Path("path/to/data")
    resolved = resolve_timestamp_path(plain_path, "20240812-121000")
    assert resolved == plain_path
|
||||
|
||||
|
||||
def test_resolve_timestamp_path_no_timestamp_without_run_id():
    """A path with no timestamp variable comes back unchanged when no timestamp value is given."""
    plain_path = Path("path/to/data")
    resolved = resolve_timestamp_path(plain_path)
    assert resolved == plain_path
|
||||
|
||||
|
||||
def test_resolve_timestamp_path_with_timestamp_and_run_id():
    """The ``${timestamp}`` variable is replaced by the supplied string value."""
    templated = Path("some/path/${timestamp}/data")
    resolved = resolve_timestamp_path(templated, "20240812")
    assert resolved == Path("some/path/20240812/data")
|
||||
|
||||
|
||||
def test_resolve_timestamp_path_with_timestamp_and_inferred_directory():
    """Without a timestamp value, ``${timestamp}`` is filled in from the fixture directories."""
    # Fixtures live next to this test module, so anchor on the module's folder.
    test_dir = Path(__file__).parent
    templated = test_dir / "fixtures/timestamp_dirs/${timestamp}/data"
    resolved = resolve_timestamp_path(templated)
    assert resolved == test_dir / "fixtures/timestamp_dirs/20240812-120000/data"
|
Loading…
x
Reference in New Issue
Block a user