Add stricter filtering and tests for CLI data directory discovery (#910)

* Add stricter filtering and tests for CLI data directory discovery

* Semver

* Ignore ruff on error type

* Format

* Fix for Windows paths

* Fix for Windows paths

* Uncomment blob tests

* Sort by timestamp name instead of modified date

* Format

* Add additional folder name test
This commit is contained in:
Nathan Evans 2024-08-13 16:34:14 -07:00 committed by GitHub
parent d68e323193
commit ac504e31a0
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
14 changed files with 42 additions and 2 deletions

View File

@ -0,0 +1,4 @@
{
"type": "patch",
"description": "Improves filtering for data dir inferring"
}

View File

@ -4,7 +4,7 @@
"""Command line interface for the query module.""" """Command line interface for the query module."""
import asyncio import asyncio
import os import re
from pathlib import Path from pathlib import Path
from typing import cast from typing import cast
@ -129,7 +129,9 @@ def _infer_data_dir(root: str) -> str:
output = Path(root) / "output" output = Path(root) / "output"
# use the latest data-run folder # use the latest data-run folder
if output.exists(): if output.exists():
folders = sorted(output.iterdir(), key=os.path.getmtime, reverse=True) expr = re.compile(r"\d{8}-\d{6}")
filtered = [f for f in output.iterdir() if f.is_dir() and expr.match(f.name)]
folders = sorted(filtered, key=lambda f: f.name, reverse=True)
if len(folders) > 0: if len(folders) > 0:
folder = folders[0] folder = folders[0]
return str((folder / "artifacts").absolute()) return str((folder / "artifacts").absolute())

View File

@ -0,0 +1,2 @@
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License

View File

@ -0,0 +1,32 @@
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License
from pathlib import Path
import pytest
from graphrag.query.cli import _infer_data_dir
def test_infer_data_dir():
    """Check which run folder is inferred for the ``defaults`` fixture root."""
    inferred = _infer_data_dir("./tests/unit/query/data/defaults")
    # The run folder is the second-to-last component of the returned path.
    run_folder = Path(inferred).parts[-2]
    assert run_folder == "20240812-121000"
def test_infer_data_dir_ignores_hidden_files():
    """Expect the timestamped run folder, not a hidden ('.'-prefixed) entry.

    NOTE(review): assumes the ``hidden`` fixture contains a dot-prefixed entry
    that could otherwise be picked as the latest data directory -- confirm.
    """
    inferred = _infer_data_dir("./tests/unit/query/data/hidden")
    # The run folder is the second-to-last component of the returned path.
    run_folder = Path(inferred).parts[-2]
    assert run_folder == "20240812-121000"
def test_infer_data_dir_ignores_non_numeric():
    """Expect the timestamped run folder to be inferred for ``non-numeric``.

    NOTE(review): assumes the fixture also holds an entry whose name is not a
    numeric timestamp -- confirm against the fixture directory.
    """
    inferred = _infer_data_dir("./tests/unit/query/data/non-numeric")
    # The run folder is the second-to-last component of the returned path.
    run_folder = Path(inferred).parts[-2]
    assert run_folder == "20240812-121000"
def test_infer_data_dir_throws_on_no_match():
    """Expect ``ValueError`` when inferring a data directory for ``empty``."""
    root = "./tests/unit/query/data/empty"
    # ValueError is what is actually thrown, so the broad-exception lint rule
    # (PT011) is suppressed. The directive uses the ``noqa: CODE`` form so that
    # only that rule is silenced on this line.
    with pytest.raises(ValueError):  # noqa: PT011
        _infer_data_dir(root)