Add stricter filtering and tests for cli data directory discovery (#910)

* Add stricter filtering and tests for cli data directory discovery

* Semver

* Ignore ruff on error type

* Format

* Fix for windows paths

* Fix for windows paths

* Uncomment blob tests

* Sort by timestamp name instead of modified date

* Format

* Add additional folder name test
This commit is contained in:
Nathan Evans 2024-08-13 16:34:14 -07:00 committed by GitHub
parent d68e323193
commit ac504e31a0
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
14 changed files with 42 additions and 2 deletions

View File

@ -0,0 +1,4 @@
{
"type": "patch",
"description": "Improves filtering for data dir inferring"
}

View File

@ -4,7 +4,7 @@
"""Command line interface for the query module."""
import asyncio
import os
import re
from pathlib import Path
from typing import cast
@ -129,7 +129,9 @@ def _infer_data_dir(root: str) -> str:
output = Path(root) / "output"
# use the latest data-run folder
if output.exists():
folders = sorted(output.iterdir(), key=os.path.getmtime, reverse=True)
expr = re.compile(r"\d{8}-\d{6}")
filtered = [f for f in output.iterdir() if f.is_dir() and expr.match(f.name)]
folders = sorted(filtered, key=lambda f: f.name, reverse=True)
if len(folders) > 0:
folder = folders[0]
return str((folder / "artifacts").absolute())

View File

@ -0,0 +1,2 @@
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License

View File

@ -0,0 +1,32 @@
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License
from pathlib import Path
import pytest
from graphrag.query.cli import _infer_data_dir
def test_infer_data_dir():
    """The 'defaults' fixture tree resolves to the 20240812-121000 run folder."""
    fixture_root = "./tests/unit/query/data/defaults"
    inferred = Path(_infer_data_dir(fixture_root))
    # The second-to-last path component is the run folder that was selected.
    assert inferred.parts[-2] == "20240812-121000"
def test_infer_data_dir_ignores_hidden_files():
    """Hidden entries (names starting with '.') must not be picked as the latest run.

    The 'hidden' fixture tree is expected to still resolve to the
    20240812-121000 run folder.
    """
    fixture_root = "./tests/unit/query/data/hidden"
    inferred = Path(_infer_data_dir(fixture_root))
    # The second-to-last path component is the run folder that was selected.
    assert inferred.parts[-2] == "20240812-121000"
def test_infer_data_dir_ignores_non_numeric():
    """The 'non-numeric' fixture tree still resolves to the 20240812-121000 run folder."""
    fixture_root = "./tests/unit/query/data/non-numeric"
    inferred = Path(_infer_data_dir(fixture_root))
    # The second-to-last path component is the run folder that was selected.
    assert inferred.parts[-2] == "20240812-121000"
def test_infer_data_dir_throws_on_no_match():
    """Inferring a data dir from the 'empty' fixture tree raises ValueError."""
    root = "./tests/unit/query/data/empty"
    # ValueError is what is actually thrown, so the broad-exception lint rule
    # (PT011) is suppressed explicitly for this line only.
    with pytest.raises(ValueError):  # noqa: PT011
        _infer_data_dir(root)