From ac504e31a097de065b7dd41dc52a8b948321a89b Mon Sep 17 00:00:00 2001 From: Nathan Evans Date: Tue, 13 Aug 2024 16:34:14 -0700 Subject: [PATCH] Add stricter filtering and tests for cli data directory discovery (#910) * Add stricter filtering and tests for cli data directory discovery * Semver * Ignore ruff on error type * Format * Fix for windows paths * Fix for windows paths * Uncomment blob tests * Sort by timestamp name instead of modified date * Format * Add additional folder name test --- .../patch-20240812232353903544.json | 4 +++ graphrag/query/cli.py | 6 ++-- tests/unit/query/__init__.py | 2 ++ .../defaults/output/20240812-120000/empty.txt | 0 .../defaults/output/20240812-121000/empty.txt | 0 .../query/data/empty/something-else/empty.txt | 0 .../data/hidden/output/.another/empty.txt | 0 tests/unit/query/data/hidden/output/.hidden | 0 .../hidden/output/20240812-120000/empty.txt | 0 .../hidden/output/20240812-121000/empty.txt | 0 .../output/20240812-120000/empty.txt | 0 .../output/20240812-121000/empty.txt | 0 .../output/something-else/empty.txt | 0 tests/unit/query/test_infer_data_dir.py | 32 +++++++++++++++++++ 14 files changed, 42 insertions(+), 2 deletions(-) create mode 100644 .semversioner/next-release/patch-20240812232353903544.json create mode 100644 tests/unit/query/__init__.py create mode 100644 tests/unit/query/data/defaults/output/20240812-120000/empty.txt create mode 100644 tests/unit/query/data/defaults/output/20240812-121000/empty.txt create mode 100644 tests/unit/query/data/empty/something-else/empty.txt create mode 100644 tests/unit/query/data/hidden/output/.another/empty.txt create mode 100644 tests/unit/query/data/hidden/output/.hidden create mode 100644 tests/unit/query/data/hidden/output/20240812-120000/empty.txt create mode 100644 tests/unit/query/data/hidden/output/20240812-121000/empty.txt create mode 100644 tests/unit/query/data/non-numeric/output/20240812-120000/empty.txt create mode 100644 
tests/unit/query/data/non-numeric/output/20240812-121000/empty.txt create mode 100644 tests/unit/query/data/non-numeric/output/something-else/empty.txt create mode 100644 tests/unit/query/test_infer_data_dir.py diff --git a/.semversioner/next-release/patch-20240812232353903544.json b/.semversioner/next-release/patch-20240812232353903544.json new file mode 100644 index 00000000..733b4062 --- /dev/null +++ b/.semversioner/next-release/patch-20240812232353903544.json @@ -0,0 +1,4 @@ +{ + "type": "patch", + "description": "Improves filtering for data dir inferring" +} diff --git a/graphrag/query/cli.py b/graphrag/query/cli.py index 16cfe0c9..915807a3 100644 --- a/graphrag/query/cli.py +++ b/graphrag/query/cli.py @@ -4,7 +4,7 @@ """Command line interface for the query module.""" import asyncio -import os +import re from pathlib import Path from typing import cast @@ -129,7 +129,9 @@ def _infer_data_dir(root: str) -> str: output = Path(root) / "output" # use the latest data-run folder if output.exists(): - folders = sorted(output.iterdir(), key=os.path.getmtime, reverse=True) + expr = re.compile(r"\d{8}-\d{6}") + filtered = [f for f in output.iterdir() if f.is_dir() and expr.match(f.name)] + folders = sorted(filtered, key=lambda f: f.name, reverse=True) if len(folders) > 0: folder = folders[0] return str((folder / "artifacts").absolute()) diff --git a/tests/unit/query/__init__.py b/tests/unit/query/__init__.py new file mode 100644 index 00000000..0a3e38ad --- /dev/null +++ b/tests/unit/query/__init__.py @@ -0,0 +1,2 @@ +# Copyright (c) 2024 Microsoft Corporation. 
+# Licensed under the MIT License diff --git a/tests/unit/query/data/defaults/output/20240812-120000/empty.txt b/tests/unit/query/data/defaults/output/20240812-120000/empty.txt new file mode 100644 index 00000000..e69de29b diff --git a/tests/unit/query/data/defaults/output/20240812-121000/empty.txt b/tests/unit/query/data/defaults/output/20240812-121000/empty.txt new file mode 100644 index 00000000..e69de29b diff --git a/tests/unit/query/data/empty/something-else/empty.txt b/tests/unit/query/data/empty/something-else/empty.txt new file mode 100644 index 00000000..e69de29b diff --git a/tests/unit/query/data/hidden/output/.another/empty.txt b/tests/unit/query/data/hidden/output/.another/empty.txt new file mode 100644 index 00000000..e69de29b diff --git a/tests/unit/query/data/hidden/output/.hidden b/tests/unit/query/data/hidden/output/.hidden new file mode 100644 index 00000000..e69de29b diff --git a/tests/unit/query/data/hidden/output/20240812-120000/empty.txt b/tests/unit/query/data/hidden/output/20240812-120000/empty.txt new file mode 100644 index 00000000..e69de29b diff --git a/tests/unit/query/data/hidden/output/20240812-121000/empty.txt b/tests/unit/query/data/hidden/output/20240812-121000/empty.txt new file mode 100644 index 00000000..e69de29b diff --git a/tests/unit/query/data/non-numeric/output/20240812-120000/empty.txt b/tests/unit/query/data/non-numeric/output/20240812-120000/empty.txt new file mode 100644 index 00000000..e69de29b diff --git a/tests/unit/query/data/non-numeric/output/20240812-121000/empty.txt b/tests/unit/query/data/non-numeric/output/20240812-121000/empty.txt new file mode 100644 index 00000000..e69de29b diff --git a/tests/unit/query/data/non-numeric/output/something-else/empty.txt b/tests/unit/query/data/non-numeric/output/something-else/empty.txt new file mode 100644 index 00000000..e69de29b diff --git a/tests/unit/query/test_infer_data_dir.py b/tests/unit/query/test_infer_data_dir.py new file mode 100644 index 00000000..c950eb35 --- 
/dev/null +++ b/tests/unit/query/test_infer_data_dir.py @@ -0,0 +1,32 @@ +# Copyright (c) 2024 Microsoft Corporation. +# Licensed under the MIT License +from pathlib import Path + +import pytest + +from graphrag.query.cli import _infer_data_dir + + +def test_infer_data_dir(): + root = "./tests/unit/query/data/defaults" + result = Path(_infer_data_dir(root)) + assert result.parts[-2] == "20240812-121000" + + +def test_infer_data_dir_ignores_hidden_files(): + """A hidden entry, starting with '.', must not be selected as the latest data directory.""" + root = "./tests/unit/query/data/hidden" + result = Path(_infer_data_dir(root)) + assert result.parts[-2] == "20240812-121000" + + +def test_infer_data_dir_ignores_non_numeric(): + root = "./tests/unit/query/data/non-numeric" + result = Path(_infer_data_dir(root)) + assert result.parts[-2] == "20240812-121000" + + +def test_infer_data_dir_throws_on_no_match(): + root = "./tests/unit/query/data/empty" + with pytest.raises(ValueError): # noqa PT011 (this is what is actually thrown...) + _infer_data_dir(root)