Add /documents/get_by_filters endpoint (#1580)

* Add endpoint to get documents by filter

* Add test for /documents/get_by_filters and extend the delete documents test

* Add rest_api/file-upload to .gitignore

* Make sure the document store is empty for each test

* Improve docstrings of delete_documents_by_filters and get_documents_by_filters

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
This commit is contained in:
Sara Zan 2021-10-12 10:53:54 +02:00 committed by GitHub
parent bc7167a96c
commit 6354528336
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 148 additions and 24 deletions

5
.gitignore vendored
View File

@ -143,7 +143,7 @@ dmypy.json
# haystack files
haystack/document_store/qa.db
data
mlruns
**/mlruns/**
src
tutorials/cache
tutorials/mlruns
@ -151,5 +151,6 @@ tutorials/model
models
saved_models
*_build
rest_api/file-upload/*
**/feedback_squad_direct.json
.DS_Store

View File

@ -279,7 +279,7 @@ Extract text from a .pdf file using the pdftotext library (https://www.xpdfreade
others if your doc contains special characters (e.g. German Umlauts, Cyrillic characters ...).
Note: With "UTF-8" we experienced cases, where a simple "fi" gets wrongly parsed as
"xef\xac\x81c" (see test cases). That's why we keep "Latin 1" as default here.
(See list of available encodings by running `pdftotext -listencodings` in the terminal)
(See list of available encodings by running `pdftotext -listenc` in the terminal)
<a name="pdf.PDFToTextOCRConverter"></a>
## PDFToTextOCRConverter Objects

View File

@ -137,7 +137,7 @@ If batch_size is set to None, this method will yield all documents and labels.
#### convert\_files\_to\_dicts
```python
convert_files_to_dicts(dir_path: str, clean_func: Optional[Callable] = None, split_paragraphs: bool = False) -> List[dict]
convert_files_to_dicts(dir_path: str, clean_func: Optional[Callable] = None, split_paragraphs: bool = False, encoding: Optional[str] = None) -> List[dict]
```
Convert all files (.txt, .pdf, .docx) in the sub-directories of the given path to Python dicts that can be written to a
@ -148,6 +148,7 @@ Document Store.
- `dir_path`: path for the documents to be written to the DocumentStore
- `clean_func`: a custom cleaning function that gets applied to each doc (input: str, output:str)
- `split_paragraphs`: split text in paragraphs.
- `encoding`: character encoding to use when converting pdf documents.
**Returns**:

View File

@ -70,6 +70,8 @@ fetch_archive_from_http(url=s3_url, output_dir=doc_dir)
Haystack's converter classes are designed to help you turn files on your computer into the documents
that can be processed by the Haystack pipeline.
There are file converters for txt, pdf, docx files as well as a converter that is powered by Apache Tika.
The parameter `valid_languages` does not convert files to the target language, but checks if the conversion worked as expected.
For converting PDFs, try changing the encoding to UTF-8 if the conversion isn't great.
```python

View File

@ -1,10 +1,13 @@
from typing import List
import logging
from fastapi import APIRouter
from haystack import Document
from rest_api.controller.search import DOCUMENT_STORE
from rest_api.config import LOG_LEVEL
from rest_api.schema import FilterRequest
from rest_api.schema import FilterRequest, DocumentResponse
logging.getLogger("haystack").setLevel(LOG_LEVEL)
@ -14,13 +17,31 @@ logger = logging.getLogger("haystack")
router = APIRouter()
@router.post("/documents/get_by_filters", response_model=List[DocumentResponse])
def get_documents_by_filter(filters: FilterRequest):
    """
    Return all documents from the document store that match the given filters.

    :param filters: Filters to narrow down the documents to return.
                    Example: '{"filters": {"name": ["some", "more"], "category": ["only_one"]}}'
                    To get all documents you should provide an empty dict, like:
                    '{"filters": {}}'
    """
    docs = [doc.to_dict() for doc in DOCUMENT_STORE.get_all_documents(filters=filters.filters)]
    for doc in docs:
        # Strip the embedding: it can be a large vector and DocumentResponse
        # does not expose an embedding field.
        del doc["embedding"]
    return [DocumentResponse(**doc) for doc in docs]
@router.post("/documents/delete_by_filters", response_model=bool)
def delete_documents_by_filter(filters: FilterRequest):
    """
    Delete all documents from the document store that match the given filters.

    :param filters: Filters to narrow down the documents to delete.
                    Example: '{"filters": {"name": ["some", "more"], "category": ["only_one"]}}'
                    To delete all documents you should provide an empty dict, like:
                    '{"filters": {}}'
    :return: always True; failures in the underlying store surface as exceptions.
    """
    DOCUMENT_STORE.delete_documents(filters=filters.filters)
    return True

View File

@ -1,5 +1,6 @@
from typing import Dict, List, Optional, Union, Any
from pydantic import BaseModel, Field
from haystack import Document
class QueryRequest(BaseModel):
@ -30,6 +31,16 @@ class QueryResponse(BaseModel):
answers: List[QueryAnswer]
class DocumentResponse(BaseModel):
    """A single document as returned by the /documents/get_by_filters endpoint."""

    # Raw text content of the document; the only required field.
    text: str
    id: Optional[str] = None
    score: Optional[float] = None
    question: Optional[str] = None
    # Arbitrary user-supplied metadata (e.g. {"name": ..., "meta_key": ...}).
    # Annotated Optional because the default is None, not an empty dict.
    meta: Optional[Dict[str, Any]] = None
    # NOTE: no embedding field — the endpoint strips embeddings before
    # constructing this model, as they can be large vectors.
    id_hash_keys: Optional[List[str]] = None
class ExtractiveQAFeedback(BaseModel):
question: str = Field(..., description="The question input by the user, i.e., the query.")
is_correct_answer: bool = Field(..., description="Whether the answer is correct or not.")

View File

@ -55,3 +55,15 @@ pipelines:
inputs: [Preprocessor]
- name: DocumentStore
inputs: [ESRetriever]
- name: indexing_text_pipeline
type: Pipeline
nodes:
- name: TextConverter
inputs: [File]
- name: Preprocessor
inputs: [TextConverter]
- name: ESRetriever
inputs: [Preprocessor]
- name: DocumentStore
inputs: [ESRetriever]

View File

@ -7,33 +7,109 @@ from fastapi.testclient import TestClient
from rest_api.application import app
# NOTE(review): this span is a diff rendering with old and new lines
# interleaved (two fixture decorators, a 'return' alongside a 'yield',
# and two versions of populated_client). Only the second variant of each
# fixture is the current code — confirm against the applied commit.
@pytest.fixture(scope="session")
@pytest.fixture
def client() -> TestClient:
os.environ["PIPELINE_YAML_PATH"] = str((Path(__file__).parent / "samples"/"pipeline"/"test_pipeline.yaml").absolute())
os.environ["QUERY_PIPELINE_NAME"] = "query_pipeline"
os.environ["INDEXING_PIPELINE_NAME"] = "indexing_pipeline"
return TestClient(app)
@pytest.fixture(scope="session")
def populated_client(client: TestClient) -> TestClient:
file_to_upload = {'files': (Path(__file__).parent / "samples"/"pdf"/"sample_pdf_1.pdf").open('rb')}
client.post(url="/file-upload", files=file_to_upload, data={"meta": '{"meta_key": "meta_value"}'})
os.environ["INDEXING_PIPELINE_NAME"] = "indexing_text_pipeline"
client = TestClient(app)
yield client
client.post(url="/documents/delete_by_filters", data={"meta_key": ["meta_value"]})
# Clean up
client.post(url="/documents/delete_by_filters", data='{"filters": {}}')
@pytest.fixture
def populated_client(client: TestClient) -> TestClient:
# Empty the store first so the fixture always yields exactly two documents.
client.post(url="/documents/delete_by_filters", data='{"filters": {}}')
files_to_upload = [
{'files': (Path(__file__).parent / "samples"/"pdf"/"sample_pdf_1.pdf").open('rb')},
{'files': (Path(__file__).parent / "samples"/"pdf"/"sample_pdf_2.pdf").open('rb')}
]
for index, fi in enumerate(files_to_upload):
response = client.post(url="/file-upload", files=fi, data={"meta": f'{{"meta_key": "meta_value", "meta_index": "{index}"}}'})
assert 200 == response.status_code
yield client
# Tear down: remove everything the fixture uploaded.
client.post(url="/documents/delete_by_filters", data='{"filters": {}}')
def test_get_documents():
    """
    End-to-end check of /documents/get_by_filters: upload two text files,
    retrieve them by metadata filter, and verify names and metadata.
    """
    os.environ["PIPELINE_YAML_PATH"] = str((Path(__file__).parent / "samples"/"pipeline"/"test_pipeline.yaml").absolute())
    os.environ["INDEXING_PIPELINE_NAME"] = "indexing_text_pipeline"
    client = TestClient(app)
    # Clean up to make sure the docstore is empty
    client.post(url="/documents/delete_by_filters", data='{"filters": {}}')
    # Upload the files
    files_to_upload = [
        {'files': (Path(__file__).parent / "samples"/"docs"/"doc_1.txt").open('rb')},
        {'files': (Path(__file__).parent / "samples"/"docs"/"doc_2.txt").open('rb')}
    ]
    for fi in files_to_upload:
        # Both uploads share the same meta_key so a single filter finds them both.
        # (Plain string literal: the original f-string had no placeholders.)
        response = client.post(url="/file-upload", files=fi, data={"meta": '{"meta_key": "meta_value_get"}'})
        assert 200 == response.status_code
    # Get the documents
    response = client.post(url="/documents/get_by_filters", data='{"filters": {"meta_key": ["meta_value_get"]}}')
    assert 200 == response.status_code
    response_json = response.json()
    # Make sure the right docs are found
    assert len(response_json) == 2
    names = [doc["meta"]["name"] for doc in response_json]
    assert "doc_1.txt" in names
    assert "doc_2.txt" in names
    meta_keys = [doc["meta"]["meta_key"] for doc in response_json]
    assert all("meta_value_get" == meta_key for meta_key in meta_keys)
def test_delete_documents():
    """
    Upload two documents, delete one of them via a meta_index filter, and
    verify that only the other remains retrievable.
    """
    os.environ["PIPELINE_YAML_PATH"] = str((Path(__file__).parent / "samples"/"pipeline"/"test_pipeline.yaml").absolute())
    os.environ["INDEXING_PIPELINE_NAME"] = "indexing_text_pipeline"
    client = TestClient(app)
    # Start from an empty document store.
    client.post(url="/documents/delete_by_filters", data='{"filters": {}}')
    # Index two sample text documents, tagging each with its upload position.
    sample_files = [
        {'files': (Path(__file__).parent / "samples"/"docs"/"doc_1.txt").open('rb')},
        {'files': (Path(__file__).parent / "samples"/"docs"/"doc_2.txt").open('rb')}
    ]
    for position, payload in enumerate(sample_files):
        upload = client.post(url="/file-upload", files=payload, data={"meta": f'{{"meta_key": "meta_value_del", "meta_index": "{position}"}}'})
        assert upload.status_code == 200
    # Both documents should be present before the deletion.
    listing = client.post(url="/documents/get_by_filters", data='{"filters": {"meta_key": ["meta_value_del"]}}')
    assert listing.status_code == 200
    assert len(listing.json()) == 2
    # Remove only the first document (meta_index == "0").
    deletion = client.post(url="/documents/delete_by_filters", data='{"filters": {"meta_index": ["0"]}}')
    assert deletion.status_code == 200
    # Exactly one document should remain.
    listing = client.post(url="/documents/get_by_filters", data='{"filters": {"meta_key": ["meta_value_del"]}}')
    assert listing.status_code == 200
    assert len(listing.json()) == 1
    # The deleted document is gone...
    listing = client.post(url="/documents/get_by_filters", data='{"filters": {"meta_index": ["0"]}}')
    assert listing.status_code == 200
    assert len(listing.json()) == 0
    # ...and the second one survived.
    listing = client.post(url="/documents/get_by_filters", data='{"filters": {"meta_index": ["1"]}}')
    assert listing.status_code == 200
    assert len(listing.json()) == 1
def test_file_upload(client: TestClient):
    """Uploading a sample PDF through /file-upload should return HTTP 200."""
    pdf_payload = {'files': (Path(__file__).parent / "samples"/"pdf"/"sample_pdf_1.pdf").open('rb')}
    result = client.post(url="/file-upload", files=pdf_payload, data={"meta": '{"meta_key": "meta_value"}'})
    assert result.status_code == 200
# NOTE(review): diff-interleaved remnant — the two delete_by_filters calls
# below are the old and new versions of the same line; only the final
# JSON-string form ('{"filters": {}}') is the current code. This fixture-based
# test appears superseded by the standalone test_delete_documents above —
# confirm against the applied commit.
def test_delete_documents(client: TestClient):
file_to_upload = {'files': (Path(__file__).parent / "samples"/"pdf"/"sample_pdf_1.pdf").open('rb')}
response = client.post(url="/file-upload", files=file_to_upload, data={"meta": '{"meta_key": "meta_value"}'})
client.post(url="/documents/delete_by_filters", data={"meta_key": ["meta_value"]})
assert 200 == response.status_code
client.post(url="/documents/delete_by_filters", data='{"filters": {}}')
def test_query_with_no_filter(populated_client: TestClient):
query_with_no_filter_value = {"query": "Who made the PDF specification?"}