Add /documents/get_by_filters endpoint (#1580)

* Add endpoint to get documents by filter

* Add test for /documents/get_by_filters and extend the delete documents test

* Add rest_api/file-upload to .gitignore

* Make sure the document store is empty for each test

* Improve docstrings of delete_documents_by_filters and get_documents_by_filters

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
This commit is contained in:
Sara Zan 2021-10-12 10:53:54 +02:00 committed by GitHub
parent bc7167a96c
commit 6354528336
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 148 additions and 24 deletions

5
.gitignore vendored
View File

@ -143,7 +143,7 @@ dmypy.json
# haystack files
haystack/document_store/qa.db
data
mlruns
**/mlruns/**
src
tutorials/cache
tutorials/mlruns
@ -151,5 +151,6 @@ tutorials/model
models
saved_models
*_build
rest_api/file-upload/*
**/feedback_squad_direct.json
.DS_Store

View File

@ -279,7 +279,7 @@ Extract text from a .pdf file using the pdftotext library (https://www.xpdfreade
others if your doc contains special characters (e.g. German Umlauts, Cyrillic characters ...).
Note: With "UTF-8" we experienced cases, where a simple "fi" gets wrongly parsed as
"xef\xac\x81c" (see test cases). That's why we keep "Latin 1" as default here.
(See list of available encodings by running `pdftotext -listencodings` in the terminal)
(See list of available encodings by running `pdftotext -listenc` in the terminal)
<a name="pdf.PDFToTextOCRConverter"></a>
## PDFToTextOCRConverter Objects

View File

@ -137,7 +137,7 @@ If batch_size is set to None, this method will yield all documents and labels.
#### convert\_files\_to\_dicts
```python
convert_files_to_dicts(dir_path: str, clean_func: Optional[Callable] = None, split_paragraphs: bool = False) -> List[dict]
convert_files_to_dicts(dir_path: str, clean_func: Optional[Callable] = None, split_paragraphs: bool = False, encoding: Optional[str] = None) -> List[dict]
```
Convert all files (.txt, .pdf, .docx) in the sub-directories of the given path to Python dicts that can be written to a
@ -148,6 +148,7 @@ Document Store.
- `dir_path`: path for the documents to be written to the DocumentStore
- `clean_func`: a custom cleaning function that gets applied to each doc (input: str, output:str)
- `split_paragraphs`: split text in paragraphs.
- `encoding`: character encoding to use when converting pdf documents.
**Returns**:

View File

@ -70,6 +70,8 @@ fetch_archive_from_http(url=s3_url, output_dir=doc_dir)
Haystack's converter classes are designed to help you turn files on your computer into the documents
that can be processed by the Haystack pipeline.
There are file converters for txt, pdf, docx files as well as a converter that is powered by Apache Tika.
The parameter `valid_languages` does not convert files to the target language, but checks if the conversion worked as expected.
For converting PDFs, try changing the encoding to UTF-8 if the conversion isn't great.
```python

View File

@ -1,10 +1,13 @@
from typing import List
import logging
from fastapi import APIRouter
from haystack import Document
from rest_api.controller.search import DOCUMENT_STORE
from rest_api.config import LOG_LEVEL
from rest_api.schema import FilterRequest
from rest_api.schema import FilterRequest, DocumentResponse
logging.getLogger("haystack").setLevel(LOG_LEVEL)
@ -14,13 +17,31 @@ logger = logging.getLogger("haystack")
router = APIRouter()
@router.post("/documents/get_by_filters", response_model=List[DocumentResponse])
def get_documents_by_filter(filters: FilterRequest):
    """
    Return all documents from the document store that match the given filters.

    :param filters: Filters to narrow down the documents to return.
                    Example: '{"filters": {"name": ["some", "more"], "category": ["only_one"]}}'
                    To get all documents you should provide an empty dict, like:
                    '{"filters": {}}'
    """
    docs = [doc.to_dict() for doc in DOCUMENT_STORE.get_all_documents(filters=filters.filters)]
    for doc in docs:
        # Strip the embedding: it can be a large vector and DocumentResponse
        # does not expose an embedding field.
        del doc["embedding"]
    return [DocumentResponse(**doc) for doc in docs]
@router.post("/documents/delete_by_filters", response_model=bool)
def delete_documents_by_filter(filters: FilterRequest):
    """
    Delete all documents from the document store that match the given filters.

    :param filters: Filters to narrow down the documents to delete.
                    Example: '{"filters": {"name": ["some", "more"], "category": ["only_one"]}}'
                    To delete all documents you should provide an empty dict, like:
                    '{"filters": {}}'
    :return: always True; failures in the underlying store surface as exceptions.
    """
    DOCUMENT_STORE.delete_documents(filters=filters.filters)
    return True

View File

@ -1,5 +1,6 @@
from typing import Dict, List, Optional, Union, Any
from pydantic import BaseModel, Field
from haystack import Document
class QueryRequest(BaseModel):
@ -30,6 +31,16 @@ class QueryResponse(BaseModel):
answers: List[QueryAnswer]
class DocumentResponse(BaseModel):
    """A single document as returned by the /documents/get_by_filters endpoint."""

    # Raw text content of the document; the only required field.
    text: str
    id: Optional[str] = None
    score: Optional[float] = None
    question: Optional[str] = None
    # Arbitrary user-supplied metadata (e.g. {"name": ..., "meta_key": ...}).
    # Annotated Optional because the default is None, not an empty dict.
    meta: Optional[Dict[str, Any]] = None
    # NOTE: no embedding field — the endpoint strips embeddings before
    # constructing this model, as they can be large vectors.
    id_hash_keys: Optional[List[str]] = None
class ExtractiveQAFeedback(BaseModel):
question: str = Field(..., description="The question input by the user, i.e., the query.")
is_correct_answer: bool = Field(..., description="Whether the answer is correct or not.")

View File

@ -55,3 +55,15 @@ pipelines:
inputs: [Preprocessor]
- name: DocumentStore
inputs: [ESRetriever]
- name: indexing_text_pipeline
type: Pipeline
nodes:
- name: TextConverter
inputs: [File]
- name: Preprocessor
inputs: [TextConverter]
- name: ESRetriever
inputs: [Preprocessor]
- name: DocumentStore
inputs: [ESRetriever]

View File

@ -7,33 +7,109 @@ from fastapi.testclient import TestClient
from rest_api.application import app
# NOTE(review): this span is a diff rendering with old and new lines
# interleaved (two fixture decorators, a 'return' alongside a 'yield',
# and two versions of populated_client). Only the second variant of each
# fixture is the current code — confirm against the applied commit.
@pytest.fixture(scope="session")
@pytest.fixture
def client() -> TestClient:
os.environ["PIPELINE_YAML_PATH"] = str((Path(__file__).parent / "samples"/"pipeline"/"test_pipeline.yaml").absolute())
os.environ["QUERY_PIPELINE_NAME"] = "query_pipeline"
os.environ["INDEXING_PIPELINE_NAME"] = "indexing_pipeline"
return TestClient(app)
@pytest.fixture(scope="session")
def populated_client(client: TestClient) -> TestClient:
file_to_upload = {'files': (Path(__file__).parent / "samples"/"pdf"/"sample_pdf_1.pdf").open('rb')}
client.post(url="/file-upload", files=file_to_upload, data={"meta": '{"meta_key": "meta_value"}'})
os.environ["INDEXING_PIPELINE_NAME"] = "indexing_text_pipeline"
client = TestClient(app)
yield client
client.post(url="/documents/delete_by_filters", data={"meta_key": ["meta_value"]})
# Clean up
client.post(url="/documents/delete_by_filters", data='{"filters": {}}')
@pytest.fixture
def populated_client(client: TestClient) -> TestClient:
# Empty the store first so the fixture always yields exactly two documents.
client.post(url="/documents/delete_by_filters", data='{"filters": {}}')
files_to_upload = [
{'files': (Path(__file__).parent / "samples"/"pdf"/"sample_pdf_1.pdf").open('rb')},
{'files': (Path(__file__).parent / "samples"/"pdf"/"sample_pdf_2.pdf").open('rb')}
]
for index, fi in enumerate(files_to_upload):
response = client.post(url="/file-upload", files=fi, data={"meta": f'{{"meta_key": "meta_value", "meta_index": "{index}"}}'})
assert 200 == response.status_code
yield client
# Tear down: remove everything the fixture uploaded.
client.post(url="/documents/delete_by_filters", data='{"filters": {}}')
def test_get_documents():
    """
    End-to-end check of /documents/get_by_filters: upload two text files,
    retrieve them by metadata filter, and verify names and metadata.
    """
    os.environ["PIPELINE_YAML_PATH"] = str((Path(__file__).parent / "samples"/"pipeline"/"test_pipeline.yaml").absolute())
    os.environ["INDEXING_PIPELINE_NAME"] = "indexing_text_pipeline"
    client = TestClient(app)
    # Clean up to make sure the docstore is empty
    client.post(url="/documents/delete_by_filters", data='{"filters": {}}')
    # Upload the files
    files_to_upload = [
        {'files': (Path(__file__).parent / "samples"/"docs"/"doc_1.txt").open('rb')},
        {'files': (Path(__file__).parent / "samples"/"docs"/"doc_2.txt").open('rb')}
    ]
    for fi in files_to_upload:
        # Both uploads share the same meta_key so a single filter finds them both.
        # (Plain string literal: the original f-string had no placeholders.)
        response = client.post(url="/file-upload", files=fi, data={"meta": '{"meta_key": "meta_value_get"}'})
        assert 200 == response.status_code
    # Get the documents
    response = client.post(url="/documents/get_by_filters", data='{"filters": {"meta_key": ["meta_value_get"]}}')
    assert 200 == response.status_code
    response_json = response.json()
    # Make sure the right docs are found
    assert len(response_json) == 2
    names = [doc["meta"]["name"] for doc in response_json]
    assert "doc_1.txt" in names
    assert "doc_2.txt" in names
    meta_keys = [doc["meta"]["meta_key"] for doc in response_json]
    assert all("meta_value_get" == meta_key for meta_key in meta_keys)
def test_delete_documents():
    """
    Upload two documents, delete one of them via a meta_index filter, and
    verify that only the other remains retrievable.
    """
    os.environ["PIPELINE_YAML_PATH"] = str((Path(__file__).parent / "samples"/"pipeline"/"test_pipeline.yaml").absolute())
    os.environ["INDEXING_PIPELINE_NAME"] = "indexing_text_pipeline"
    client = TestClient(app)
    # Start from an empty document store.
    client.post(url="/documents/delete_by_filters", data='{"filters": {}}')
    # Index two sample text documents, tagging each with its upload position.
    sample_files = [
        {'files': (Path(__file__).parent / "samples"/"docs"/"doc_1.txt").open('rb')},
        {'files': (Path(__file__).parent / "samples"/"docs"/"doc_2.txt").open('rb')}
    ]
    for position, payload in enumerate(sample_files):
        upload = client.post(url="/file-upload", files=payload, data={"meta": f'{{"meta_key": "meta_value_del", "meta_index": "{position}"}}'})
        assert upload.status_code == 200
    # Both documents should be present before the deletion.
    listing = client.post(url="/documents/get_by_filters", data='{"filters": {"meta_key": ["meta_value_del"]}}')
    assert listing.status_code == 200
    assert len(listing.json()) == 2
    # Remove only the first document (meta_index == "0").
    deletion = client.post(url="/documents/delete_by_filters", data='{"filters": {"meta_index": ["0"]}}')
    assert deletion.status_code == 200
    # Exactly one document should remain.
    listing = client.post(url="/documents/get_by_filters", data='{"filters": {"meta_key": ["meta_value_del"]}}')
    assert listing.status_code == 200
    assert len(listing.json()) == 1
    # The deleted document is gone...
    listing = client.post(url="/documents/get_by_filters", data='{"filters": {"meta_index": ["0"]}}')
    assert listing.status_code == 200
    assert len(listing.json()) == 0
    # ...and the second one survived.
    listing = client.post(url="/documents/get_by_filters", data='{"filters": {"meta_index": ["1"]}}')
    assert listing.status_code == 200
    assert len(listing.json()) == 1
def test_file_upload(client: TestClient):
    """Uploading a sample PDF through /file-upload should return HTTP 200."""
    pdf_payload = {'files': (Path(__file__).parent / "samples"/"pdf"/"sample_pdf_1.pdf").open('rb')}
    result = client.post(url="/file-upload", files=pdf_payload, data={"meta": '{"meta_key": "meta_value"}'})
    assert result.status_code == 200
# NOTE(review): diff-interleaved remnant — the two delete_by_filters calls
# below are the old and new versions of the same line; only the final
# JSON-string form ('{"filters": {}}') is the current code. This fixture-based
# test appears superseded by the standalone test_delete_documents above —
# confirm against the applied commit.
def test_delete_documents(client: TestClient):
file_to_upload = {'files': (Path(__file__).parent / "samples"/"pdf"/"sample_pdf_1.pdf").open('rb')}
response = client.post(url="/file-upload", files=file_to_upload, data={"meta": '{"meta_key": "meta_value"}'})
client.post(url="/documents/delete_by_filters", data={"meta_key": ["meta_value"]})
assert 200 == response.status_code
client.post(url="/documents/delete_by_filters", data='{"filters": {}}')
def test_query_with_no_filter(populated_client: TestClient):
query_with_no_filter_value = {"query": "Who made the PDF specification?"}