mirror of
https://github.com/deepset-ai/haystack.git
synced 2026-01-06 03:57:19 +00:00
Add /documents/get_by_filters endpoint (#1580)
* Add endpoint to get documents by filter * Add test for /documents/get_by_filter and extend the delete documents test * Add rest_api/file-upload to .gitignore * Make sure the document store is empty for each test * Improve docstrings of delete_documents_by_filters and get_documents_by_filters Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
This commit is contained in:
parent
bc7167a96c
commit
6354528336
5
.gitignore
vendored
5
.gitignore
vendored
@ -143,7 +143,7 @@ dmypy.json
|
||||
# haystack files
|
||||
haystack/document_store/qa.db
|
||||
data
|
||||
mlruns
|
||||
**/mlruns/**
|
||||
src
|
||||
tutorials/cache
|
||||
tutorials/mlruns
|
||||
@ -151,5 +151,6 @@ tutorials/model
|
||||
models
|
||||
saved_models
|
||||
*_build
|
||||
|
||||
rest_api/file-upload/*
|
||||
**/feedback_squad_direct.json
|
||||
.DS_Store
|
||||
|
||||
@ -279,7 +279,7 @@ Extract text from a .pdf file using the pdftotext library (https://www.xpdfreade
|
||||
others if your doc contains special characters (e.g. German Umlauts, Cyrillic characters ...).
|
||||
Note: With "UTF-8" we experienced cases, where a simple "fi" gets wrongly parsed as
|
||||
"xef\xac\x81c" (see test cases). That's why we keep "Latin 1" as default here.
|
||||
(See list of available encodings by running `pdftotext -listencodings` in the terminal)
|
||||
(See list of available encodings by running `pdftotext -listenc` in the terminal)
|
||||
|
||||
<a name="pdf.PDFToTextOCRConverter"></a>
|
||||
## PDFToTextOCRConverter Objects
|
||||
|
||||
@ -137,7 +137,7 @@ If batch_size is set to None, this method will yield all documents and labels.
|
||||
#### convert\_files\_to\_dicts
|
||||
|
||||
```python
|
||||
convert_files_to_dicts(dir_path: str, clean_func: Optional[Callable] = None, split_paragraphs: bool = False) -> List[dict]
|
||||
convert_files_to_dicts(dir_path: str, clean_func: Optional[Callable] = None, split_paragraphs: bool = False, encoding: Optional[str] = None) -> List[dict]
|
||||
```
|
||||
|
||||
Convert all files (.txt, .pdf, .docx) in the sub-directories of the given path to Python dicts that can be written to a
|
||||
@ -148,6 +148,7 @@ Document Store.
|
||||
- `dir_path`: path for the documents to be written to the DocumentStore
|
||||
- `clean_func`: a custom cleaning function that gets applied to each doc (input: str, output:str)
|
||||
- `split_paragraphs`: split text in paragraphs.
|
||||
- `encoding`: character encoding to use when converting pdf documents.
|
||||
|
||||
**Returns**:
|
||||
|
||||
|
||||
@ -70,6 +70,8 @@ fetch_archive_from_http(url=s3_url, output_dir=doc_dir)
|
||||
Haystack's converter classes are designed to help you turn files on your computer into the documents
|
||||
that can be processed by the Haystack pipeline.
|
||||
There are file converters for txt, pdf, docx files as well as a converter that is powered by Apache Tika.
|
||||
The parameter `valid_languages` does not convert files to the target language, but checks if the conversion worked as expected.
|
||||
For converting PDFs, try changing the encoding to UTF-8 if the conversion isn't great.
|
||||
|
||||
|
||||
```python
|
||||
|
||||
@ -1,10 +1,13 @@
|
||||
from typing import List
|
||||
|
||||
import logging
|
||||
|
||||
from fastapi import APIRouter
|
||||
from haystack import Document
|
||||
|
||||
from rest_api.controller.search import DOCUMENT_STORE
|
||||
from rest_api.config import LOG_LEVEL
|
||||
from rest_api.schema import FilterRequest
|
||||
from rest_api.schema import FilterRequest, DocumentResponse
|
||||
|
||||
|
||||
logging.getLogger("haystack").setLevel(LOG_LEVEL)
|
||||
@ -14,13 +17,31 @@ logger = logging.getLogger("haystack")
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
@router.post("/documents/get_by_filters", response_model=List[DocumentResponse])
def get_documents_by_filter(filters: FilterRequest):
    """
    Can be used to get documents from a document store.

    :param filters: Filters to narrow down the documents to get.
                    Example: '{"filters": {"name": ["some", "more"], "category": ["only_one"]}}'
                    To get all documents you should provide an empty dict, like:
                    '{"filters": {}}'
    :return: One DocumentResponse per matching document, without the embedding.
    """
    responses = []
    for document in DOCUMENT_STORE.get_all_documents(filters=filters.filters):
        fields = document.to_dict()
        # The response model has no `embedding` field, so drop it before building
        # the response. `pop` with a default tolerates documents whose dict has
        # no such key, where `del` would raise a KeyError.
        fields.pop("embedding", None)
        responses.append(DocumentResponse(**fields))
    return responses
|
||||
|
||||
|
||||
@router.post("/documents/delete_by_filters", response_model=bool)
def delete_documents_by_filter(filters: FilterRequest):
    """
    Can be used to delete documents from a document store.

    :param filters: Filters to narrow down the documents to delete.
                    Example: '{"filters": {"name": ["some", "more"], "category": ["only_one"]}}'
                    To delete all documents you should provide an empty dict, like:
                    '{"filters": {}}'
    :return: Always True once the delete call on the document store has returned.
    """
    DOCUMENT_STORE.delete_documents(filters=filters.filters)
    return True
|
||||
@ -1,5 +1,6 @@
|
||||
from typing import Dict, List, Optional, Union, Any
|
||||
from pydantic import BaseModel, Field
|
||||
from haystack import Document
|
||||
|
||||
|
||||
class QueryRequest(BaseModel):
|
||||
@ -30,6 +31,16 @@ class QueryResponse(BaseModel):
|
||||
answers: List[QueryAnswer]
|
||||
|
||||
|
||||
class DocumentResponse(BaseModel):
    # Response schema for a single document returned by the REST API.
    # Only `text` is required; every other field defaults to None.
    text: str
    id: Optional[str] = None
    score: Optional[float] = None
    question: Optional[str] = None
    # Explicitly Optional so the annotation agrees with the None default,
    # like the other optional fields of this model.
    meta: Optional[Dict[str, Any]] = None
    # NOTE(review): `embedding` is deliberately not exposed here; presumably
    # because a numpy array is not directly JSON-serializable - confirm before
    # adding it back.
    id_hash_keys: Optional[List[str]] = None
|
||||
|
||||
|
||||
class ExtractiveQAFeedback(BaseModel):
|
||||
question: str = Field(..., description="The question input by the user, i.e., the query.")
|
||||
is_correct_answer: bool = Field(..., description="Whether the answer is correct or not.")
|
||||
|
||||
@ -55,3 +55,15 @@ pipelines:
|
||||
inputs: [Preprocessor]
|
||||
- name: DocumentStore
|
||||
inputs: [ESRetriever]
|
||||
|
||||
- name: indexing_text_pipeline
|
||||
type: Pipeline
|
||||
nodes:
|
||||
- name: TextConverter
|
||||
inputs: [File]
|
||||
- name: Preprocessor
|
||||
inputs: [TextConverter]
|
||||
- name: ESRetriever
|
||||
inputs: [Preprocessor]
|
||||
- name: DocumentStore
|
||||
inputs: [ESRetriever]
|
||||
|
||||
@ -7,33 +7,109 @@ from fastapi.testclient import TestClient
|
||||
from rest_api.application import app
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
@pytest.fixture
|
||||
def client() -> TestClient:
|
||||
os.environ["PIPELINE_YAML_PATH"] = str((Path(__file__).parent / "samples"/"pipeline"/"test_pipeline.yaml").absolute())
|
||||
os.environ["QUERY_PIPELINE_NAME"] = "query_pipeline"
|
||||
os.environ["INDEXING_PIPELINE_NAME"] = "indexing_pipeline"
|
||||
return TestClient(app)
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def populated_client(client: TestClient) -> TestClient:
|
||||
file_to_upload = {'files': (Path(__file__).parent / "samples"/"pdf"/"sample_pdf_1.pdf").open('rb')}
|
||||
client.post(url="/file-upload", files=file_to_upload, data={"meta": '{"meta_key": "meta_value"}'})
|
||||
os.environ["INDEXING_PIPELINE_NAME"] = "indexing_text_pipeline"
|
||||
client = TestClient(app)
|
||||
yield client
|
||||
client.post(url="/documents/delete_by_filters", data={"meta_key": ["meta_value"]})
|
||||
# Clean up
|
||||
client.post(url="/documents/delete_by_filters", data='{"filters": {}}')
|
||||
|
||||
|
||||
@pytest.fixture
def populated_client(client: TestClient) -> TestClient:
    """
    Yield the test client after uploading the two sample PDFs.

    The document store is emptied before the upload and again after the test
    that uses this fixture has finished.
    """
    # Start from an empty document store
    client.post(url="/documents/delete_by_filters", data='{"filters": {}}')

    pdf_dir = Path(__file__).parent / "samples" / "pdf"
    for index, file_name in enumerate(("sample_pdf_1.pdf", "sample_pdf_2.pdf")):
        # Open each file in a context manager so the handle is closed after the upload
        with (pdf_dir / file_name).open('rb') as pdf_file:
            response = client.post(
                url="/file-upload",
                files={'files': pdf_file},
                data={"meta": f'{{"meta_key": "meta_value", "meta_index": "{index}"}}'},
            )
        assert 200 == response.status_code

    yield client

    # Clean up
    client.post(url="/documents/delete_by_filters", data='{"filters": {}}')
|
||||
|
||||
|
||||
def test_get_documents():
    """
    Upload two text files sharing one meta value and check that
    /documents/get_by_filters returns exactly those two documents.
    """
    os.environ["PIPELINE_YAML_PATH"] = str((Path(__file__).parent / "samples"/"pipeline"/"test_pipeline.yaml").absolute())
    os.environ["INDEXING_PIPELINE_NAME"] = "indexing_text_pipeline"
    client = TestClient(app)

    # Clean up to make sure the docstore is empty
    client.post(url="/documents/delete_by_filters", data='{"filters": {}}')

    # Upload the files; each handle is closed as soon as its upload is done
    docs_dir = Path(__file__).parent / "samples" / "docs"
    for file_name in ("doc_1.txt", "doc_2.txt"):
        with (docs_dir / file_name).open('rb') as doc_file:
            response = client.post(
                url="/file-upload",
                files={'files': doc_file},
                data={"meta": '{"meta_key": "meta_value_get"}'},
            )
        assert 200 == response.status_code

    # Get the documents
    response = client.post(url="/documents/get_by_filters", data='{"filters": {"meta_key": ["meta_value_get"]}}')
    assert 200 == response.status_code
    response_json = response.json()

    # Make sure the right docs are found
    assert len(response_json) == 2
    names = [doc["meta"]["name"] for doc in response_json]
    assert "doc_1.txt" in names
    assert "doc_2.txt" in names
    meta_keys = [doc["meta"]["meta_key"] for doc in response_json]
    assert all("meta_value_get" == meta_key for meta_key in meta_keys)
|
||||
|
||||
|
||||
def test_delete_documents():
    """
    Upload two text files with distinct `meta_index` values, delete the one
    with index "0" through /documents/delete_by_filters, and check that only
    the document with index "1" is left.
    """
    os.environ["PIPELINE_YAML_PATH"] = str((Path(__file__).parent / "samples"/"pipeline"/"test_pipeline.yaml").absolute())
    os.environ["INDEXING_PIPELINE_NAME"] = "indexing_text_pipeline"
    client = TestClient(app)

    # Clean up to make sure the docstore is empty
    client.post(url="/documents/delete_by_filters", data='{"filters": {}}')

    # Upload the files; each handle is closed as soon as its upload is done
    docs_dir = Path(__file__).parent / "samples" / "docs"
    for index, file_name in enumerate(("doc_1.txt", "doc_2.txt")):
        with (docs_dir / file_name).open('rb') as doc_file:
            response = client.post(
                url="/file-upload",
                files={'files': doc_file},
                data={"meta": f'{{"meta_key": "meta_value_del", "meta_index": "{index}"}}'},
            )
        assert 200 == response.status_code

    # Make sure there are two docs
    response = client.post(url="/documents/get_by_filters", data='{"filters": {"meta_key": ["meta_value_del"]}}')
    assert 200 == response.status_code
    response_json = response.json()
    assert len(response_json) == 2

    # Delete one doc
    response = client.post(url="/documents/delete_by_filters", data='{"filters": {"meta_index": ["0"]}}')
    assert 200 == response.status_code

    # Now there should be only one doc
    response = client.post(url="/documents/get_by_filters", data='{"filters": {"meta_key": ["meta_value_del"]}}')
    assert 200 == response.status_code
    response_json = response.json()
    assert len(response_json) == 1

    # Make sure the right doc was deleted
    response = client.post(url="/documents/get_by_filters", data='{"filters": {"meta_index": ["0"]}}')
    assert 200 == response.status_code
    response_json = response.json()
    assert len(response_json) == 0
    response = client.post(url="/documents/get_by_filters", data='{"filters": {"meta_index": ["1"]}}')
    assert 200 == response.status_code
    response_json = response.json()
    assert len(response_json) == 1
|
||||
|
||||
def test_file_upload(client: TestClient):
|
||||
file_to_upload = {'files': (Path(__file__).parent / "samples"/"pdf"/"sample_pdf_1.pdf").open('rb')}
|
||||
response = client.post(url="/file-upload", files=file_to_upload, data={"meta": '{"meta_key": "meta_value"}'})
|
||||
assert 200 == response.status_code
|
||||
|
||||
def test_delete_documents(client: TestClient):
|
||||
file_to_upload = {'files': (Path(__file__).parent / "samples"/"pdf"/"sample_pdf_1.pdf").open('rb')}
|
||||
response = client.post(url="/file-upload", files=file_to_upload, data={"meta": '{"meta_key": "meta_value"}'})
|
||||
|
||||
client.post(url="/documents/delete_by_filters", data={"meta_key": ["meta_value"]})
|
||||
assert 200 == response.status_code
|
||||
client.post(url="/documents/delete_by_filters", data='{"filters": {}}')
|
||||
|
||||
def test_query_with_no_filter(populated_client: TestClient):
|
||||
query_with_no_filter_value = {"query": "Who made the PDF specification?"}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user