refactor: update dependencies and remove pins (#3147)

* refactor: remove azure-core, pydoc and hf-hub pins

* fix: remove extra-comma

* fix: force minimum version of azure forms recognizer

* refactor: allow newer ocr libs

* refactor: update more dependencies and container versions

* refactor: remove extra comment

* docs: pre-commit manual run

* refactor: remove unnecessary dependency

* tests: update weaviate container image version
Daniel Bichuetti 2022-09-05 09:30:35 -03:00 committed by GitHub
parent b07fcb7185
commit e1f399284f
29 changed files with 2117 additions and 403 deletions


@@ -39,7 +39,7 @@ done
# Run the containers
docker run -d -p 9200:9200 --name elasticsearch -e "discovery.type=single-node" -e "ES_JAVA_OPTS=-Xms128m -Xmx256m" elasticsearch:7.9.2
docker run -d -p 9998:9998 --name tika -e "TIKA_CHILD_JAVA_OPTS=-JXms128m" -e "TIKA_CHILD_JAVA_OPTS=-JXmx128m" apache/tika:1.24.1
docker run -d -p 9998:9998 --name tika -e "TIKA_CHILD_JAVA_OPTS=-JXms128m" -e "TIKA_CHILD_JAVA_OPTS=-JXmx128m" apache/tika:1.28.4
failed=""


@@ -656,7 +656,7 @@ jobs:
- name: Run Opensearch
run: |
docker run -d -p 9201:9200 -p 9600:9600 -e "discovery.type=single-node" opensearchproject/opensearch:1.2.4
docker run -d -p 9201:9200 -p 9600:9600 -e "discovery.type=single-node" opensearchproject/opensearch:1.3.5
- name: Run Milvus
run: |
@@ -672,7 +672,7 @@ jobs:
run: docker run -d -p 7200:7200 --name haystack_test_graphdb deepset/graphdb-free:9.4.1-adoptopenjdk11
- name: Run Apache Tika
run: docker run -d -p 9998:9998 -e "TIKA_CHILD_JAVA_OPTS=-JXms128m" -e "TIKA_CHILD_JAVA_OPTS=-JXmx128m" apache/tika:1.24.1
run: docker run -d -p 9998:9998 -e "TIKA_CHILD_JAVA_OPTS=-JXms128m" -e "TIKA_CHILD_JAVA_OPTS=-JXmx128m" apache/tika:1.28.4
- name: Run Parsr
run: docker run -d -p 3001:3001 axarev/parsr:v1.2.2


@@ -28,7 +28,7 @@ jobs:
run: docker run -d -p 9200:9200 -e "discovery.type=single-node" -e "ES_JAVA_OPTS=-Xms128m -Xmx256m" elasticsearch:7.9.2
- name: Run Apache Tika
run: docker run -d -p 9998:9998 -e "TIKA_CHILD_JAVA_OPTS=-JXms128m" -e "TIKA_CHILD_JAVA_OPTS=-JXmx128m" apache/tika:1.24.1
run: docker run -d -p 9998:9998 -e "TIKA_CHILD_JAVA_OPTS=-JXms128m" -e "TIKA_CHILD_JAVA_OPTS=-JXmx128m" apache/tika:1.28.4
- name: Run GraphDB
run: docker run -d -p 7200:7200 --name graphdb-instance-tutorial docker-registry.ontotext.com/graphdb-free:9.4.1-adoptopenjdk11


@@ -199,7 +199,7 @@ docker run -d -p 8080:8080 --name haystack_test_weaviate --env AUTHENTICATION_AN
docker run -d -p 7200:7200 --name haystack_test_graphdb deepset/graphdb-free:9.4.1-adoptopenjdk11
# Tika
docker run -d -p 9998:9998 -e "TIKA_CHILD_JAVA_OPTS=-JXms128m" -e "TIKA_CHILD_JAVA_OPTS=-JXmx128m" apache/tika:1.24.1
docker run -d -p 9998:9998 -e "TIKA_CHILD_JAVA_OPTS=-JXms128m" -e "TIKA_CHILD_JAVA_OPTS=-JXmx128m" apache/tika:1.28.4
```
Tests can also be run **individually**:


@@ -27,7 +27,17 @@ Crawl texts from a website so that we can use them later in Haystack as a corpus
#### Crawler.\_\_init\_\_
```python
def __init__(output_dir: str, urls: Optional[List[str]] = None, crawler_depth: int = 1, filter_urls: Optional[List] = None, overwrite_existing_files=True, id_hash_keys: Optional[List[str]] = None, extract_hidden_text=True, loading_wait_time: Optional[int] = None, crawler_naming_function: Optional[Callable[[str, str], str]] = None, webdriver_options: Optional[List[str]] = None)
def __init__(output_dir: str,
urls: Optional[List[str]] = None,
crawler_depth: int = 1,
filter_urls: Optional[List] = None,
overwrite_existing_files=True,
id_hash_keys: Optional[List[str]] = None,
extract_hidden_text=True,
loading_wait_time: Optional[int] = None,
crawler_naming_function: Optional[Callable[[str, str],
str]] = None,
webdriver_options: Optional[List[str]] = None)
```
Init object with basic params for crawling (can be overwritten later).
@@ -73,7 +83,17 @@ See [Chrome Web Driver Options](https://selenium-python.readthedocs.io/api.html#
#### Crawler.crawl
```python
def crawl(output_dir: Union[str, Path, None] = None, urls: Optional[List[str]] = None, crawler_depth: Optional[int] = None, filter_urls: Optional[List] = None, overwrite_existing_files: Optional[bool] = None, id_hash_keys: Optional[List[str]] = None, extract_hidden_text: Optional[bool] = None, loading_wait_time: Optional[int] = None, crawler_naming_function: Optional[Callable[[str, str], str]] = None) -> List[Path]
def crawl(
output_dir: Union[str, Path, None] = None,
urls: Optional[List[str]] = None,
crawler_depth: Optional[int] = None,
filter_urls: Optional[List] = None,
overwrite_existing_files: Optional[bool] = None,
id_hash_keys: Optional[List[str]] = None,
extract_hidden_text: Optional[bool] = None,
loading_wait_time: Optional[int] = None,
crawler_naming_function: Optional[Callable[[str, str], str]] = None
) -> List[Path]
```
Crawl URL(s), extract the text from the HTML, create a Haystack Document object out of it and save it (one JSON
@@ -116,7 +136,18 @@ List of paths where the crawled webpages got stored
#### Crawler.run
```python
def run(output_dir: Union[str, Path, None] = None, urls: Optional[List[str]] = None, crawler_depth: Optional[int] = None, filter_urls: Optional[List] = None, overwrite_existing_files: Optional[bool] = None, return_documents: Optional[bool] = False, id_hash_keys: Optional[List[str]] = None, extract_hidden_text: Optional[bool] = True, loading_wait_time: Optional[int] = None, crawler_naming_function: Optional[Callable[[str, str], str]] = None) -> Tuple[Dict[str, Union[List[Document], List[Path]]], str]
def run(
output_dir: Union[str, Path, None] = None,
urls: Optional[List[str]] = None,
crawler_depth: Optional[int] = None,
filter_urls: Optional[List] = None,
overwrite_existing_files: Optional[bool] = None,
return_documents: Optional[bool] = False,
id_hash_keys: Optional[List[str]] = None,
extract_hidden_text: Optional[bool] = True,
loading_wait_time: Optional[int] = None,
crawler_naming_function: Optional[Callable[[str, str], str]] = None
) -> Tuple[Dict[str, Union[List[Document], List[Path]]], str]
```
Method to be executed when the Crawler is used as a Node within a Haystack pipeline.
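For illustration, a minimal usage sketch based on the signatures above (the output directory and URL are placeholders, and a working Selenium/Chrome WebDriver setup is assumed):

```python
from haystack.nodes import Crawler

# Placeholder output directory and URL; the Crawler needs Selenium and a Chrome WebDriver.
crawler = Crawler(output_dir="crawled_files", crawler_depth=1)
file_paths = crawler.crawl(urls=["https://haystack.deepset.ai"])
print(file_paths)  # one JSON file per crawled page
```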


@@ -84,7 +84,19 @@ With this document_classifier, you can directly get predictions via predict()
#### TransformersDocumentClassifier.\_\_init\_\_
```python
def __init__(model_name_or_path: str = "bhadresh-savani/distilbert-base-uncased-emotion", model_version: Optional[str] = None, tokenizer: Optional[str] = None, use_gpu: bool = True, return_all_scores: bool = False, task: str = "text-classification", labels: Optional[List[str]] = None, batch_size: int = 16, classification_field: str = None, progress_bar: bool = True, use_auth_token: Optional[Union[str, bool]] = None, devices: Optional[List[Union[str, torch.device]]] = None)
def __init__(model_name_or_path:
str = "bhadresh-savani/distilbert-base-uncased-emotion",
model_version: Optional[str] = None,
tokenizer: Optional[str] = None,
use_gpu: bool = True,
return_all_scores: bool = False,
task: str = "text-classification",
labels: Optional[List[str]] = None,
batch_size: int = 16,
classification_field: str = None,
progress_bar: bool = True,
use_auth_token: Optional[Union[str, bool]] = None,
devices: Optional[List[Union[str, torch.device]]] = None)
```
Load a text classification model from Transformers.
@@ -132,7 +144,8 @@ parameter is not used and a single cpu device is used for inference.
#### TransformersDocumentClassifier.predict
```python
def predict(documents: List[Document], batch_size: Optional[int] = None) -> List[Document]
def predict(documents: List[Document],
batch_size: Optional[int] = None) -> List[Document]
```
Returns documents containing classification result in a meta field.
@@ -153,7 +166,10 @@ A list of Documents enriched with meta information.
#### TransformersDocumentClassifier.predict\_batch
```python
def predict_batch(documents: Union[List[Document], List[List[Document]]], batch_size: Optional[int] = None) -> Union[List[Document], List[List[Document]]]
def predict_batch(
documents: Union[List[Document], List[List[Document]]],
batch_size: Optional[int] = None
) -> Union[List[Document], List[List[Document]]]
```
Returns documents containing classification result in a meta field.
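A minimal sketch of the prediction API above, assuming the default emotion model shown in `__init__`:

```python
from haystack import Document
from haystack.nodes import TransformersDocumentClassifier

classifier = TransformersDocumentClassifier(batch_size=16)
docs = classifier.predict(documents=[Document(content="I'm very happy about this release!")])
print(docs[0].meta)  # the classification result is stored in a meta field
```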

File diff suppressed because it is too large


@@ -40,7 +40,9 @@ When False, correct retrieval is evaluated based on document_id.
#### EvalDocuments.run
```python
def run(documents: List[Document], labels: List[Label], top_k: Optional[int] = None)
def run(documents: List[Document],
labels: List[Label],
top_k: Optional[int] = None)
```
Run this node on one sample and its labels
@@ -78,7 +80,10 @@ Please use pipeline.eval() instead.
#### EvalAnswers.\_\_init\_\_
```python
def __init__(skip_incorrect_retrieval: bool = True, open_domain: bool = True, sas_model: str = None, debug: bool = False)
def __init__(skip_incorrect_retrieval: bool = True,
open_domain: bool = True,
sas_model: str = None,
debug: bool = False)
```
**Arguments**:
@@ -123,7 +128,15 @@ Print the evaluation results
#### semantic\_answer\_similarity
```python
def semantic_answer_similarity(predictions: List[List[str]], gold_labels: List[List[str]], sas_model_name_or_path: str = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2", batch_size: int = 32, use_gpu: bool = True, use_auth_token: Optional[Union[str, bool]] = None) -> Tuple[List[float], List[float], List[List[float]]]
def semantic_answer_similarity(
predictions: List[List[str]],
gold_labels: List[List[str]],
sas_model_name_or_path:
str = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
batch_size: int = 32,
use_gpu: bool = True,
use_auth_token: Optional[Union[str, bool]] = None
) -> Tuple[List[float], List[float], List[List[float]]]
```
Computes Transformer-based similarity of predicted answer to gold labels to derive a more meaningful metric than EM or F1.
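As a sketch of the function above (the import path follows this module's layout and may differ between versions):

```python
# Assumption: semantic_answer_similarity is importable from the evaluator module documented here.
from haystack.nodes.evaluator.evaluator import semantic_answer_similarity

top1_sas, top_k_sas, similarity_matrices = semantic_answer_similarity(
    predictions=[["Berlin is the capital of Germany"]],
    gold_labels=[["Berlin"]],
)
print(top1_sas, top_k_sas)
```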


@@ -39,7 +39,9 @@ parameter is not used and a single cpu device is used for inference.
#### EntityExtractor.run
```python
def run(documents: Optional[Union[List[Document], List[dict]]] = None) -> Tuple[Dict, str]
def run(
documents: Optional[Union[List[Document], List[dict]]] = None
) -> Tuple[Dict, str]
```
This is the method called when this node is used in a pipeline
@@ -59,7 +61,8 @@ This function can be called to perform entity extraction when using the node in
#### EntityExtractor.extract\_batch
```python
def extract_batch(texts: Union[List[str], List[List[str]]], batch_size: Optional[int] = None)
def extract_batch(texts: Union[List[str], List[List[str]]],
batch_size: Optional[int] = None)
```
This function allows you to extract entities from a list of strings or a list of lists of strings.
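For illustration, a small sketch using the default NER model (the sample text is made up):

```python
from haystack.nodes import EntityExtractor

extractor = EntityExtractor()  # downloads the default NER model on first use
entities = extractor.extract_batch(
    texts=["Haystack is developed by deepset in Berlin."],
    batch_size=8,
)
print(entities)  # one list of extracted entities per input string
```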


@@ -33,7 +33,8 @@ Lists with duplicate elements are not allowed.
#### FileTypeClassifier.run
```python
def run(file_paths: Union[Path, List[Path], str, List[str], List[Union[Path, str]]])
def run(file_paths: Union[Path, List[Path], str, List[str], List[Union[Path,
str]]])
```
Sends out files on a different output edge depending on their extension.
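A quick sketch of that routing behavior (the file name is a placeholder; the exact edge depends on the configured extension list):

```python
from haystack.nodes import FileTypeClassifier

classifier = FileTypeClassifier()
output, edge = classifier.run(file_paths=["sample.pdf"])  # placeholder path
print(edge)  # e.g. "output_2" if "pdf" is the second supported extension
```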


@@ -17,7 +17,10 @@ Base class for implementing file converters to transform input documents to text f
#### BaseConverter.\_\_init\_\_
```python
def __init__(remove_numeric_tables: bool = False, valid_languages: Optional[List[str]] = None, id_hash_keys: Optional[List[str]] = None, progress_bar: bool = True)
def __init__(remove_numeric_tables: bool = False,
valid_languages: Optional[List[str]] = None,
id_hash_keys: Optional[List[str]] = None,
progress_bar: bool = True)
```
**Arguments**:
@@ -44,7 +47,12 @@ In this case the id will be generated by using the content and the defined metad
```python
@abstractmethod
def convert(file_path: Path, meta: Optional[Dict[str, Any]], remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = "UTF-8", id_hash_keys: Optional[List[str]] = None) -> List[Document]
def convert(file_path: Path,
meta: Optional[Dict[str, Any]],
remove_numeric_tables: Optional[bool] = None,
valid_languages: Optional[List[str]] = None,
encoding: Optional[str] = "UTF-8",
id_hash_keys: Optional[List[str]] = None) -> List[Document]
```
Convert a file to a dictionary containing the text and any associated metadata.
@@ -77,7 +85,8 @@ In this case the id will be generated by using the content and the defined metad
#### BaseConverter.validate\_language
```python
def validate_language(text: str, valid_languages: Optional[List[str]] = None) -> bool
def validate_language(text: str,
valid_languages: Optional[List[str]] = None) -> bool
```
Validate if the language of the text is one of valid languages.
@@ -87,7 +96,14 @@ Validate if the language of the text is one of valid languages.
#### BaseConverter.run
```python
def run(file_paths: Union[Path, List[Path]], meta: Optional[Union[Dict[str, str], List[Optional[Dict[str, str]]]]] = None, remove_numeric_tables: Optional[bool] = None, known_ligatures: Dict[str, str] = KNOWN_LIGATURES, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = "UTF-8", id_hash_keys: Optional[List[str]] = None)
def run(file_paths: Union[Path, List[Path]],
meta: Optional[Union[Dict[str, str],
List[Optional[Dict[str, str]]]]] = None,
remove_numeric_tables: Optional[bool] = None,
known_ligatures: Dict[str, str] = KNOWN_LIGATURES,
valid_languages: Optional[List[str]] = None,
encoding: Optional[str] = "UTF-8",
id_hash_keys: Optional[List[str]] = None)
```
Extract text from a file.
@@ -137,7 +153,12 @@ class DocxToTextConverter(BaseConverter)
#### DocxToTextConverter.convert
```python
def convert(file_path: Path, meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = None, id_hash_keys: Optional[List[str]] = None) -> List[Document]
def convert(file_path: Path,
meta: Optional[Dict[str, str]] = None,
remove_numeric_tables: Optional[bool] = None,
valid_languages: Optional[List[str]] = None,
encoding: Optional[str] = None,
id_hash_keys: Optional[List[str]] = None) -> List[Document]
```
Extract text from a .docx file.
@@ -182,7 +203,9 @@ class ImageToTextConverter(BaseConverter)
#### ImageToTextConverter.\_\_init\_\_
```python
def __init__(remove_numeric_tables: bool = False, valid_languages: Optional[List[str]] = ["eng"], id_hash_keys: Optional[List[str]] = None)
def __init__(remove_numeric_tables: bool = False,
valid_languages: Optional[List[str]] = ["eng"],
id_hash_keys: Optional[List[str]] = None)
```
**Arguments**:
@@ -209,7 +232,12 @@ In this case the id will be generated by using the content and the defined metad
#### ImageToTextConverter.convert
```python
def convert(file_path: Union[Path, str], meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = None, id_hash_keys: Optional[List[str]] = None) -> List[Document]
def convert(file_path: Union[Path, str],
meta: Optional[Dict[str, str]] = None,
remove_numeric_tables: Optional[bool] = None,
valid_languages: Optional[List[str]] = None,
encoding: Optional[str] = None,
id_hash_keys: Optional[List[str]] = None) -> List[Document]
```
Extract text from an image file using the pytesseract library (https://github.com/madmaze/pytesseract)
@@ -252,7 +280,12 @@ class MarkdownConverter(BaseConverter)
#### MarkdownConverter.convert
```python
def convert(file_path: Path, meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = "utf-8", id_hash_keys: Optional[List[str]] = None) -> List[Document]
def convert(file_path: Path,
meta: Optional[Dict[str, str]] = None,
remove_numeric_tables: Optional[bool] = None,
valid_languages: Optional[List[str]] = None,
encoding: Optional[str] = "utf-8",
id_hash_keys: Optional[List[str]] = None) -> List[Document]
```
Reads text from a txt file and executes optional preprocessing steps.
@@ -301,7 +334,10 @@ class PDFToTextConverter(BaseConverter)
#### PDFToTextConverter.\_\_init\_\_
```python
def __init__(remove_numeric_tables: bool = False, valid_languages: Optional[List[str]] = None, id_hash_keys: Optional[List[str]] = None, encoding: Optional[str] = "UTF-8")
def __init__(remove_numeric_tables: bool = False,
valid_languages: Optional[List[str]] = None,
id_hash_keys: Optional[List[str]] = None,
encoding: Optional[str] = "UTF-8")
```
**Arguments**:
@@ -329,7 +365,12 @@ Defaults to "UTF-8" in order to support special characters (e.g. German Umlauts,
#### PDFToTextConverter.convert
```python
def convert(file_path: Path, meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = None, id_hash_keys: Optional[List[str]] = None) -> List[Document]
def convert(file_path: Path,
meta: Optional[Dict[str, str]] = None,
remove_numeric_tables: Optional[bool] = None,
valid_languages: Optional[List[str]] = None,
encoding: Optional[str] = None,
id_hash_keys: Optional[List[str]] = None) -> List[Document]
```
Extract text from a .pdf file using the pdftotext library (https://www.xpdfreader.com/pdftotext-man.html)
@@ -369,7 +410,9 @@ class PDFToTextOCRConverter(BaseConverter)
#### PDFToTextOCRConverter.\_\_init\_\_
```python
def __init__(remove_numeric_tables: bool = False, valid_languages: Optional[List[str]] = ["eng"], id_hash_keys: Optional[List[str]] = None)
def __init__(remove_numeric_tables: bool = False,
valid_languages: Optional[List[str]] = ["eng"],
id_hash_keys: Optional[List[str]] = None)
```
Extract text from an image file using the pytesseract library (https://github.com/madmaze/pytesseract)
@@ -396,7 +439,12 @@ In this case the id will be generated by using the content and the defined metad
#### PDFToTextOCRConverter.convert
```python
def convert(file_path: Path, meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = None, id_hash_keys: Optional[List[str]] = None) -> List[Document]
def convert(file_path: Path,
meta: Optional[Dict[str, str]] = None,
remove_numeric_tables: Optional[bool] = None,
valid_languages: Optional[List[str]] = None,
encoding: Optional[str] = None,
id_hash_keys: Optional[List[str]] = None) -> List[Document]
```
Convert a file to a dictionary containing the text and any associated metadata.
@@ -446,7 +494,17 @@ Supported file formats are: PDF, DOCX
#### ParsrConverter.\_\_init\_\_
```python
def __init__(parsr_url: str = "http://localhost:3001", extractor: Literal["pdfminer", "pdfjs"] = "pdfminer", table_detection_mode: Literal["lattice", "stream"] = "lattice", preceding_context_len: int = 3, following_context_len: int = 3, remove_page_headers: bool = False, remove_page_footers: bool = False, remove_table_of_contents: bool = False, valid_languages: Optional[List[str]] = None, id_hash_keys: Optional[List[str]] = None, add_page_number: bool = True)
def __init__(parsr_url: str = "http://localhost:3001",
extractor: Literal["pdfminer", "pdfjs"] = "pdfminer",
table_detection_mode: Literal["lattice", "stream"] = "lattice",
preceding_context_len: int = 3,
following_context_len: int = 3,
remove_page_headers: bool = False,
remove_page_footers: bool = False,
remove_table_of_contents: bool = False,
valid_languages: Optional[List[str]] = None,
id_hash_keys: Optional[List[str]] = None,
add_page_number: bool = True)
```
**Arguments**:
@@ -480,7 +538,12 @@ In this case the id will be generated by using the content and the defined metad
#### ParsrConverter.convert
```python
def convert(file_path: Path, meta: Optional[Dict[str, Any]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = "utf-8", id_hash_keys: Optional[List[str]] = None) -> List[Document]
def convert(file_path: Path,
meta: Optional[Dict[str, Any]] = None,
remove_numeric_tables: Optional[bool] = None,
valid_languages: Optional[List[str]] = None,
encoding: Optional[str] = "utf-8",
id_hash_keys: Optional[List[str]] = None) -> List[Document]
```
Extract text and tables from a PDF or DOCX using the open-source Parsr tool.
@@ -529,7 +592,16 @@ https://docs.microsoft.com/en-us/azure/applied-ai-services/form-recognizer/quick
#### AzureConverter.\_\_init\_\_
```python
def __init__(endpoint: str, credential_key: str, model_id: str = "prebuilt-document", valid_languages: Optional[List[str]] = None, save_json: bool = False, preceding_context_len: int = 3, following_context_len: int = 3, merge_multiple_column_headers: bool = True, id_hash_keys: Optional[List[str]] = None, add_page_number: bool = True)
def __init__(endpoint: str,
credential_key: str,
model_id: str = "prebuilt-document",
valid_languages: Optional[List[str]] = None,
save_json: bool = False,
preceding_context_len: int = 3,
following_context_len: int = 3,
merge_multiple_column_headers: bool = True,
id_hash_keys: Optional[List[str]] = None,
add_page_number: bool = True)
```
**Arguments**:
@@ -564,7 +636,14 @@ In this case the id will be generated by using the content and the defined metad
#### AzureConverter.convert
```python
def convert(file_path: Path, meta: Optional[Dict[str, Any]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = "utf-8", id_hash_keys: Optional[List[str]] = None, pages: Optional[str] = None, known_language: Optional[str] = None) -> List[Document]
def convert(file_path: Path,
meta: Optional[Dict[str, Any]] = None,
remove_numeric_tables: Optional[bool] = None,
valid_languages: Optional[List[str]] = None,
encoding: Optional[str] = "utf-8",
id_hash_keys: Optional[List[str]] = None,
pages: Optional[str] = None,
known_language: Optional[str] = None) -> List[Document]
```
Extract text and tables from a PDF, JPEG, PNG, BMP or TIFF file using Azure's Form Recognizer service.
@@ -596,7 +675,11 @@ See supported locales here: https://aka.ms/azsdk/formrecognizer/supportedlocales
#### AzureConverter.convert\_azure\_json
```python
def convert_azure_json(file_path: Path, meta: Optional[Dict[str, Any]] = None, valid_languages: Optional[List[str]] = None, id_hash_keys: Optional[List[str]] = None) -> List[Document]
def convert_azure_json(
file_path: Path,
meta: Optional[Dict[str, Any]] = None,
valid_languages: Optional[List[str]] = None,
id_hash_keys: Optional[List[str]] = None) -> List[Document]
```
Extract text and tables from the JSON output of Azure's Form Recognizer service.
@@ -633,7 +716,10 @@ class TikaConverter(BaseConverter)
#### TikaConverter.\_\_init\_\_
```python
def __init__(tika_url: str = "http://localhost:9998/tika", remove_numeric_tables: bool = False, valid_languages: Optional[List[str]] = None, id_hash_keys: Optional[List[str]] = None)
def __init__(tika_url: str = "http://localhost:9998/tika",
remove_numeric_tables: bool = False,
valid_languages: Optional[List[str]] = None,
id_hash_keys: Optional[List[str]] = None)
```
**Arguments**:
@@ -659,7 +745,12 @@ In this case the id will be generated by using the content and the defined metad
#### TikaConverter.convert
```python
def convert(file_path: Path, meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = None, id_hash_keys: Optional[List[str]] = None) -> List[Document]
def convert(file_path: Path,
meta: Optional[Dict[str, str]] = None,
remove_numeric_tables: Optional[bool] = None,
valid_languages: Optional[List[str]] = None,
encoding: Optional[str] = None,
id_hash_keys: Optional[List[str]] = None) -> List[Document]
```
**Arguments**:
@@ -703,7 +794,12 @@ class TextConverter(BaseConverter)
#### TextConverter.convert
```python
def convert(file_path: Path, meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = "utf-8", id_hash_keys: Optional[List[str]] = None) -> List[Document]
def convert(file_path: Path,
meta: Optional[Dict[str, str]] = None,
remove_numeric_tables: Optional[bool] = None,
valid_languages: Optional[List[str]] = None,
encoding: Optional[str] = "utf-8",
id_hash_keys: Optional[List[str]] = None) -> List[Document]
```
Reads text from a txt file and executes optional preprocessing steps.
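As a usage sketch of the converter API documented above (the file path and meta values are placeholders):

```python
from pathlib import Path
from haystack.nodes import TextConverter

converter = TextConverter(remove_numeric_tables=False)
docs = converter.convert(file_path=Path("sample.txt"),  # placeholder path
                         meta={"source": "local"})
print(docs[0].content[:200])
```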


@@ -18,7 +18,8 @@ Abstract class for Generators
```python
@abstractmethod
def predict(query: str, documents: List[Document], top_k: Optional[int]) -> Dict
def predict(query: str, documents: List[Document],
top_k: Optional[int]) -> Dict
```
Abstract method to generate answers.
@@ -38,7 +39,10 @@ Generated answers plus additional info in a dict
#### BaseGenerator.predict\_batch
```python
def predict_batch(queries: List[str], documents: Union[List[Document], List[List[Document]]], top_k: Optional[int] = None, batch_size: Optional[int] = None)
def predict_batch(queries: List[str],
documents: Union[List[Document], List[List[Document]]],
top_k: Optional[int] = None,
batch_size: Optional[int] = None)
```
Generate the answer to the input queries. The generation will be conditioned on the supplied documents.
@@ -138,7 +142,20 @@ i.e. the model can easily adjust to domain documents even after training has fin
#### RAGenerator.\_\_init\_\_
```python
def __init__(model_name_or_path: str = "facebook/rag-token-nq", model_version: Optional[str] = None, retriever: Optional[DensePassageRetriever] = None, generator_type: str = "token", top_k: int = 2, max_length: int = 200, min_length: int = 2, num_beams: int = 2, embed_title: bool = True, prefix: Optional[str] = None, use_gpu: bool = True, progress_bar: bool = True, use_auth_token: Optional[Union[str, bool]] = None, devices: Optional[List[Union[str, torch.device]]] = None)
def __init__(model_name_or_path: str = "facebook/rag-token-nq",
model_version: Optional[str] = None,
retriever: Optional[DensePassageRetriever] = None,
generator_type: str = "token",
top_k: int = 2,
max_length: int = 200,
min_length: int = 2,
num_beams: int = 2,
embed_title: bool = True,
prefix: Optional[str] = None,
use_gpu: bool = True,
progress_bar: bool = True,
use_auth_token: Optional[Union[str, bool]] = None,
devices: Optional[List[Union[str, torch.device]]] = None)
```
Load a RAG model from Transformers along with passage_embedding_model.
@@ -176,7 +193,9 @@ parameter is not used and a single cpu device is used for inference.
#### RAGenerator.predict
```python
def predict(query: str, documents: List[Document], top_k: Optional[int] = None) -> Dict
def predict(query: str,
documents: List[Document],
top_k: Optional[int] = None) -> Dict
```
Generate the answer to the input query. The generation will be conditioned on the supplied documents.
@@ -266,7 +285,16 @@ the [Hugging Face Model Hub](https://huggingface.co/models?pipeline_tag=text2tex
#### Seq2SeqGenerator.\_\_init\_\_
```python
def __init__(model_name_or_path: str, input_converter: Optional[Callable] = None, top_k: int = 1, max_length: int = 200, min_length: int = 2, num_beams: int = 8, use_gpu: bool = True, progress_bar: bool = True, use_auth_token: Optional[Union[str, bool]] = None, devices: Optional[List[Union[str, torch.device]]] = None)
def __init__(model_name_or_path: str,
input_converter: Optional[Callable] = None,
top_k: int = 1,
max_length: int = 200,
min_length: int = 2,
num_beams: int = 8,
use_gpu: bool = True,
progress_bar: bool = True,
use_auth_token: Optional[Union[str, bool]] = None,
devices: Optional[List[Union[str, torch.device]]] = None)
```
**Arguments**:
@@ -298,7 +326,9 @@ parameter is not used and a single cpu device is used for inference.
#### Seq2SeqGenerator.predict
```python
def predict(query: str, documents: List[Document], top_k: Optional[int] = None) -> Dict
def predict(query: str,
documents: List[Document],
top_k: Optional[int] = None) -> Dict
```
Generate the answer to the input query. The generation will be conditioned on the supplied documents.
@@ -338,7 +368,17 @@ on the [OpenAI API website](https://openai.com/api/).
#### OpenAIAnswerGenerator.\_\_init\_\_
```python
def __init__(api_key: str, model: str = "text-curie-001", max_tokens: int = 7, top_k: int = 5, temperature: int = 0, presence_penalty: float = -2.0, frequency_penalty: float = -2.0, examples_context: Optional[str] = None, examples: Optional[List] = None, stop_words: Optional[List] = None, progress_bar: bool = True)
def __init__(api_key: str,
model: str = "text-curie-001",
max_tokens: int = 7,
top_k: int = 5,
temperature: int = 0,
presence_penalty: float = -2.0,
frequency_penalty: float = -2.0,
examples_context: Optional[str] = None,
examples: Optional[List] = None,
stop_words: Optional[List] = None,
progress_bar: bool = True)
```
**Arguments**:
@@ -374,7 +414,9 @@ If you don't provide it, the default from OpenAPI docs is used: ["\n", "<|endoft
#### OpenAIAnswerGenerator.predict
```python
def predict(query: str, documents: List[Document], top_k: Optional[int] = None)
def predict(query: str,
documents: List[Document],
top_k: Optional[int] = None)
```
Use the loaded QA model to generate Answers for a query based on the Documents it receives.
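A minimal sketch of `predict`, assuming a valid OpenAI API key (the key and the document content are placeholders):

```python
from haystack import Document
from haystack.nodes import OpenAIAnswerGenerator

generator = OpenAIAnswerGenerator(api_key="YOUR_OPENAI_KEY", top_k=1)  # placeholder key
result = generator.predict(
    query="Who develops Haystack?",
    documents=[Document(content="Haystack is an open-source framework by deepset.")],
)
print(result["answers"])
```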


@@ -45,7 +45,10 @@ The node allows multiple join modes:
#### JoinDocuments.\_\_init\_\_
```python
def __init__(join_mode: str = "concatenate", weights: Optional[List[float]] = None, top_k_join: Optional[int] = None, sort_by_score: bool = True)
def __init__(join_mode: str = "concatenate",
weights: Optional[List[float]] = None,
top_k_join: Optional[int] = None,
sort_by_score: bool = True)
```
**Arguments**:
@@ -79,7 +82,10 @@ A node to join `Answer`s produced by multiple `Reader` nodes.
#### JoinAnswers.\_\_init\_\_
```python
def __init__(join_mode: str = "concatenate", weights: Optional[List[float]] = None, top_k_join: Optional[int] = None, sort_by_score: bool = True)
def __init__(join_mode: str = "concatenate",
weights: Optional[List[float]] = None,
top_k_join: Optional[int] = None,
sort_by_score: bool = True)
```
**Arguments**:
@@ -114,7 +120,8 @@ different nodes.
#### RouteDocuments.\_\_init\_\_
```python
def __init__(split_by: str = "content_type", metadata_values: Optional[List[str]] = None)
def __init__(split_by: str = "content_type",
metadata_values: Optional[List[str]] = None)
```
**Arguments**:


@@ -42,7 +42,9 @@ Note that this also includes such components that are being utilized by other co
#### Pipeline.to\_code
```python
def to_code(pipeline_variable_name: str = "pipeline", generate_imports: bool = True, add_comment: bool = False) -> str
def to_code(pipeline_variable_name: str = "pipeline",
generate_imports: bool = True,
add_comment: bool = False) -> str
```
Returns the code to create this pipeline as a string.
@@ -61,7 +63,9 @@ Default value is False.
#### Pipeline.to\_notebook\_cell
```python
def to_notebook_cell(pipeline_variable_name: str = "pipeline", generate_imports: bool = True, add_comment: bool = True)
def to_notebook_cell(pipeline_variable_name: str = "pipeline",
generate_imports: bool = True,
add_comment: bool = True)
```
Creates a new notebook cell with the code to create this pipeline.
@@ -81,7 +85,13 @@ Default value is True.
```python
@classmethod
def load_from_deepset_cloud(cls, pipeline_config_name: str, pipeline_name: str = "query", workspace: str = "default", api_key: Optional[str] = None, api_endpoint: Optional[str] = None, overwrite_with_env_variables: bool = False)
def load_from_deepset_cloud(cls,
pipeline_config_name: str,
pipeline_name: str = "query",
workspace: str = "default",
api_key: Optional[str] = None,
api_endpoint: Optional[str] = None,
overwrite_with_env_variables: bool = False)
```
Load Pipeline from Deepset Cloud defining the individual components and how they're tied together to form
@@ -114,7 +124,11 @@ variable 'READER_PARAMS_RETURN_NO_ANSWER=False' can be set. Note that an
```python
@classmethod
def list_pipelines_on_deepset_cloud(cls, workspace: str = "default", api_key: Optional[str] = None, api_endpoint: Optional[str] = None) -> List[dict]
def list_pipelines_on_deepset_cloud(
cls,
workspace: str = "default",
api_key: Optional[str] = None,
api_endpoint: Optional[str] = None) -> List[dict]
```
Lists all pipeline configs available on Deepset Cloud.
@@ -150,7 +164,14 @@ Returns:
```python
@classmethod
def save_to_deepset_cloud(cls, query_pipeline: Pipeline, index_pipeline: Pipeline, pipeline_config_name: str, workspace: str = "default", api_key: Optional[str] = None, api_endpoint: Optional[str] = None, overwrite: bool = False)
def save_to_deepset_cloud(cls,
query_pipeline: Pipeline,
index_pipeline: Pipeline,
pipeline_config_name: str,
workspace: str = "default",
api_key: Optional[str] = None,
api_endpoint: Optional[str] = None,
overwrite: bool = False)
```
Saves a Pipeline config to Deepset Cloud defining the individual components and how they're tied together to form
@@ -175,7 +196,13 @@ If not specified, will be read from DEEPSET_CLOUD_API_ENDPOINT environment varia
```python
@classmethod
def deploy_on_deepset_cloud(cls, pipeline_config_name: str, workspace: str = "default", api_key: Optional[str] = None, api_endpoint: Optional[str] = None, timeout: int = 60, show_curl_message: bool = True)
def deploy_on_deepset_cloud(cls,
pipeline_config_name: str,
workspace: str = "default",
api_key: Optional[str] = None,
api_endpoint: Optional[str] = None,
timeout: int = 60,
show_curl_message: bool = True)
```
Deploys the pipelines of a pipeline config on Deepset Cloud.
@@ -205,7 +232,12 @@ If the timeout is exceeded an error will be raised.
```python
@classmethod
def undeploy_on_deepset_cloud(cls, pipeline_config_name: str, workspace: str = "default", api_key: Optional[str] = None, api_endpoint: Optional[str] = None, timeout: int = 60)
def undeploy_on_deepset_cloud(cls,
pipeline_config_name: str,
workspace: str = "default",
api_key: Optional[str] = None,
api_endpoint: Optional[str] = None,
timeout: int = 60)
```
Undeploys the pipelines of a pipeline config on Deepset Cloud.
@@ -285,7 +317,13 @@ Set the component for a node in the Pipeline.
#### Pipeline.run
```python
def run(query: Optional[str] = None, file_paths: Optional[List[str]] = None, labels: Optional[MultiLabel] = None, documents: Optional[List[Document]] = None, meta: Optional[Union[dict, List[dict]]] = None, params: Optional[dict] = None, debug: Optional[bool] = None)
def run(query: Optional[str] = None,
file_paths: Optional[List[str]] = None,
labels: Optional[MultiLabel] = None,
documents: Optional[List[Document]] = None,
meta: Optional[Union[dict, List[dict]]] = None,
params: Optional[dict] = None,
debug: Optional[bool] = None)
```
Runs the Pipeline, one node at a time.
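As a sketch, a small indexing pipeline run over a local file (the file path is a placeholder):

```python
from haystack import Pipeline
from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import PreProcessor, TextConverter

store = InMemoryDocumentStore()
pipe = Pipeline()
pipe.add_node(component=TextConverter(), name="Converter", inputs=["File"])
pipe.add_node(component=PreProcessor(), name="PreProcessor", inputs=["Converter"])
pipe.add_node(component=store, name="DocumentStore", inputs=["PreProcessor"])
pipe.run(file_paths=["sample.txt"])  # placeholder path
```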
@@ -310,7 +348,15 @@ the Nodes received and the output they generated. You can then find all debug in
#### Pipeline.run\_batch
```python
def run_batch(queries: List[str] = None, file_paths: Optional[List[str]] = None, labels: Optional[Union[MultiLabel, List[MultiLabel]]] = None, documents: Optional[Union[List[Document], List[List[Document]]]] = None, meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None, params: Optional[dict] = None, debug: Optional[bool] = None)
def run_batch(queries: List[str] = None,
file_paths: Optional[List[str]] = None,
labels: Optional[Union[MultiLabel, List[MultiLabel]]] = None,
documents: Optional[Union[List[Document],
List[List[Document]]]] = None,
meta: Optional[Union[Dict[str, Any], List[Dict[str,
Any]]]] = None,
params: Optional[dict] = None,
debug: Optional[bool] = None)
```
Runs the Pipeline in batch mode, one node at a time. Batch mode means that the Pipeline can take more than one query as input. You can use this method for query pipelines only. When used with an indexing pipeline, it calls the pipeline `run()` method.
@@ -346,7 +392,18 @@ the Nodes received and the output they generated. You can then find all debug in
```python
@classmethod
def eval_beir(cls, index_pipeline: Pipeline, query_pipeline: Pipeline, index_params: dict = {}, query_params: dict = {}, dataset: str = "scifact", dataset_dir: Path = Path("."), top_k_values: List[int] = [1, 3, 5, 10, 100, 1000], keep_index: bool = False) -> Tuple[Dict[str, float], Dict[str, float], Dict[str, float], Dict[str, float]]
def eval_beir(
cls,
index_pipeline: Pipeline,
query_pipeline: Pipeline,
index_params: dict = {},
query_params: dict = {},
dataset: str = "scifact",
dataset_dir: Path = Path("."),
top_k_values: List[int] = [1, 3, 5, 10, 100, 1000],
keep_index: bool = False
) -> Tuple[Dict[str, float], Dict[str, float], Dict[str, float], Dict[str,
float]]
```
Runs information retrieval evaluation of a pipeline using BEIR on a specified BEIR dataset.
@@ -375,14 +432,38 @@ Each metric is represented by a dictionary containing the scores for each top_k
```python
@classmethod
def execute_eval_run(cls, index_pipeline: Pipeline, query_pipeline: Pipeline, evaluation_set_labels: List[MultiLabel], corpus_file_paths: List[str], experiment_name: str, experiment_run_name: str, experiment_tracking_tool: Literal["mlflow", None] = None, experiment_tracking_uri: Optional[str] = None, corpus_file_metas: List[Dict[str, Any]] = None, corpus_meta: Dict[str, Any] = {}, evaluation_set_meta: Dict[str, Any] = {}, pipeline_meta: Dict[str, Any] = {}, index_params: dict = {}, query_params: dict = {}, sas_model_name_or_path: str = None, sas_batch_size: int = 32, sas_use_gpu: bool = True, use_batch_mode: bool = False, add_isolated_node_eval: bool = False, reuse_index: bool = False, custom_document_id_field: Optional[str] = None, document_scope: Literal[
"document_id",
"context",
"document_id_and_context",
"document_id_or_context",
"answer",
"document_id_or_answer",
] = "document_id_or_answer", answer_scope: Literal["any", "context", "document_id", "document_id_and_context"] = "any", context_matching_min_length: int = 100, context_matching_boost_split_overlaps: bool = True, context_matching_threshold: float = 65.0) -> EvaluationResult
def execute_eval_run(
cls,
index_pipeline: Pipeline,
query_pipeline: Pipeline,
evaluation_set_labels: List[MultiLabel],
corpus_file_paths: List[str],
experiment_name: str,
experiment_run_name: str,
experiment_tracking_tool: Literal["mlflow", None] = None,
experiment_tracking_uri: Optional[str] = None,
corpus_file_metas: List[Dict[str, Any]] = None,
corpus_meta: Dict[str, Any] = {},
evaluation_set_meta: Dict[str, Any] = {},
pipeline_meta: Dict[str, Any] = {},
index_params: dict = {},
query_params: dict = {},
sas_model_name_or_path: str = None,
sas_batch_size: int = 32,
sas_use_gpu: bool = True,
use_batch_mode: bool = False,
add_isolated_node_eval: bool = False,
reuse_index: bool = False,
custom_document_id_field: Optional[str] = None,
document_scope: Literal[
"document_id", "context", "document_id_and_context",
"document_id_or_context", "answer",
"document_id_or_answer", ] = "document_id_or_answer",
answer_scope: Literal["any", "context", "document_id",
"document_id_and_context"] = "any",
context_matching_min_length: int = 100,
context_matching_boost_split_overlaps: bool = True,
context_matching_threshold: float = 65.0) -> EvaluationResult
```
Starts an experiment run that first indexes the specified files (forming a corpus) using the index pipeline
@@ -510,7 +591,19 @@ Thus [AB] <-> [BC] (score ~50) gets recalculated with B <-> B (score ~100) scori
```python
@send_event
def eval(labels: List[MultiLabel], documents: Optional[List[List[Document]]] = None, params: Optional[dict] = None, sas_model_name_or_path: Optional[str] = None, sas_batch_size: int = 32, sas_use_gpu: bool = True, add_isolated_node_eval: bool = False, custom_document_id_field: Optional[str] = None, context_matching_min_length: int = 100, context_matching_boost_split_overlaps: bool = True, context_matching_threshold: float = 65.0, use_auth_token: Optional[Union[str, bool]] = None) -> EvaluationResult
def eval(
labels: List[MultiLabel],
documents: Optional[List[List[Document]]] = None,
params: Optional[dict] = None,
sas_model_name_or_path: Optional[str] = None,
sas_batch_size: int = 32,
sas_use_gpu: bool = True,
add_isolated_node_eval: bool = False,
custom_document_id_field: Optional[str] = None,
context_matching_min_length: int = 100,
context_matching_boost_split_overlaps: bool = True,
context_matching_threshold: float = 65.0,
use_auth_token: Optional[Union[str, bool]] = None) -> EvaluationResult
```
Evaluates the pipeline by running it once per query in debug mode
@@ -576,7 +669,19 @@ https://huggingface.co/transformers/main_classes/model.html#transformers.PreTrai
```python
@send_event
def eval_batch(labels: List[MultiLabel], documents: Optional[List[List[Document]]] = None, params: Optional[dict] = None, sas_model_name_or_path: Optional[str] = None, sas_batch_size: int = 32, sas_use_gpu: bool = True, add_isolated_node_eval: bool = False, custom_document_id_field: Optional[str] = None, context_matching_min_length: int = 100, context_matching_boost_split_overlaps: bool = True, context_matching_threshold: float = 65.0, use_auth_token: Optional[Union[str, bool]] = None) -> EvaluationResult
def eval_batch(
labels: List[MultiLabel],
documents: Optional[List[List[Document]]] = None,
params: Optional[dict] = None,
sas_model_name_or_path: Optional[str] = None,
sas_batch_size: int = 32,
sas_use_gpu: bool = True,
add_isolated_node_eval: bool = False,
custom_document_id_field: Optional[str] = None,
context_matching_min_length: int = 100,
context_matching_boost_split_overlaps: bool = True,
context_matching_threshold: float = 65.0,
use_auth_token: Optional[Union[str, bool]] = None) -> EvaluationResult
```
Evaluates the pipeline by running it in batches in debug mode
@@ -690,7 +795,11 @@ Create a Graphviz visualization of the pipeline.
```python
@classmethod
def load_from_yaml(cls, path: Path, pipeline_name: Optional[str] = None, overwrite_with_env_variables: bool = True, strict_version_check: bool = False)
def load_from_yaml(cls,
path: Path,
pipeline_name: Optional[str] = None,
overwrite_with_env_variables: bool = True,
strict_version_check: bool = False)
```
Load Pipeline from a YAML file defining the individual components and how they're tied together to form
@@ -747,7 +856,11 @@ variable 'MYDOCSTORE_PARAMS_INDEX=documents-2021' can be set. Note that an
```python
@classmethod
def load_from_config(cls, pipeline_config: Dict, pipeline_name: Optional[str] = None, overwrite_with_env_variables: bool = True, strict_version_check: bool = False)
def load_from_config(cls,
pipeline_config: Dict,
pipeline_name: Optional[str] = None,
overwrite_with_env_variables: bool = True,
strict_version_check: bool = False)
```
Load Pipeline from a config dict defining the individual components and how they're tied together to form
@@ -832,14 +945,19 @@ Returns a configuration for the Pipeline that can be used with `Pipeline.load_fr
#### Pipeline.print\_eval\_report
```python
def print_eval_report(eval_result: EvaluationResult, n_wrong_examples: int = 3, metrics_filter: Optional[Dict[str, List[str]]] = None, document_scope: Literal[
"document_id",
"context",
"document_id_and_context",
"document_id_or_context",
"answer",
"document_id_or_answer",
] = "document_id_or_answer", answer_scope: Literal["any", "context", "document_id", "document_id_and_context"] = "any", wrong_examples_fields: List[str] = ["answer", "context", "document_id"], max_characters_per_field: int = 150)
def print_eval_report(eval_result: EvaluationResult,
n_wrong_examples: int = 3,
metrics_filter: Optional[Dict[str, List[str]]] = None,
document_scope: Literal[
"document_id", "context", "document_id_and_context",
"document_id_or_context", "answer",
"document_id_or_answer", ] = "document_id_or_answer",
answer_scope: Literal["any", "context", "document_id",
"document_id_and_context"] = "any",
wrong_examples_fields: List[str] = [
"answer", "context", "document_id"
],
max_characters_per_field: int = 150)
```
Prints evaluation report containing a metrics funnel and worst queries for further analysis.
@@ -892,7 +1010,8 @@ class _HaystackBeirRetrieverAdapter()
#### \_HaystackBeirRetrieverAdapter.\_\_init\_\_
```python
def __init__(index_pipeline: Pipeline, query_pipeline: Pipeline, index_params: dict, query_params: dict)
def __init__(index_pipeline: Pipeline, query_pipeline: Pipeline,
index_params: dict, query_params: dict)
```
Adapter mimicking a BEIR retriever used by BEIR's EvaluateRetrieval class to run BEIR evaluations on Haystack Pipelines.
@@ -959,7 +1078,9 @@ YAML definitions of Ray pipelines are validated at load. For more information, s
#### RayPipeline.\_\_init\_\_
```python
def __init__(address: str = None, ray_args: Optional[Dict[str, Any]] = None, serve_args: Optional[Dict[str, Any]] = None)
def __init__(address: str = None,
ray_args: Optional[Dict[str, Any]] = None,
serve_args: Optional[Dict[str, Any]] = None)
```
**Arguments**:
@@ -974,7 +1095,14 @@ def __init__(address: str = None, ray_args: Optional[Dict[str, Any]] = None, ser
```python
@classmethod
def load_from_yaml(cls, path: Path, pipeline_name: Optional[str] = None, overwrite_with_env_variables: bool = True, address: Optional[str] = None, strict_version_check: bool = False, ray_args: Optional[Dict[str, Any]] = None, serve_args: Optional[Dict[str, Any]] = None)
def load_from_yaml(cls,
path: Path,
pipeline_name: Optional[str] = None,
overwrite_with_env_variables: bool = True,
address: Optional[str] = None,
strict_version_check: bool = False,
ray_args: Optional[Dict[str, Any]] = None,
serve_args: Optional[Dict[str, Any]] = None)
```
Load Pipeline from a YAML file defining the individual components and how they're tied together to form
@@ -1189,7 +1317,10 @@ Save a YAML configuration for the Pipeline that can be used with `Pipeline.load_
```python
@classmethod
def load_from_yaml(cls, path: Path, pipeline_name: Optional[str] = None, overwrite_with_env_variables: bool = True)
def load_from_yaml(cls,
path: Path,
pipeline_name: Optional[str] = None,
overwrite_with_env_variables: bool = True)
```
Load Pipeline from a YAML file defining the individual components and how they're tied together to form
@@ -1277,7 +1408,16 @@ Instance of DocumentStore or None
#### BaseStandardPipeline.eval
```python
def eval(labels: List[MultiLabel], params: Optional[dict] = None, sas_model_name_or_path: Optional[str] = None, sas_batch_size: int = 32, sas_use_gpu: bool = True, add_isolated_node_eval: bool = False, custom_document_id_field: Optional[str] = None, context_matching_min_length: int = 100, context_matching_boost_split_overlaps: bool = True, context_matching_threshold: float = 65.0) -> EvaluationResult
def eval(labels: List[MultiLabel],
params: Optional[dict] = None,
sas_model_name_or_path: Optional[str] = None,
sas_batch_size: int = 32,
sas_use_gpu: bool = True,
add_isolated_node_eval: bool = False,
custom_document_id_field: Optional[str] = None,
context_matching_min_length: int = 100,
context_matching_boost_split_overlaps: bool = True,
context_matching_threshold: float = 65.0) -> EvaluationResult
```
Evaluates the pipeline by running it once per query in debug mode
@@ -1318,7 +1458,16 @@ Thus [AB] <-> [BC] (score ~50) gets recalculated with B <-> B (score ~100) scori
#### BaseStandardPipeline.eval\_batch
```python
def eval_batch(labels: List[MultiLabel], params: Optional[dict] = None, sas_model_name_or_path: Optional[str] = None, sas_batch_size: int = 32, sas_use_gpu: bool = True, add_isolated_node_eval: bool = False, custom_document_id_field: Optional[str] = None, context_matching_min_length: int = 100, context_matching_boost_split_overlaps: bool = True, context_matching_threshold: float = 65.0) -> EvaluationResult
def eval_batch(labels: List[MultiLabel],
params: Optional[dict] = None,
sas_model_name_or_path: Optional[str] = None,
sas_batch_size: int = 32,
sas_use_gpu: bool = True,
add_isolated_node_eval: bool = False,
custom_document_id_field: Optional[str] = None,
context_matching_min_length: int = 100,
context_matching_boost_split_overlaps: bool = True,
context_matching_threshold: float = 65.0) -> EvaluationResult
```
Evaluates the pipeline by running it once per query in debug mode
@@ -1358,14 +1507,19 @@ To calculate SAS (Semantic Answer Similarity) metrics, specify `sas_model_name_o
#### BaseStandardPipeline.print\_eval\_report
```python
def print_eval_report(eval_result: EvaluationResult, n_wrong_examples: int = 3, metrics_filter: Optional[Dict[str, List[str]]] = None, document_scope: Literal[
"document_id",
"context",
"document_id_and_context",
"document_id_or_context",
"answer",
"document_id_or_answer",
] = "document_id_or_answer", answer_scope: Literal["any", "context", "document_id", "document_id_and_context"] = "any", wrong_examples_fields: List[str] = ["answer", "context", "document_id"], max_characters_per_field: int = 150)
def print_eval_report(eval_result: EvaluationResult,
n_wrong_examples: int = 3,
metrics_filter: Optional[Dict[str, List[str]]] = None,
document_scope: Literal[
"document_id", "context", "document_id_and_context",
"document_id_or_context", "answer",
"document_id_or_answer", ] = "document_id_or_answer",
answer_scope: Literal["any", "context", "document_id",
"document_id_and_context"] = "any",
wrong_examples_fields: List[str] = [
"answer", "context", "document_id"
],
max_characters_per_field: int = 150)
```
Prints evaluation report containing a metrics funnel and worst queries for further analysis.
@@ -1410,7 +1564,9 @@ In Question Answering, to enforce that the retrieved document is considered corr
#### BaseStandardPipeline.run\_batch
```python
def run_batch(queries: List[str], params: Optional[dict] = None, debug: Optional[bool] = None)
def run_batch(queries: List[str],
params: Optional[dict] = None,
debug: Optional[bool] = None)
```
Run a batch of queries through the pipeline.
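For illustration, a sketch of `run_batch` on one of the standard pipelines below, using an in-memory store and a TF-IDF retriever:

```python
from haystack import Document
from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import TfidfRetriever
from haystack.pipelines import DocumentSearchPipeline

store = InMemoryDocumentStore()
store.write_documents([Document(content="Berlin is the capital of Germany.")])

pipeline = DocumentSearchPipeline(retriever=TfidfRetriever(document_store=store))
result = pipeline.run_batch(queries=["capital of Germany"],
                            params={"Retriever": {"top_k": 1}})
print(result["documents"])  # one list of hits per input query
```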
@@ -1454,7 +1610,9 @@ def __init__(reader: BaseReader, retriever: BaseRetriever)
#### ExtractiveQAPipeline.run
```python
def run(query: str, params: Optional[dict] = None, debug: Optional[bool] = None)
def run(query: str,
params: Optional[dict] = None,
debug: Optional[bool] = None)
```
**Arguments**:
@@ -1495,7 +1653,9 @@ def __init__(retriever: BaseRetriever)
#### DocumentSearchPipeline.run
```python
def run(query: str, params: Optional[dict] = None, debug: Optional[bool] = None)
def run(query: str,
params: Optional[dict] = None,
debug: Optional[bool] = None)
```
**Arguments**:
@@ -1536,7 +1696,9 @@ def __init__(generator: BaseGenerator, retriever: BaseRetriever)
#### GenerativeQAPipeline.run
```python
def run(query: str, params: Optional[dict] = None, debug: Optional[bool] = None)
def run(query: str,
params: Optional[dict] = None,
debug: Optional[bool] = None)
```
**Arguments**:
@@ -1565,7 +1727,9 @@ Pipeline that retrieves documents for a query and then summarizes those document
#### SearchSummarizationPipeline.\_\_init\_\_
```python
def __init__(summarizer: BaseSummarizer, retriever: BaseRetriever, return_in_answer_format: bool = False)
def __init__(summarizer: BaseSummarizer,
retriever: BaseRetriever,
return_in_answer_format: bool = False)
```
**Arguments**:
@@ -1581,7 +1745,9 @@ pipeline as a "drop-in replacement" for other QA pipelines.
#### SearchSummarizationPipeline.run
```python
def run(query: str, params: Optional[dict] = None, debug: Optional[bool] = None)
def run(query: str,
params: Optional[dict] = None,
debug: Optional[bool] = None)
```
**Arguments**:
@@ -1600,7 +1766,9 @@ by this method under the key "_debug"
#### SearchSummarizationPipeline.run\_batch
```python
def run_batch(queries: List[str], params: Optional[dict] = None, debug: Optional[bool] = None)
def run_batch(queries: List[str],
params: Optional[dict] = None,
debug: Optional[bool] = None)
```
Run a batch of queries through the pipeline.
@@ -1643,7 +1811,9 @@ def __init__(retriever: BaseRetriever)
#### FAQPipeline.run
```python
def run(query: str, params: Optional[dict] = None, debug: Optional[bool] = None)
def run(query: str,
params: Optional[dict] = None,
debug: Optional[bool] = None)
```
**Arguments**:
@@ -1672,7 +1842,9 @@ Takes an existing search pipeline and adds one "input translation node" after th
#### TranslationWrapperPipeline.\_\_init\_\_
```python
def __init__(input_translator: BaseTranslator, output_translator: BaseTranslator, pipeline: BaseStandardPipeline)
def __init__(input_translator: BaseTranslator,
output_translator: BaseTranslator,
pipeline: BaseStandardPipeline)
```
Wrap a given `pipeline` with the `input_translator` and `output_translator`.


@@ -16,7 +16,16 @@ class BasePreProcessor(BaseComponent)
```python
@abstractmethod
def process(documents: Union[dict, Document, List[Union[dict, Document]]], clean_whitespace: Optional[bool] = True, clean_header_footer: Optional[bool] = False, clean_empty_lines: Optional[bool] = True, remove_substrings: List[str] = [], split_by: Optional[str] = "word", split_length: Optional[int] = 1000, split_overlap: Optional[int] = None, split_respect_sentence_boundary: Optional[bool] = True, id_hash_keys: Optional[List[str]] = None) -> List[Document]
def process(documents: Union[dict, Document, List[Union[dict, Document]]],
clean_whitespace: Optional[bool] = True,
clean_header_footer: Optional[bool] = False,
clean_empty_lines: Optional[bool] = True,
remove_substrings: List[str] = [],
split_by: Optional[str] = "word",
split_length: Optional[int] = 1000,
split_overlap: Optional[int] = None,
split_respect_sentence_boundary: Optional[bool] = True,
id_hash_keys: Optional[List[str]] = None) -> List[Document]
```
Perform document cleaning and splitting. Takes a single Document or a List of Documents as input and returns a
@@ -39,7 +48,19 @@ class PreProcessor(BasePreProcessor)
#### PreProcessor.\_\_init\_\_
```python
def __init__(clean_whitespace: bool = True, clean_header_footer: bool = False, clean_empty_lines: bool = True, remove_substrings: List[str] = [], split_by: str = "word", split_length: int = 200, split_overlap: int = 0, split_respect_sentence_boundary: bool = True, tokenizer_model_folder: Optional[Union[str, Path]] = None, language: str = "en", id_hash_keys: Optional[List[str]] = None, progress_bar: bool = True, add_page_number: bool = False)
def __init__(clean_whitespace: bool = True,
clean_header_footer: bool = False,
clean_empty_lines: bool = True,
remove_substrings: List[str] = [],
split_by: str = "word",
split_length: int = 200,
split_overlap: int = 0,
split_respect_sentence_boundary: bool = True,
tokenizer_model_folder: Optional[Union[str, Path]] = None,
language: str = "en",
id_hash_keys: Optional[List[str]] = None,
progress_bar: bool = True,
add_page_number: bool = False)
```
**Arguments**:
@@ -80,7 +101,16 @@ in between pages by `PDFToTextConverter`, `TikaConverter`, `ParsrConverter` and
#### PreProcessor.process
```python
def process(documents: Union[dict, Document, List[Union[dict, Document]]], clean_whitespace: Optional[bool] = None, clean_header_footer: Optional[bool] = None, clean_empty_lines: Optional[bool] = None, remove_substrings: List[str] = [], split_by: Optional[str] = None, split_length: Optional[int] = None, split_overlap: Optional[int] = None, split_respect_sentence_boundary: Optional[bool] = None, id_hash_keys: Optional[List[str]] = None) -> List[Document]
def process(documents: Union[dict, Document, List[Union[dict, Document]]],
clean_whitespace: Optional[bool] = None,
clean_header_footer: Optional[bool] = None,
clean_empty_lines: Optional[bool] = None,
remove_substrings: List[str] = [],
split_by: Optional[str] = None,
split_length: Optional[int] = None,
split_overlap: Optional[int] = None,
split_respect_sentence_boundary: Optional[bool] = None,
id_hash_keys: Optional[List[str]] = None) -> List[Document]
```
Perform document cleaning and splitting. Takes a single document or a list of documents as input and returns a list of documents.
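A quick sketch of `process` using the parameters shown above (the input text is synthetic):

```python
from haystack import Document
from haystack.nodes import PreProcessor

preprocessor = PreProcessor(split_by="word",
                            split_length=200,
                            split_overlap=0,
                            split_respect_sentence_boundary=False)
docs = preprocessor.process([Document(content="long text " * 1000)])  # 2000 words
print(len(docs))  # 2000 words / 200 words per split = 10 documents
```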
@@ -90,7 +120,12 @@ Perform document cleaning and splitting. Can take a single document or a list of
#### PreProcessor.clean
```python
def clean(document: Union[dict, Document], clean_whitespace: bool, clean_header_footer: bool, clean_empty_lines: bool, remove_substrings: List[str], id_hash_keys: Optional[List[str]] = None) -> Document
def clean(document: Union[dict, Document],
clean_whitespace: bool,
clean_header_footer: bool,
clean_empty_lines: bool,
remove_substrings: List[str],
id_hash_keys: Optional[List[str]] = None) -> Document
```
Perform document cleaning on a single document and return a single document. This method will deal with whitespace, headers, footers
@@ -101,7 +136,12 @@ and empty lines. Its exact functionality is defined by the parameters passed int
#### PreProcessor.split
```python
def split(document: Union[dict, Document], split_by: str, split_length: int, split_overlap: int, split_respect_sentence_boundary: bool, id_hash_keys: Optional[List[str]] = None) -> List[Document]
def split(document: Union[dict, Document],
split_by: str,
split_length: int,
split_overlap: int,
split_respect_sentence_boundary: bool,
id_hash_keys: Optional[List[str]] = None) -> List[Document]
```
Perform document splitting on a single document. This method can split on different units, at different lengths,

View File

@ -16,7 +16,13 @@ class Document()
#### Document.\_\_init\_\_
```python
def __init__(content: Union[str, pd.DataFrame], content_type: Literal["text", "table", "image", "audio"] = "text", id: Optional[str] = None, score: Optional[float] = None, meta: Dict[str, Any] = None, embedding: Optional[np.ndarray] = None, id_hash_keys: Optional[List[str]] = None)
def __init__(content: Union[str, pd.DataFrame],
content_type: Literal["text", "table", "image", "audio"] = "text",
id: Optional[str] = None,
score: Optional[float] = None,
meta: Dict[str, Any] = None,
embedding: Optional[np.ndarray] = None,
id_hash_keys: Optional[List[str]] = None)
```
One of the core data classes in Haystack. It's used to represent documents / passages in a standardized way within Haystack.
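For illustration, a short sketch of constructing text and table Documents (import paths assumed from Haystack v1):

```python
import pandas as pd
from haystack.schema import Document

text_doc = Document(
    content="Berlin is the capital of Germany.",
    meta={"source": "wiki", "page": 1},
)
table_doc = Document(
    content=pd.DataFrame({"country": ["Germany"], "capital": ["Berlin"]}),
    content_type="table",
)
# Ids are derived from the fields listed in id_hash_keys (the content by default)
print(text_doc.id, table_doc.content_type)
```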
@ -79,7 +85,10 @@ dict with content of the Document
```python
@classmethod
def from_dict(cls, dict: Dict[str, Any], field_map: Dict[str, Any] = {}, id_hash_keys: Optional[List[str]] = None) -> Document
def from_dict(cls,
dict: Dict[str, Any],
field_map: Dict[str, Any] = {},
id_hash_keys: Optional[List[str]] = None) -> Document
```
Create Document from dict. An optional field_map can be supplied to adjust for custom names of the keys in the
@ -229,7 +238,19 @@ class Label()
#### Label.\_\_init\_\_
```python
def __init__(query: str, document: Document, is_correct_answer: bool, is_correct_document: bool, origin: Literal["user-feedback", "gold-label"], answer: Optional[Answer], id: Optional[str] = None, no_answer: Optional[bool] = None, pipeline_id: Optional[str] = None, created_at: Optional[str] = None, updated_at: Optional[str] = None, meta: Optional[dict] = None, filters: Optional[dict] = None)
def __init__(query: str,
document: Document,
is_correct_answer: bool,
is_correct_document: bool,
origin: Literal["user-feedback", "gold-label"],
answer: Optional[Answer],
id: Optional[str] = None,
no_answer: Optional[bool] = None,
pipeline_id: Optional[str] = None,
created_at: Optional[str] = None,
updated_at: Optional[str] = None,
meta: Optional[dict] = None,
filters: Optional[dict] = None)
```
Object used to represent label/feedback in a standardized way within Haystack.
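A sketch of building a single gold label against the signature above; all field values are illustrative:

```python
from haystack.schema import Answer, Document, Label

label = Label(
    query="What is the capital of Germany?",
    document=Document(content="Berlin is the capital of Germany."),
    answer=Answer(answer="Berlin"),
    is_correct_answer=True,
    is_correct_document=True,
    origin="gold-label",
)
```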
@ -272,7 +293,10 @@ class MultiLabel()
#### MultiLabel.\_\_init\_\_
```python
def __init__(labels: List[Label], drop_negative_labels=False, drop_no_answers=False, **kwargs)
def __init__(labels: List[Label],
drop_negative_labels=False,
drop_no_answers=False,
**kwargs)
```
There are often multiple `Labels` associated with a single query. For example, there can be multiple annotated
@ -382,14 +406,17 @@ The DataFrames have the following schema:
#### EvaluationResult.calculate\_metrics
```python
def calculate_metrics(simulated_top_k_reader: int = -1, simulated_top_k_retriever: int = -1, document_scope: Literal[
"document_id",
"context",
"document_id_and_context",
"document_id_or_context",
"answer",
"document_id_or_answer",
] = "document_id_or_answer", eval_mode: Literal["integrated", "isolated"] = "integrated", answer_scope: Literal["any", "context", "document_id", "document_id_and_context"] = "any") -> Dict[str, Dict[str, float]]
def calculate_metrics(
simulated_top_k_reader: int = -1,
simulated_top_k_retriever: int = -1,
document_scope: Literal[
"document_id", "context", "document_id_and_context",
"document_id_or_context", "answer",
"document_id_or_answer", ] = "document_id_or_answer",
eval_mode: Literal["integrated", "isolated"] = "integrated",
answer_scope: Literal["any", "context", "document_id",
"document_id_and_context"] = "any"
) -> Dict[str, Dict[str, float]]
```
Calculates proper metrics for each node.
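A compact sketch of producing an EvaluationResult and reading metrics from it. The pipeline wiring below is an assumption for illustration (model name, node names "Retriever"/"Reader", and the tiny corpus are placeholders):

```python
from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import FARMReader, TfidfRetriever
from haystack.pipelines import ExtractiveQAPipeline
from haystack.schema import Answer, Document, Label, MultiLabel

store = InMemoryDocumentStore()
store.write_documents([Document(content="Berlin is the capital of Germany.")])
pipeline = ExtractiveQAPipeline(
    reader=FARMReader("deepset/minilm-uncased-squad2", use_gpu=False),
    retriever=TfidfRetriever(document_store=store),
)
labels = [MultiLabel(labels=[Label(
    query="What is the capital of Germany?",
    document=store.get_all_documents()[0],
    answer=Answer(answer="Berlin"),
    is_correct_answer=True,
    is_correct_document=True,
    origin="gold-label",
)])]
eval_result = pipeline.eval(labels=labels, params={"Retriever": {"top_k": 3}})

metrics = eval_result.calculate_metrics(document_scope="document_id_or_answer")
print(metrics["Reader"]["f1"], metrics["Retriever"]["recall_single_hit"])
# The worst performing queries per node can be inspected via wrong_examples:
print(eval_result.wrong_examples(node="Reader", n=3))
```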
@ -457,14 +484,23 @@ In Question Answering, to enforce that the retrieved document is considered corr
#### EvaluationResult.wrong\_examples
```python
def wrong_examples(node: str, n: int = 3, simulated_top_k_reader: int = -1, simulated_top_k_retriever: int = -1, document_scope: Literal[
"document_id",
"context",
"document_id_and_context",
"document_id_or_context",
"answer",
"document_id_or_answer",
] = "document_id_or_answer", document_metric: str = "recall_single_hit", answer_metric: str = "f1", document_metric_threshold: float = 0.5, answer_metric_threshold: float = 0.5, eval_mode: Literal["integrated", "isolated"] = "integrated", answer_scope: Literal["any", "context", "document_id", "document_id_and_context"] = "any") -> List[Dict]
def wrong_examples(
node: str,
n: int = 3,
simulated_top_k_reader: int = -1,
simulated_top_k_retriever: int = -1,
document_scope: Literal[
"document_id", "context", "document_id_and_context",
"document_id_or_context", "answer",
"document_id_or_answer", ] = "document_id_or_answer",
document_metric: str = "recall_single_hit",
answer_metric: str = "f1",
document_metric_threshold: float = 0.5,
answer_metric_threshold: float = 0.5,
eval_mode: Literal["integrated", "isolated"] = "integrated",
answer_scope: Literal["any", "context", "document_id",
"document_id_and_context"] = "any"
) -> List[Dict]
```
Returns the worst performing queries.

View File

@ -53,7 +53,17 @@ For example:
#### PseudoLabelGenerator.\_\_init\_\_
```python
def __init__(question_producer: Union[QuestionGenerator, List[Dict[str, str]]], retriever: BaseRetriever, cross_encoder_model_name_or_path: str = "cross-encoder/ms-marco-MiniLM-L-6-v2", max_questions_per_document: int = 3, top_k: int = 50, batch_size: int = 16, progress_bar: bool = True, use_auth_token: Optional[Union[str, bool]] = None, use_gpu: bool = True, devices: Optional[List[Union[str, torch.device]]] = None)
def __init__(question_producer: Union[QuestionGenerator, List[Dict[str, str]]],
retriever: BaseRetriever,
cross_encoder_model_name_or_path:
str = "cross-encoder/ms-marco-MiniLM-L-6-v2",
max_questions_per_document: int = 3,
top_k: int = 50,
batch_size: int = 16,
progress_bar: bool = True,
use_auth_token: Optional[Union[str, bool]] = None,
use_gpu: bool = True,
devices: Optional[List[Union[str, torch.device]]] = None)
```
Loads the cross-encoder model and prepares PseudoLabelGenerator.
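A GPL-style sketch based on the signatures above; the document store setup, the embedding model name, and the `"gpl_labels"` output key are assumptions:

```python
from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import EmbeddingRetriever, PseudoLabelGenerator, QuestionGenerator
from haystack.schema import Document

store = InMemoryDocumentStore(embedding_dim=768)
store.write_documents([Document(content="Python is a popular programming language.")])

retriever = EmbeddingRetriever(
    document_store=store,
    embedding_model="sentence-transformers/msmarco-distilbert-base-tas-b",
)
store.update_embeddings(retriever)

plg = PseudoLabelGenerator(QuestionGenerator(), retriever, max_questions_per_document=1)
# Returns (output_dict, output_edge); the labels land under "gpl_labels" (assumed key)
output, _ = plg.generate_pseudo_labels(documents=store.get_all_documents())
```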
@ -84,7 +94,9 @@ parameter is not used and a single cpu device is used for inference.
#### PseudoLabelGenerator.generate\_questions
```python
def generate_questions(documents: List[Document], batch_size: Optional[int] = None) -> List[Dict[str, str]]
def generate_questions(
documents: List[Document],
batch_size: Optional[int] = None) -> List[Dict[str, str]]
```
It takes a list of documents and generates a list of question-document pairs.
@ -103,7 +115,8 @@ A list of question-document pairs.
#### PseudoLabelGenerator.mine\_negatives
```python
def mine_negatives(question_doc_pairs: List[Dict[str, str]], batch_size: Optional[int] = None) -> List[Dict[str, str]]
def mine_negatives(question_doc_pairs: List[Dict[str, str]],
batch_size: Optional[int] = None) -> List[Dict[str, str]]
```
Given a list of question and positive document pairs, this function returns a list of question/positive document/negative document
@ -125,7 +138,8 @@ and negative document.
#### PseudoLabelGenerator.generate\_margin\_scores
```python
def generate_margin_scores(mined_negatives: List[Dict[str, str]], batch_size: Optional[int] = None) -> List[Dict]
def generate_margin_scores(mined_negatives: List[Dict[str, str]],
batch_size: Optional[int] = None) -> List[Dict]
```
Given a list of mined negatives, this function predicts the score margin between the positive and negative document using
@ -157,7 +171,9 @@ A list of dictionaries, each of which has the following keys:
#### PseudoLabelGenerator.generate\_pseudo\_labels
```python
def generate_pseudo_labels(documents: List[Document], batch_size: Optional[int] = None) -> Tuple[dict, str]
def generate_pseudo_labels(
documents: List[Document],
batch_size: Optional[int] = None) -> Tuple[dict, str]
```
Given a list of documents, this function generates a list of question-document pairs, mines for negatives, and

View File

@ -69,11 +69,17 @@ and the further processing can be customized. You can define this by connecting
#### SklearnQueryClassifier.\_\_init\_\_
```python
def __init__(model_name_or_path: Union[
str, Any
] = "https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier/model.pickle", vectorizer_name_or_path: Union[
str, Any
] = "https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier/vectorizer.pickle", batch_size: Optional[int] = None, progress_bar: bool = True)
def __init__(
model_name_or_path:
Union[
str,
Any] = "https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier/model.pickle",
vectorizer_name_or_path:
Union[
str,
Any] = "https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier/vectorizer.pickle",
batch_size: Optional[int] = None,
progress_bar: bool = True)
```
**Arguments**:
@ -144,7 +150,17 @@ This node also supports zero-shot-classification.
#### TransformersQueryClassifier.\_\_init\_\_
```python
def __init__(model_name_or_path: Union[Path, str] = "shahrukhx01/bert-mini-finetune-question-detection", model_version: Optional[str] = None, tokenizer: Optional[str] = None, use_gpu: bool = True, task: str = "text-classification", labels: List[str] = DEFAULT_LABELS, batch_size: int = 16, progress_bar: bool = True, use_auth_token: Optional[Union[str, bool]] = None, devices: Optional[List[Union[str, torch.device]]] = None)
def __init__(model_name_or_path: Union[
Path, str] = "shahrukhx01/bert-mini-finetune-question-detection",
model_version: Optional[str] = None,
tokenizer: Optional[str] = None,
use_gpu: bool = True,
task: str = "text-classification",
labels: List[str] = DEFAULT_LABELS,
batch_size: int = 16,
progress_bar: bool = True,
use_auth_token: Optional[Union[str, bool]] = None,
devices: Optional[List[Union[str, torch.device]]] = None)
```
**Arguments**:
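A minimal routing sketch with the default model; the edge semantics noted in the comment are an assumption:

```python
from haystack.nodes import TransformersQueryClassifier

classifier = TransformersQueryClassifier()  # shahrukhx01/bert-mini-finetune-question-detection
_, edge = classifier.run(query="What is the capital of Germany?")
# With the default model, natural-language questions typically route to "output_1"
# and keyword queries to "output_2".
print(edge)
```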

View File

@ -23,7 +23,23 @@ come from earlier in the document.
#### QuestionGenerator.\_\_init\_\_
```python
def __init__(model_name_or_path="valhalla/t5-base-e2e-qg", model_version=None, num_beams=4, max_length=256, no_repeat_ngram_size=3, length_penalty=1.5, early_stopping=True, split_length=50, split_overlap=10, use_gpu=True, prompt="generate questions:", num_queries_per_doc=1, sep_token: str = "<sep>", batch_size: int = 16, progress_bar: bool = True, use_auth_token: Optional[Union[str, bool]] = None, devices: Optional[List[Union[str, torch.device]]] = None)
def __init__(model_name_or_path="valhalla/t5-base-e2e-qg",
model_version=None,
num_beams=4,
max_length=256,
no_repeat_ngram_size=3,
length_penalty=1.5,
early_stopping=True,
split_length=50,
split_overlap=10,
use_gpu=True,
prompt="generate questions:",
num_queries_per_doc=1,
sep_token: str = "<sep>",
batch_size: int = 16,
progress_bar: bool = True,
use_auth_token: Optional[Union[str, bool]] = None,
devices: Optional[List[Union[str, torch.device]]] = None)
```
Uses the valhalla/t5-base-e2e-qg model by default. This class supports any question generation model that is
@ -55,7 +71,10 @@ parameter is not used and a single cpu device is used for inference.
#### QuestionGenerator.generate\_batch
```python
def generate_batch(texts: Union[List[str], List[List[str]]], batch_size: Optional[int] = None) -> Union[List[List[str]], List[List[List[str]]]]
def generate_batch(
texts: Union[List[str], List[List[str]]],
batch_size: Optional[int] = None
) -> Union[List[List[str]], List[List[List[str]]]]
```
Generates questions for a list of strings or a list of lists of strings.
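For example, a flat list of texts yields one list of generated questions per text:

```python
from haystack.nodes import QuestionGenerator

qg = QuestionGenerator()  # valhalla/t5-base-e2e-qg by default
questions = qg.generate_batch(
    texts=["Python is a programming language created by Guido van Rossum."]
)
print(questions[0])  # questions generated for the first (and only) text
```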

View File

@ -25,7 +25,12 @@ Wrapper method used to time functions.
#### BaseRanker.eval
```python
def eval(label_index: str = "label", doc_index: str = "eval_document", label_origin: str = "gold_label", top_k: int = 10, open_domain: bool = False, return_preds: bool = False) -> dict
def eval(label_index: str = "label",
doc_index: str = "eval_document",
label_origin: str = "gold_label",
top_k: int = 10,
open_domain: bool = False,
return_preds: bool = False) -> dict
```
Performs evaluation of the Ranker.
@ -94,7 +99,15 @@ Usage example:
#### SentenceTransformersRanker.\_\_init\_\_
```python
def __init__(model_name_or_path: Union[str, Path], model_version: Optional[str] = None, top_k: int = 10, use_gpu: bool = True, devices: Optional[List[Union[str, torch.device]]] = None, batch_size: int = 16, scale_score: bool = True, progress_bar: bool = True, use_auth_token: Optional[Union[str, bool]] = None)
def __init__(model_name_or_path: Union[str, Path],
model_version: Optional[str] = None,
top_k: int = 10,
use_gpu: bool = True,
devices: Optional[List[Union[str, torch.device]]] = None,
batch_size: int = 16,
scale_score: bool = True,
progress_bar: bool = True,
use_auth_token: Optional[Union[str, bool]] = None)
```
**Arguments**:
@ -125,7 +138,9 @@ parameter is not used and a single cpu device is used for inference.
#### SentenceTransformersRanker.predict
```python
def predict(query: str, documents: List[Document], top_k: Optional[int] = None) -> List[Document]
def predict(query: str,
documents: List[Document],
top_k: Optional[int] = None) -> List[Document]
```
Use loaded ranker model to re-rank the supplied list of Document.
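A small re-ranking sketch; the cross-encoder checkpoint is one commonly used with this class, not a requirement:

```python
from haystack.nodes import SentenceTransformersRanker
from haystack.schema import Document

ranker = SentenceTransformersRanker(model_name_or_path="cross-encoder/ms-marco-MiniLM-L-12-v2")
docs = [
    Document(content="The Eiffel Tower is in Paris."),
    Document(content="Berlin is the capital of Germany."),
]
reranked = ranker.predict(query="capital of Germany", documents=docs, top_k=2)
print([d.content for d in reranked])  # most relevant document first
```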
@ -147,7 +162,12 @@ List of Document
#### SentenceTransformersRanker.predict\_batch
```python
def predict_batch(queries: List[str], documents: Union[List[Document], List[List[Document]]], top_k: Optional[int] = None, batch_size: Optional[int] = None) -> Union[List[Document], List[List[Document]]]
def predict_batch(
queries: List[str],
documents: Union[List[Document], List[List[Document]]],
top_k: Optional[int] = None,
batch_size: Optional[int] = None
) -> Union[List[Document], List[List[Document]]]
```
Use loaded ranker model to re-rank the supplied lists of Documents.

View File

@ -45,7 +45,28 @@ While the underlying model can vary (BERT, Roberta, DistilBERT, ...), the interf
#### FARMReader.\_\_init\_\_
```python
def __init__(model_name_or_path: str, model_version: Optional[str] = None, context_window_size: int = 150, batch_size: int = 50, use_gpu: bool = True, devices: Optional[List[Union[str, torch.device]]] = None, no_ans_boost: float = 0.0, return_no_answer: bool = False, top_k: int = 10, top_k_per_candidate: int = 3, top_k_per_sample: int = 1, num_processes: Optional[int] = None, max_seq_len: int = 256, doc_stride: int = 128, progress_bar: bool = True, duplicate_filtering: int = 0, use_confidence_scores: bool = True, confidence_threshold: Optional[float] = None, proxies: Optional[Dict[str, str]] = None, local_files_only=False, force_download=False, use_auth_token: Optional[Union[str, bool]] = None)
def __init__(model_name_or_path: str,
model_version: Optional[str] = None,
context_window_size: int = 150,
batch_size: int = 50,
use_gpu: bool = True,
devices: Optional[List[Union[str, torch.device]]] = None,
no_ans_boost: float = 0.0,
return_no_answer: bool = False,
top_k: int = 10,
top_k_per_candidate: int = 3,
top_k_per_sample: int = 1,
num_processes: Optional[int] = None,
max_seq_len: int = 256,
doc_stride: int = 128,
progress_bar: bool = True,
duplicate_filtering: int = 0,
use_confidence_scores: bool = True,
confidence_threshold: Optional[float] = None,
proxies: Optional[Dict[str, str]] = None,
local_files_only=False,
force_download=False,
use_auth_token: Optional[Union[str, bool]] = None)
```
**Arguments**:
@ -113,7 +134,29 @@ https://huggingface.co/transformers/main_classes/model.html#transformers.PreTrai
#### FARMReader.train
```python
def train(data_dir: str, train_filename: str, dev_filename: Optional[str] = None, test_filename: Optional[str] = None, use_gpu: Optional[bool] = None, devices: List[torch.device] = [], batch_size: int = 10, n_epochs: int = 2, learning_rate: float = 1e-5, max_seq_len: Optional[int] = None, warmup_proportion: float = 0.2, dev_split: float = 0, evaluate_every: int = 300, save_dir: Optional[str] = None, num_processes: Optional[int] = None, use_amp: str = None, checkpoint_root_dir: Path = Path("model_checkpoints"), checkpoint_every: Optional[int] = None, checkpoints_to_keep: int = 3, caching: bool = False, cache_path: Path = Path("cache/data_silo"), grad_acc_steps: int = 1, early_stopping: Optional[EarlyStopping] = None)
def train(data_dir: str,
train_filename: str,
dev_filename: Optional[str] = None,
test_filename: Optional[str] = None,
use_gpu: Optional[bool] = None,
devices: List[torch.device] = [],
batch_size: int = 10,
n_epochs: int = 2,
learning_rate: float = 1e-5,
max_seq_len: Optional[int] = None,
warmup_proportion: float = 0.2,
dev_split: float = 0,
evaluate_every: int = 300,
save_dir: Optional[str] = None,
num_processes: Optional[int] = None,
use_amp: str = None,
checkpoint_root_dir: Path = Path("model_checkpoints"),
checkpoint_every: Optional[int] = None,
checkpoints_to_keep: int = 3,
caching: bool = False,
cache_path: Path = Path("cache/data_silo"),
grad_acc_steps: int = 1,
early_stopping: Optional[EarlyStopping] = None)
```
Fine-tune a model on a QA dataset. Options:
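A minimal fine-tuning sketch; the directory and file names are placeholders, and the data must be SQuAD-formatted JSON:

```python
from haystack.nodes import FARMReader

reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2")
reader.train(
    data_dir="data/squad",        # placeholder directory
    train_filename="train.json",  # placeholder SQuAD-format file
    dev_split=0.1,                # hold out 10% for evaluation during training
    n_epochs=1,
    batch_size=10,
    save_dir="my_model",
)
```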
@ -176,7 +219,36 @@ None
#### FARMReader.distil\_prediction\_layer\_from
```python
def distil_prediction_layer_from(teacher_model: "FARMReader", data_dir: str, train_filename: str, dev_filename: Optional[str] = None, test_filename: Optional[str] = None, use_gpu: Optional[bool] = None, devices: List[torch.device] = [], student_batch_size: int = 10, teacher_batch_size: Optional[int] = None, n_epochs: int = 2, learning_rate: float = 3e-5, max_seq_len: Optional[int] = None, warmup_proportion: float = 0.2, dev_split: float = 0, evaluate_every: int = 300, save_dir: Optional[str] = None, num_processes: Optional[int] = None, use_amp: str = None, checkpoint_root_dir: Path = Path("model_checkpoints"), checkpoint_every: Optional[int] = None, checkpoints_to_keep: int = 3, caching: bool = False, cache_path: Path = Path("cache/data_silo"), distillation_loss_weight: float = 0.5, distillation_loss: Union[str, Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = "kl_div", temperature: float = 1.0, grad_acc_steps: int = 1, early_stopping: Optional[EarlyStopping] = None)
def distil_prediction_layer_from(
teacher_model: "FARMReader",
data_dir: str,
train_filename: str,
dev_filename: Optional[str] = None,
test_filename: Optional[str] = None,
use_gpu: Optional[bool] = None,
devices: List[torch.device] = [],
student_batch_size: int = 10,
teacher_batch_size: Optional[int] = None,
n_epochs: int = 2,
learning_rate: float = 3e-5,
max_seq_len: Optional[int] = None,
warmup_proportion: float = 0.2,
dev_split: float = 0,
evaluate_every: int = 300,
save_dir: Optional[str] = None,
num_processes: Optional[int] = None,
use_amp: str = None,
checkpoint_root_dir: Path = Path("model_checkpoints"),
checkpoint_every: Optional[int] = None,
checkpoints_to_keep: int = 3,
caching: bool = False,
cache_path: Path = Path("cache/data_silo"),
distillation_loss_weight: float = 0.5,
distillation_loss: Union[str, Callable[[torch.Tensor, torch.Tensor],
torch.Tensor]] = "kl_div",
temperature: float = 1.0,
grad_acc_steps: int = 1,
early_stopping: Optional[EarlyStopping] = None)
```
Fine-tune a model on a QA dataset using logit-based distillation. You need to provide a teacher model that is already finetuned on the dataset
@ -258,7 +330,35 @@ None
#### FARMReader.distil\_intermediate\_layers\_from
```python
def distil_intermediate_layers_from(teacher_model: "FARMReader", data_dir: str, train_filename: str, dev_filename: Optional[str] = None, test_filename: Optional[str] = None, use_gpu: Optional[bool] = None, devices: List[torch.device] = [], batch_size: int = 10, n_epochs: int = 5, learning_rate: float = 5e-5, max_seq_len: Optional[int] = None, warmup_proportion: float = 0.2, dev_split: float = 0, evaluate_every: int = 300, save_dir: Optional[str] = None, num_processes: Optional[int] = None, use_amp: str = None, checkpoint_root_dir: Path = Path("model_checkpoints"), checkpoint_every: Optional[int] = None, checkpoints_to_keep: int = 3, caching: bool = False, cache_path: Path = Path("cache/data_silo"), distillation_loss: Union[str, Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = "mse", temperature: float = 1.0, processor: Optional[Processor] = None, grad_acc_steps: int = 1, early_stopping: Optional[EarlyStopping] = None)
def distil_intermediate_layers_from(
teacher_model: "FARMReader",
data_dir: str,
train_filename: str,
dev_filename: Optional[str] = None,
test_filename: Optional[str] = None,
use_gpu: Optional[bool] = None,
devices: List[torch.device] = [],
batch_size: int = 10,
n_epochs: int = 5,
learning_rate: float = 5e-5,
max_seq_len: Optional[int] = None,
warmup_proportion: float = 0.2,
dev_split: float = 0,
evaluate_every: int = 300,
save_dir: Optional[str] = None,
num_processes: Optional[int] = None,
use_amp: str = None,
checkpoint_root_dir: Path = Path("model_checkpoints"),
checkpoint_every: Optional[int] = None,
checkpoints_to_keep: int = 3,
caching: bool = False,
cache_path: Path = Path("cache/data_silo"),
distillation_loss: Union[str, Callable[[torch.Tensor, torch.Tensor],
torch.Tensor]] = "mse",
temperature: float = 1.0,
processor: Optional[Processor] = None,
grad_acc_steps: int = 1,
early_stopping: Optional[EarlyStopping] = None)
```
The first stage of distillation finetuning as described in the TinyBERT paper:
@ -332,7 +432,11 @@ None
#### FARMReader.update\_parameters
```python
def update_parameters(context_window_size: Optional[int] = None, no_ans_boost: Optional[float] = None, return_no_answer: Optional[bool] = None, max_seq_len: Optional[int] = None, doc_stride: Optional[int] = None)
def update_parameters(context_window_size: Optional[int] = None,
no_ans_boost: Optional[float] = None,
return_no_answer: Optional[bool] = None,
max_seq_len: Optional[int] = None,
doc_stride: Optional[int] = None)
```
Hot-update parameters of a loaded Reader. It may not be safe when processing concurrent requests.
@ -356,7 +460,9 @@ Saves the Reader model so that it can be reused at a later point in time.
#### FARMReader.save\_to\_remote
```python
def save_to_remote(repo_id: str, private: Optional[bool] = None, commit_message: str = "Add new model to Hugging Face.")
def save_to_remote(repo_id: str,
private: Optional[bool] = None,
commit_message: str = "Add new model to Hugging Face.")
```
Saves the Reader model to Hugging Face Model Hub with the given model_name. For this to work:
@ -375,7 +481,10 @@ Saves the Reader model to Hugging Face Model Hub with the given model_name. For
#### FARMReader.predict\_batch
```python
def predict_batch(queries: List[str], documents: Union[List[Document], List[List[Document]]], top_k: Optional[int] = None, batch_size: Optional[int] = None)
def predict_batch(queries: List[str],
documents: Union[List[Document], List[List[Document]]],
top_k: Optional[int] = None,
batch_size: Optional[int] = None)
```
Use loaded QA model to find answers for the queries in the Documents.
@ -405,7 +514,9 @@ Can be a single list of Documents or a list of lists of Documents.
#### FARMReader.predict
```python
def predict(query: str, documents: List[Document], top_k: Optional[int] = None)
def predict(query: str,
documents: List[Document],
top_k: Optional[int] = None)
```
Use loaded QA model to find answers for a query in the supplied list of Document.
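A short prediction sketch (model name and text are illustrative):

```python
from haystack.nodes import FARMReader
from haystack.schema import Document

reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=False)
prediction = reader.predict(
    query="Who created Python?",
    documents=[Document(content="Python was created by Guido van Rossum in the early 1990s.")],
    top_k=1,
)
print(prediction["answers"][0].answer)  # e.g. "Guido van Rossum"
```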
@ -442,7 +553,10 @@ Dict containing query and answers
#### FARMReader.eval\_on\_file
```python
def eval_on_file(data_dir: Union[Path, str], test_filename: str, device: Optional[Union[str, torch.device]] = None, calibrate_conf_scores: bool = False)
def eval_on_file(data_dir: Union[Path, str],
test_filename: str,
device: Optional[Union[str, torch.device]] = None,
calibrate_conf_scores: bool = False)
```
Performs evaluation on a SQuAD-formatted file.
@ -466,7 +580,12 @@ or use the Reader's device by default.
#### FARMReader.eval
```python
def eval(document_store: BaseDocumentStore, device: Optional[Union[str, torch.device]] = None, label_index: str = "label", doc_index: str = "eval_document", label_origin: str = "gold-label", calibrate_conf_scores: bool = False)
def eval(document_store: BaseDocumentStore,
device: Optional[Union[str, torch.device]] = None,
label_index: str = "label",
doc_index: str = "eval_document",
label_origin: str = "gold-label",
calibrate_conf_scores: bool = False)
```
Performs evaluation on evaluation documents in the DocumentStore.
@ -492,7 +611,12 @@ or use the Reader's device by default.
#### FARMReader.calibrate\_confidence\_scores
```python
def calibrate_confidence_scores(document_store: BaseDocumentStore, device: Optional[Union[str, torch.device]] = None, label_index: str = "label", doc_index: str = "eval_document", label_origin: str = "gold_label")
def calibrate_confidence_scores(document_store: BaseDocumentStore,
device: Optional[Union[str,
torch.device]] = None,
label_index: str = "label",
doc_index: str = "eval_document",
label_origin: str = "gold_label")
```
Calibrates confidence scores on evaluation documents in the DocumentStore.
@ -512,7 +636,9 @@ or use the Reader's device by default.
#### FARMReader.predict\_on\_texts
```python
def predict_on_texts(question: str, texts: List[str], top_k: Optional[int] = None)
def predict_on_texts(question: str,
texts: List[str],
top_k: Optional[int] = None)
```
Use loaded QA model to find answers for a question in the supplied list of Document.
@ -550,7 +676,13 @@ Dict containing question and answers
```python
@classmethod
def convert_to_onnx(cls, model_name: str, output_path: Path, convert_to_float16: bool = False, quantize: bool = False, task_type: str = "question_answering", opset_version: int = 11)
def convert_to_onnx(cls,
model_name: str,
output_path: Path,
convert_to_float16: bool = False,
quantize: bool = False,
task_type: str = "question_answering",
opset_version: int = 11)
```
Convert a PyTorch BERT model to ONNX format and write to ./onnx-export dir. The converted ONNX model
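A conversion sketch following the classmethod signature above; the model name is a common public QA checkpoint, used here as an assumption:

```python
from pathlib import Path
from haystack.nodes import FARMReader

FARMReader.convert_to_onnx(
    model_name="deepset/roberta-base-squad2",
    output_path=Path("onnx-export"),
)
# The exported directory can then be loaded like any other model path
onnx_reader = FARMReader(model_name_or_path="onnx-export")
```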
@ -598,7 +730,20 @@ With this reader, you can directly get predictions via predict()
#### TransformersReader.\_\_init\_\_
```python
def __init__(model_name_or_path: str = "distilbert-base-uncased-distilled-squad", model_version: Optional[str] = None, tokenizer: Optional[str] = None, context_window_size: int = 70, use_gpu: bool = True, top_k: int = 10, top_k_per_candidate: int = 3, return_no_answers: bool = False, max_seq_len: int = 256, doc_stride: int = 128, batch_size: int = 16, use_auth_token: Optional[Union[str, bool]] = None, devices: Optional[List[Union[str, torch.device]]] = None)
def __init__(
model_name_or_path: str = "distilbert-base-uncased-distilled-squad",
model_version: Optional[str] = None,
tokenizer: Optional[str] = None,
context_window_size: int = 70,
use_gpu: bool = True,
top_k: int = 10,
top_k_per_candidate: int = 3,
return_no_answers: bool = False,
max_seq_len: int = 256,
doc_stride: int = 128,
batch_size: int = 16,
use_auth_token: Optional[Union[str, bool]] = None,
devices: Optional[List[Union[str, torch.device]]] = None)
```
Load a QA model from Transformers.
@ -647,7 +792,9 @@ parameter is not used and a single cpu device is used for inference.
#### TransformersReader.predict
```python
def predict(query: str, documents: List[Document], top_k: Optional[int] = None)
def predict(query: str,
documents: List[Document],
top_k: Optional[int] = None)
```
Use loaded QA model to find answers for a query in the supplied list of Document.
@ -685,7 +832,10 @@ Dict containing query and answers
#### TransformersReader.predict\_batch
```python
def predict_batch(queries: List[str], documents: Union[List[Document], List[List[Document]]], top_k: Optional[int] = None, batch_size: Optional[int] = None)
def predict_batch(queries: List[str],
documents: Union[List[Document], List[List[Document]]],
top_k: Optional[int] = None,
batch_size: Optional[int] = None)
```
Use loaded QA model to find answers for the queries in the Documents.
@ -752,7 +902,16 @@ answer = prediction["answers"][0].answer # "10 june 1996"
#### TableReader.\_\_init\_\_
```python
def __init__(model_name_or_path: str = "google/tapas-base-finetuned-wtq", model_version: Optional[str] = None, tokenizer: Optional[str] = None, use_gpu: bool = True, top_k: int = 10, top_k_per_candidate: int = 3, return_no_answer: bool = False, max_seq_len: int = 256, use_auth_token: Optional[Union[str, bool]] = None, devices: Optional[List[Union[str, torch.device]]] = None)
def __init__(model_name_or_path: str = "google/tapas-base-finetuned-wtq",
model_version: Optional[str] = None,
tokenizer: Optional[str] = None,
use_gpu: bool = True,
top_k: int = 10,
top_k_per_candidate: int = 3,
return_no_answer: bool = False,
max_seq_len: int = 256,
use_auth_token: Optional[Union[str, bool]] = None,
devices: Optional[List[Union[str, torch.device]]] = None)
```
Load a TableQA model from Transformers.
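A table QA sketch mirroring the usage example referenced above; the table values are illustrative:

```python
import pandas as pd
from haystack.nodes import TableReader
from haystack.schema import Document

table = pd.DataFrame({
    "actor": ["Brad Pitt", "Leonardo DiCaprio"],
    "date of birth": ["18 december 1963", "11 november 1974"],
})
reader = TableReader(model_name_or_path="google/tapas-base-finetuned-wtq")
prediction = reader.predict(
    query="When was Brad Pitt born?",
    documents=[Document(content=table, content_type="table")],
)
print(prediction["answers"][0].answer)
```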
@ -803,7 +962,9 @@ parameter is not used and a single cpu device is used for inference.
#### TableReader.predict
```python
def predict(query: str, documents: List[Document], top_k: Optional[int] = None) -> Dict
def predict(query: str,
documents: List[Document],
top_k: Optional[int] = None) -> Dict
```
Use loaded TableQA model to find answers for a query in the supplied list of Documents
@ -830,7 +991,10 @@ Dict containing query and answers
#### TableReader.predict\_batch
```python
def predict_batch(queries: List[str], documents: Union[List[Document], List[List[Document]]], top_k: Optional[int] = None, batch_size: Optional[int] = None)
def predict_batch(queries: List[str],
documents: Union[List[Document], List[List[Document]]],
top_k: Optional[int] = None,
batch_size: Optional[int] = None)
```
Use loaded TableQA model to find answers for the supplied queries in the supplied Documents
@ -890,7 +1054,18 @@ Pros and Cons of RCIReader compared to TableReader:
#### RCIReader.\_\_init\_\_
```python
def __init__(row_model_name_or_path: str = "michaelrglass/albert-base-rci-wikisql-row", column_model_name_or_path: str = "michaelrglass/albert-base-rci-wikisql-col", row_model_version: Optional[str] = None, column_model_version: Optional[str] = None, row_tokenizer: Optional[str] = None, column_tokenizer: Optional[str] = None, use_gpu: bool = True, top_k: int = 10, max_seq_len: int = 256, use_auth_token: Optional[Union[str, bool]] = None)
def __init__(row_model_name_or_path:
str = "michaelrglass/albert-base-rci-wikisql-row",
column_model_name_or_path:
str = "michaelrglass/albert-base-rci-wikisql-col",
row_model_version: Optional[str] = None,
column_model_version: Optional[str] = None,
row_tokenizer: Optional[str] = None,
column_tokenizer: Optional[str] = None,
use_gpu: bool = True,
top_k: int = 10,
max_seq_len: int = 256,
use_auth_token: Optional[Union[str, bool]] = None)
```
Load an RCI model from Transformers.
@ -926,7 +1101,9 @@ https://huggingface.co/transformers/main_classes/model.html#transformers.PreTrai
#### RCIReader.predict
```python
def predict(query: str, documents: List[Document], top_k: Optional[int] = None) -> Dict
def predict(query: str,
documents: List[Document],
top_k: Optional[int] = None) -> Dict
```
Use loaded RCI models to find answers for a query in the supplied list of Documents

View File

@ -28,7 +28,13 @@ Base class for regular retrievers.
```python
@abstractmethod
def retrieve(query: str, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: Optional[int] = None, index: str = None, headers: Optional[Dict[str, str]] = None, scale_score: bool = None) -> List[Document]
def retrieve(query: str,
filters: Optional[Dict[str, Union[Dict, List, str, int, float,
bool]]] = None,
top_k: Optional[int] = None,
index: str = None,
headers: Optional[Dict[str, str]] = None,
scale_score: bool = None) -> List[Document]
```
Scan through documents in DocumentStore and return a small number of documents
@ -61,7 +67,13 @@ Wrapper method used to time functions.
#### BaseRetriever.eval
```python
def eval(label_index: str = "label", doc_index: str = "eval_document", label_origin: str = "gold-label", top_k: int = 10, open_domain: bool = False, return_preds: bool = False, headers: Optional[Dict[str, str]] = None) -> dict
def eval(label_index: str = "label",
doc_index: str = "eval_document",
label_origin: str = "gold-label",
top_k: int = 10,
open_domain: bool = False,
return_preds: bool = False,
headers: Optional[Dict[str, str]] = None) -> dict
```
Performs evaluation on the Retriever.
@ -110,7 +122,11 @@ class BM25Retriever(BaseRetriever)
#### BM25Retriever.\_\_init\_\_
```python
def __init__(document_store: KeywordDocumentStore, top_k: int = 10, all_terms_must_match: bool = False, custom_query: Optional[str] = None, scale_score: bool = True)
def __init__(document_store: KeywordDocumentStore,
top_k: int = 10,
all_terms_must_match: bool = False,
custom_query: Optional[str] = None,
scale_score: bool = True)
```
**Arguments**:
@ -194,7 +210,13 @@ Otherwise raw similarity scores (e.g. cosine or dot_product) will be used.
#### BM25Retriever.retrieve
```python
def retrieve(query: str, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: Optional[int] = None, index: str = None, headers: Optional[Dict[str, str]] = None, scale_score: bool = None) -> List[Document]
def retrieve(query: str,
filters: Optional[Dict[str, Union[Dict, List, str, int, float,
bool]]] = None,
top_k: Optional[int] = None,
index: str = None,
headers: Optional[Dict[str, str]] = None,
scale_score: bool = None) -> List[Document]
```
Scan through documents in DocumentStore and return a small number of documents
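A retrieval sketch; it assumes an Elasticsearch instance on localhost:9200, as started by the container commands earlier in this diff:

```python
from haystack.document_stores import ElasticsearchDocumentStore
from haystack.nodes import BM25Retriever
from haystack.schema import Document

store = ElasticsearchDocumentStore(host="localhost", index="documents")
store.write_documents([Document(content="Berlin is the capital of Germany.")])

retriever = BM25Retriever(document_store=store, top_k=10)
docs = retriever.retrieve(query="capital of Germany", top_k=3)
print([d.content for d in docs])
```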
@ -280,12 +302,18 @@ Otherwise raw similarity scores (e.g. cosine or dot_product) will be used.
#### BM25Retriever.retrieve\_batch
```python
def retrieve_batch(queries: List[str], filters: Optional[
Union[
Dict[str, Union[Dict, List, str, int, float, bool]],
List[Dict[str, Union[Dict, List, str, int, float, bool]]],
]
] = None, top_k: Optional[int] = None, index: str = None, headers: Optional[Dict[str, str]] = None, batch_size: Optional[int] = None, scale_score: bool = None) -> List[List[Document]]
def retrieve_batch(queries: List[str],
filters: Optional[Union[Dict[str, Union[Dict, List, str,
int, float, bool]],
List[Dict[str,
Union[Dict, List, str,
int, float,
bool]]], ]] = None,
top_k: Optional[int] = None,
index: str = None,
headers: Optional[Dict[str, str]] = None,
batch_size: Optional[int] = None,
scale_score: bool = None) -> List[List[Document]]
```
Scan through documents in DocumentStore and return a small number of documents
@ -386,7 +414,12 @@ Helpful for benchmarking, testing and if you want to do QA on small documents wi
#### FilterRetriever.retrieve
```python
def retrieve(query: str, filters: dict = None, top_k: Optional[int] = None, index: str = None, headers: Optional[Dict[str, str]] = None, scale_score: bool = None) -> List[Document]
def retrieve(query: str,
filters: dict = None,
top_k: Optional[int] = None,
index: str = None,
headers: Optional[Dict[str, str]] = None,
scale_score: bool = None) -> List[Document]
```
Scan through documents in DocumentStore and return a small number of documents
@ -425,7 +458,9 @@ It uses sklearn's TfidfVectorizer to compute a tf-idf matrix.
#### TfidfRetriever.\_\_init\_\_
```python
def __init__(document_store: BaseDocumentStore, top_k: int = 10, auto_fit=True)
def __init__(document_store: BaseDocumentStore,
top_k: int = 10,
auto_fit=True)
```
**Arguments**:
@ -439,12 +474,16 @@ def __init__(document_store: BaseDocumentStore, top_k: int = 10, auto_fit=True)
#### TfidfRetriever.retrieve
```python
def retrieve(query: str, filters: Optional[
Union[
Dict[str, Union[Dict, List, str, int, float, bool]],
List[Dict[str, Union[Dict, List, str, int, float, bool]]],
]
] = None, top_k: Optional[int] = None, index: str = None, headers: Optional[Dict[str, str]] = None, scale_score: bool = None) -> List[Document]
def retrieve(query: str,
filters: Optional[Union[Dict[str, Union[Dict, List, str, int,
float, bool]],
List[Dict[str,
Union[Dict, List, str, int,
float, bool]]], ]] = None,
top_k: Optional[int] = None,
index: str = None,
headers: Optional[Dict[str, str]] = None,
scale_score: bool = None) -> List[Document]
```
Scan through documents in DocumentStore and return a small number of documents
@ -466,7 +505,14 @@ Otherwise raw similarity scores (e.g. cosine or dot_product) will be used.
#### TfidfRetriever.retrieve\_batch
```python
def retrieve_batch(queries: Union[str, List[str]], filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: Optional[int] = None, index: str = None, headers: Optional[Dict[str, str]] = None, batch_size: Optional[int] = None, scale_score: bool = None) -> List[List[Document]]
def retrieve_batch(queries: Union[str, List[str]],
filters: Optional[Dict[str, Union[Dict, List, str, int,
float, bool]]] = None,
top_k: Optional[int] = None,
index: str = None,
headers: Optional[Dict[str, str]] = None,
batch_size: Optional[int] = None,
scale_score: bool = None) -> List[List[Document]]
```
Scan through documents in DocumentStore and return a small number of documents
@ -519,7 +565,25 @@ Karpukhin, Vladimir, et al. (2020): "Dense Passage Retrieval for Open-Domain Que
#### DensePassageRetriever.\_\_init\_\_
```python
def __init__(document_store: BaseDocumentStore, query_embedding_model: Union[Path, str] = "facebook/dpr-question_encoder-single-nq-base", passage_embedding_model: Union[Path, str] = "facebook/dpr-ctx_encoder-single-nq-base", model_version: Optional[str] = None, max_seq_len_query: int = 64, max_seq_len_passage: int = 256, top_k: int = 10, use_gpu: bool = True, batch_size: int = 16, embed_title: bool = True, use_fast_tokenizers: bool = True, similarity_function: str = "dot_product", global_loss_buffer_size: int = 150000, progress_bar: bool = True, devices: Optional[List[Union[str, torch.device]]] = None, use_auth_token: Optional[Union[str, bool]] = None, scale_score: bool = True)
def __init__(document_store: BaseDocumentStore,
query_embedding_model: Union[
Path, str] = "facebook/dpr-question_encoder-single-nq-base",
passage_embedding_model: Union[
Path, str] = "facebook/dpr-ctx_encoder-single-nq-base",
model_version: Optional[str] = None,
max_seq_len_query: int = 64,
max_seq_len_passage: int = 256,
top_k: int = 10,
use_gpu: bool = True,
batch_size: int = 16,
embed_title: bool = True,
use_fast_tokenizers: bool = True,
similarity_function: str = "dot_product",
global_loss_buffer_size: int = 150000,
progress_bar: bool = True,
devices: Optional[List[Union[str, torch.device]]] = None,
use_auth_token: Optional[Union[str, bool]] = None,
scale_score: bool = True)
```
Init the Retriever, including the two encoder models, from a local or remote model checkpoint.
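An end-to-end sketch with the default encoder checkpoints; the choice of InMemoryDocumentStore and the tiny corpus are assumptions:

```python
from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import DensePassageRetriever
from haystack.schema import Document

store = InMemoryDocumentStore()  # default dot_product similarity matches DPR
store.write_documents([Document(content="Berlin is the capital of Germany.")])

retriever = DensePassageRetriever(
    document_store=store,
    query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
    passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
    use_gpu=False,
)
store.update_embeddings(retriever)
docs = retriever.retrieve(query="What is the capital of Germany?", top_k=3)
```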
@ -587,7 +651,13 @@ Otherwise raw similarity scores (e.g. cosine or dot_product) will be used.
#### DensePassageRetriever.retrieve
```python
def retrieve(query: str, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: Optional[int] = None, index: str = None, headers: Optional[Dict[str, str]] = None, scale_score: bool = None) -> List[Document]
def retrieve(query: str,
filters: Optional[Dict[str, Union[Dict, List, str, int, float,
bool]]] = None,
top_k: Optional[int] = None,
index: str = None,
headers: Optional[Dict[str, str]] = None,
scale_score: bool = None) -> List[Document]
```
Scan through documents in DocumentStore and return a small number of documents
@ -671,12 +741,18 @@ Otherwise raw similarity scores (e.g. cosine or dot_product) will be used.
#### DensePassageRetriever.retrieve\_batch
```python
def retrieve_batch(queries: List[str], filters: Optional[
Union[
Dict[str, Union[Dict, List, str, int, float, bool]],
List[Dict[str, Union[Dict, List, str, int, float, bool]]],
]
] = None, top_k: Optional[int] = None, index: str = None, headers: Optional[Dict[str, str]] = None, batch_size: Optional[int] = None, scale_score: bool = None) -> List[List[Document]]
def retrieve_batch(queries: List[str],
filters: Optional[Union[Dict[str, Union[Dict, List, str,
int, float, bool]],
List[Dict[str,
Union[Dict, List, str,
int, float,
bool]]], ]] = None,
top_k: Optional[int] = None,
index: str = None,
headers: Optional[Dict[str, str]] = None,
batch_size: Optional[int] = None,
scale_score: bool = None) -> List[List[Document]]
```
Scan through documents in DocumentStore and return a small number of documents
@ -802,7 +878,36 @@ Embeddings of documents / passages shape (batch_size, embedding_dim)
#### DensePassageRetriever.train
```python
def train(data_dir: str, train_filename: str, dev_filename: str = None, test_filename: str = None, max_samples: int = None, max_processes: int = 128, multiprocessing_strategy: Optional[str] = None, dev_split: float = 0, batch_size: int = 2, embed_title: bool = True, num_hard_negatives: int = 1, num_positives: int = 1, n_epochs: int = 3, evaluate_every: int = 1000, n_gpu: int = 1, learning_rate: float = 1e-5, epsilon: float = 1e-08, weight_decay: float = 0.0, num_warmup_steps: int = 100, grad_acc_steps: int = 1, use_amp: str = None, optimizer_name: str = "AdamW", optimizer_correct_bias: bool = True, save_dir: str = "../saved_models/dpr", query_encoder_save_dir: str = "query_encoder", passage_encoder_save_dir: str = "passage_encoder", checkpoint_root_dir: Path = Path("model_checkpoints"), checkpoint_every: Optional[int] = None, checkpoints_to_keep: int = 3, early_stopping: Optional[EarlyStopping] = None)
def train(data_dir: str,
train_filename: str,
dev_filename: str = None,
test_filename: str = None,
max_samples: int = None,
max_processes: int = 128,
multiprocessing_strategy: Optional[str] = None,
dev_split: float = 0,
batch_size: int = 2,
embed_title: bool = True,
num_hard_negatives: int = 1,
num_positives: int = 1,
n_epochs: int = 3,
evaluate_every: int = 1000,
n_gpu: int = 1,
learning_rate: float = 1e-5,
epsilon: float = 1e-08,
weight_decay: float = 0.0,
num_warmup_steps: int = 100,
grad_acc_steps: int = 1,
use_amp: str = None,
optimizer_name: str = "AdamW",
optimizer_correct_bias: bool = True,
save_dir: str = "../saved_models/dpr",
query_encoder_save_dir: str = "query_encoder",
passage_encoder_save_dir: str = "passage_encoder",
checkpoint_root_dir: Path = Path("model_checkpoints"),
checkpoint_every: Optional[int] = None,
checkpoints_to_keep: int = 3,
early_stopping: Optional[EarlyStopping] = None)
```
Train a DensePassageRetriever model.
@ -856,7 +961,9 @@ If any checkpoints are stored, a subsequent run of train() will resume training
#### DensePassageRetriever.save
```python
def save(save_dir: Union[Path, str], query_encoder_dir: str = "query_encoder", passage_encoder_dir: str = "passage_encoder")
def save(save_dir: Union[Path, str],
query_encoder_dir: str = "query_encoder",
passage_encoder_dir: str = "passage_encoder")
```
Save DensePassageRetriever to the specified directory.
@ -877,7 +984,18 @@ None
```python
@classmethod
def load(cls, load_dir: Union[Path, str], document_store: BaseDocumentStore, max_seq_len_query: int = 64, max_seq_len_passage: int = 256, use_gpu: bool = True, batch_size: int = 16, embed_title: bool = True, use_fast_tokenizers: bool = True, similarity_function: str = "dot_product", query_encoder_dir: str = "query_encoder", passage_encoder_dir: str = "passage_encoder")
def load(cls,
load_dir: Union[Path, str],
document_store: BaseDocumentStore,
max_seq_len_query: int = 64,
max_seq_len_passage: int = 256,
use_gpu: bool = True,
batch_size: int = 16,
embed_title: bool = True,
use_fast_tokenizers: bool = True,
similarity_function: str = "dot_product",
query_encoder_dir: str = "query_encoder",
passage_encoder_dir: str = "passage_encoder")
```
Load DensePassageRetriever from the specified directory.
@ -901,7 +1019,30 @@ Kostić, Bogdan, et al. (2021): "Multi-modal Retrieval of Tables and Texts Using
#### TableTextRetriever.\_\_init\_\_
```python
def __init__(document_store: BaseDocumentStore, query_embedding_model: Union[Path, str] = "deepset/bert-small-mm_retrieval-question_encoder", passage_embedding_model: Union[Path, str] = "deepset/bert-small-mm_retrieval-passage_encoder", table_embedding_model: Union[Path, str] = "deepset/bert-small-mm_retrieval-table_encoder", model_version: Optional[str] = None, max_seq_len_query: int = 64, max_seq_len_passage: int = 256, max_seq_len_table: int = 256, top_k: int = 10, use_gpu: bool = True, batch_size: int = 16, embed_meta_fields: List[str] = ["name", "section_title", "caption"], use_fast_tokenizers: bool = True, similarity_function: str = "dot_product", global_loss_buffer_size: int = 150000, progress_bar: bool = True, devices: Optional[List[Union[str, torch.device]]] = None, use_auth_token: Optional[Union[str, bool]] = None, scale_score: bool = True, use_fast: bool = True)
def __init__(
document_store: BaseDocumentStore,
query_embedding_model: Union[
Path, str] = "deepset/bert-small-mm_retrieval-question_encoder",
passage_embedding_model: Union[
Path, str] = "deepset/bert-small-mm_retrieval-passage_encoder",
table_embedding_model: Union[
Path, str] = "deepset/bert-small-mm_retrieval-table_encoder",
model_version: Optional[str] = None,
max_seq_len_query: int = 64,
max_seq_len_passage: int = 256,
max_seq_len_table: int = 256,
top_k: int = 10,
use_gpu: bool = True,
batch_size: int = 16,
embed_meta_fields: List[str] = ["name", "section_title", "caption"],
use_fast_tokenizers: bool = True,
similarity_function: str = "dot_product",
global_loss_buffer_size: int = 150000,
progress_bar: bool = True,
devices: Optional[List[Union[str, torch.device]]] = None,
use_auth_token: Optional[Union[str, bool]] = None,
scale_score: bool = True,
use_fast: bool = True)
```
Init the Retriever, including the three encoder models (query, passage, and table), from a local or remote model checkpoint.
@ -956,12 +1097,18 @@ Otherwise raw similarity scores (e.g. cosine or dot_product) will be used.
#### TableTextRetriever.retrieve\_batch
```python
def retrieve_batch(queries: List[str], filters: Optional[
Union[
Dict[str, Union[Dict, List, str, int, float, bool]],
List[Dict[str, Union[Dict, List, str, int, float, bool]]],
]
] = None, top_k: Optional[int] = None, index: str = None, headers: Optional[Dict[str, str]] = None, batch_size: Optional[int] = None, scale_score: bool = None) -> List[List[Document]]
def retrieve_batch(queries: List[str],
filters: Optional[Union[Dict[str, Union[Dict, List, str,
int, float, bool]],
List[Dict[str,
Union[Dict, List, str,
int, float,
bool]]], ]] = None,
top_k: Optional[int] = None,
index: str = None,
headers: Optional[Dict[str, str]] = None,
batch_size: Optional[int] = None,
scale_score: bool = None) -> List[List[Document]]
```
Scan through documents in DocumentStore and return a small number of documents
@ -1090,7 +1237,38 @@ Embeddings of documents / passages. Shape: (batch_size, embedding_dim)
#### TableTextRetriever.train
```python
def train(data_dir: str, train_filename: str, dev_filename: str = None, test_filename: str = None, max_samples: int = None, max_processes: int = 128, dev_split: float = 0, batch_size: int = 2, embed_meta_fields: List[str] = ["page_title", "section_title", "caption"], num_hard_negatives: int = 1, num_positives: int = 1, n_epochs: int = 3, evaluate_every: int = 1000, n_gpu: int = 1, learning_rate: float = 1e-5, epsilon: float = 1e-08, weight_decay: float = 0.0, num_warmup_steps: int = 100, grad_acc_steps: int = 1, use_amp: str = None, optimizer_name: str = "AdamW", optimizer_correct_bias: bool = True, save_dir: str = "../saved_models/mm_retrieval", query_encoder_save_dir: str = "query_encoder", passage_encoder_save_dir: str = "passage_encoder", table_encoder_save_dir: str = "table_encoder", checkpoint_root_dir: Path = Path("model_checkpoints"), checkpoint_every: Optional[int] = None, checkpoints_to_keep: int = 3, early_stopping: Optional[EarlyStopping] = None)
def train(data_dir: str,
train_filename: str,
dev_filename: str = None,
test_filename: str = None,
max_samples: int = None,
max_processes: int = 128,
dev_split: float = 0,
batch_size: int = 2,
embed_meta_fields: List[str] = [
"page_title", "section_title", "caption"
],
num_hard_negatives: int = 1,
num_positives: int = 1,
n_epochs: int = 3,
evaluate_every: int = 1000,
n_gpu: int = 1,
learning_rate: float = 1e-5,
epsilon: float = 1e-08,
weight_decay: float = 0.0,
num_warmup_steps: int = 100,
grad_acc_steps: int = 1,
use_amp: str = None,
optimizer_name: str = "AdamW",
optimizer_correct_bias: bool = True,
save_dir: str = "../saved_models/mm_retrieval",
query_encoder_save_dir: str = "query_encoder",
passage_encoder_save_dir: str = "passage_encoder",
table_encoder_save_dir: str = "table_encoder",
checkpoint_root_dir: Path = Path("model_checkpoints"),
checkpoint_every: Optional[int] = None,
checkpoints_to_keep: int = 3,
early_stopping: Optional[EarlyStopping] = None)
```
Train a TableTextRetriever model.
@ -1144,7 +1322,10 @@ checkpoint, a subdirectory with the name epoch_{epoch_num}_step_{step_num} is cr
#### TableTextRetriever.save
```python
def save(save_dir: Union[Path, str], query_encoder_dir: str = "query_encoder", passage_encoder_dir: str = "passage_encoder", table_encoder_dir: str = "table_encoder")
def save(save_dir: Union[Path, str],
query_encoder_dir: str = "query_encoder",
passage_encoder_dir: str = "passage_encoder",
table_encoder_dir: str = "table_encoder")
```
Save TableTextRetriever to the specified directory.
@ -1166,7 +1347,20 @@ None
```python
@classmethod
def load(cls, load_dir: Union[Path, str], document_store: BaseDocumentStore, max_seq_len_query: int = 64, max_seq_len_passage: int = 256, max_seq_len_table: int = 256, use_gpu: bool = True, batch_size: int = 16, embed_meta_fields: List[str] = ["name", "section_title", "caption"], use_fast_tokenizers: bool = True, similarity_function: str = "dot_product", query_encoder_dir: str = "query_encoder", passage_encoder_dir: str = "passage_encoder", table_encoder_dir: str = "table_encoder")
def load(cls,
load_dir: Union[Path, str],
document_store: BaseDocumentStore,
max_seq_len_query: int = 64,
max_seq_len_passage: int = 256,
max_seq_len_table: int = 256,
use_gpu: bool = True,
batch_size: int = 16,
embed_meta_fields: List[str] = ["name", "section_title", "caption"],
use_fast_tokenizers: bool = True,
similarity_function: str = "dot_product",
query_encoder_dir: str = "query_encoder",
passage_encoder_dir: str = "passage_encoder",
table_encoder_dir: str = "table_encoder")
```
Load TableTextRetriever from the specified directory.
@ -1184,7 +1378,21 @@ class EmbeddingRetriever(BaseRetriever)
#### EmbeddingRetriever.\_\_init\_\_
```python
def __init__(document_store: BaseDocumentStore, embedding_model: str, model_version: Optional[str] = None, use_gpu: bool = True, batch_size: int = 32, max_seq_len: int = 512, model_format: Optional[str] = None, pooling_strategy: str = "reduce_mean", emb_extraction_layer: int = -1, top_k: int = 10, progress_bar: bool = True, devices: Optional[List[Union[str, torch.device]]] = None, use_auth_token: Optional[Union[str, bool]] = None, scale_score: bool = True, embed_meta_fields: List[str] = [])
def __init__(document_store: BaseDocumentStore,
embedding_model: str,
model_version: Optional[str] = None,
use_gpu: bool = True,
batch_size: int = 32,
max_seq_len: int = 512,
model_format: Optional[str] = None,
pooling_strategy: str = "reduce_mean",
emb_extraction_layer: int = -1,
top_k: int = 10,
progress_bar: bool = True,
devices: Optional[List[Union[str, torch.device]]] = None,
use_auth_token: Optional[Union[str, bool]] = None,
scale_score: bool = True,
embed_meta_fields: List[str] = [])
```
**Arguments**:
@ -1239,7 +1447,13 @@ performance if your titles contain meaningful information for retrieval
#### EmbeddingRetriever.retrieve
```python
def retrieve(query: str, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: Optional[int] = None, index: str = None, headers: Optional[Dict[str, str]] = None, scale_score: bool = None) -> List[Document]
def retrieve(query: str,
filters: Optional[Dict[str, Union[Dict, List, str, int, float,
bool]]] = None,
top_k: Optional[int] = None,
index: str = None,
headers: Optional[Dict[str, str]] = None,
scale_score: bool = None) -> List[Document]
```
Scan through documents in DocumentStore and return a small number of documents
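A sentence-transformers sketch; the model name and store settings are assumptions, with embedding_dim=384 chosen to match the model:

```python
from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import EmbeddingRetriever
from haystack.schema import Document

store = InMemoryDocumentStore(similarity="cosine", embedding_dim=384)
store.write_documents([Document(content="Berlin is the capital of Germany.")])

retriever = EmbeddingRetriever(
    document_store=store,
    embedding_model="sentence-transformers/all-MiniLM-L6-v2",  # 384-dim embeddings
)
store.update_embeddings(retriever)
docs = retriever.retrieve(query="German capital", top_k=3)
```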
@ -1323,12 +1537,18 @@ Otherwise raw similarity scores (e.g. cosine or dot_product) will be used.
#### EmbeddingRetriever.retrieve\_batch
```python
def retrieve_batch(queries: List[str], filters: Optional[
Union[
Dict[str, Union[Dict, List, str, int, float, bool]],
List[Dict[str, Union[Dict, List, str, int, float, bool]]],
]
] = None, top_k: Optional[int] = None, index: str = None, headers: Optional[Dict[str, str]] = None, batch_size: Optional[int] = None, scale_score: bool = None) -> List[List[Document]]
def retrieve_batch(queries: List[str],
filters: Optional[Union[Dict[str, Union[Dict, List, str,
int, float, bool]],
List[Dict[str,
Union[Dict, List, str,
int, float,
bool]]], ]] = None,
top_k: Optional[int] = None,
index: str = None,
headers: Optional[Dict[str, str]] = None,
batch_size: Optional[int] = None,
scale_score: bool = None) -> List[List[Document]]
```
Scan through documents in DocumentStore and return a small number of documents
@ -1454,7 +1674,11 @@ Embeddings, one per input document
#### EmbeddingRetriever.train
```python
def train(training_data: List[Dict[str, Any]], learning_rate: float = 2e-5, n_epochs: int = 1, num_warmup_steps: int = None, batch_size: int = 16) -> None
def train(training_data: List[Dict[str, Any]],
learning_rate: float = 2e-5,
n_epochs: int = 1,
num_warmup_steps: int = None,
batch_size: int = 16) -> None
```
Trains/adapts the underlying embedding model.
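A training sketch reusing the sentence-transformers `retriever` from the sketch above; the dict keys follow the GPL pseudo-label format produced by PseudoLabelGenerator and should be treated as an assumption:

```python
training_data = [{
    "question": "What is the capital of Germany?",
    "pos_doc": "Berlin is the capital of Germany.",
    "neg_doc": "Paris is the capital of France.",
    "score": 5.3,  # cross-encoder margin score (assumed key)
}]
retriever.train(training_data=training_data, n_epochs=1, batch_size=16)
```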
@ -1507,7 +1731,22 @@ Xiong, Wenhan, et. al. (2020): "Answering complex open-domain questions with mul
#### MultihopEmbeddingRetriever.\_\_init\_\_
```python
def __init__(document_store: BaseDocumentStore, embedding_model: str, model_version: Optional[str] = None, num_iterations: int = 2, use_gpu: bool = True, batch_size: int = 32, max_seq_len: int = 512, model_format: str = "farm", pooling_strategy: str = "reduce_mean", emb_extraction_layer: int = -1, top_k: int = 10, progress_bar: bool = True, devices: Optional[List[Union[str, torch.device]]] = None, use_auth_token: Optional[Union[str, bool]] = None, scale_score: bool = True, embed_meta_fields: List[str] = [])
def __init__(document_store: BaseDocumentStore,
embedding_model: str,
model_version: Optional[str] = None,
num_iterations: int = 2,
use_gpu: bool = True,
batch_size: int = 32,
max_seq_len: int = 512,
model_format: str = "farm",
pooling_strategy: str = "reduce_mean",
emb_extraction_layer: int = -1,
top_k: int = 10,
progress_bar: bool = True,
devices: Optional[List[Union[str, torch.device]]] = None,
use_auth_token: Optional[Union[str, bool]] = None,
scale_score: bool = True,
embed_meta_fields: List[str] = [])
```
**Arguments**:
@ -1563,7 +1802,13 @@ performance if your titles contain meaningful information for retrieval
#### MultihopEmbeddingRetriever.retrieve
```python
def retrieve(query: str, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: Optional[int] = None, index: str = None, headers: Optional[Dict[str, str]] = None, scale_score: bool = None) -> List[Document]
def retrieve(query: str,
filters: Optional[Dict[str, Union[Dict, List, str, int, float,
bool]]] = None,
top_k: Optional[int] = None,
index: str = None,
headers: Optional[Dict[str, str]] = None,
scale_score: bool = None) -> List[Document]
```
Scan through documents in DocumentStore and return a small number of documents
@ -1647,12 +1892,18 @@ Otherwise raw similarity scores (e.g. cosine or dot_product) will be used.
#### MultihopEmbeddingRetriever.retrieve\_batch
```python
def retrieve_batch(queries: List[str], filters: Optional[
Union[
Dict[str, Union[Dict, List, str, int, float, bool]],
List[Dict[str, Union[Dict, List, str, int, float, bool]]],
]
] = None, top_k: Optional[int] = None, index: str = None, headers: Optional[Dict[str, str]] = None, batch_size: Optional[int] = None, scale_score: bool = None) -> List[List[Document]]
def retrieve_batch(queries: List[str],
filters: Optional[Union[Dict[str, Union[Dict, List, str,
int, float, bool]],
List[Dict[str,
Union[Dict, List, str,
int, float,
bool]]], ]] = None,
top_k: Optional[int] = None,
index: str = None,
headers: Optional[Dict[str, str]] = None,
batch_size: Optional[int] = None,
scale_score: bool = None) -> List[List[Document]]
```
Scan through documents in DocumentStore and return a small number of documents
@ -1759,7 +2010,10 @@ The generated SPARQL query is executed on a knowledge graph.
#### Text2SparqlRetriever.\_\_init\_\_
```python
def __init__(knowledge_graph, model_name_or_path, top_k: int = 1, use_auth_token: Optional[Union[str, bool]] = None)
def __init__(knowledge_graph,
model_name_or_path,
top_k: int = 1,
use_auth_token: Optional[Union[str, bool]] = None)
```
Init the Retriever by providing a knowledge graph and a pre-trained BART model

View File

@ -18,7 +18,8 @@ Abstract class for Summarizer
```python
@abstractmethod
def predict(documents: List[Document], generate_single_summary: Optional[bool] = None) -> List[Document]
def predict(documents: List[Document],
generate_single_summary: Optional[bool] = None) -> List[Document]
```
Abstract method for creating a summary.
@ -87,7 +88,19 @@ See the up-to-date list of available models on
#### TransformersSummarizer.\_\_init\_\_
```python
def __init__(model_name_or_path: str = "google/pegasus-xsum", model_version: Optional[str] = None, tokenizer: Optional[str] = None, max_length: int = 200, min_length: int = 5, use_gpu: bool = True, clean_up_tokenization_spaces: bool = True, separator_for_single_summary: str = " ", generate_single_summary: bool = False, batch_size: int = 16, progress_bar: bool = True, use_auth_token: Optional[Union[str, bool]] = None, devices: Optional[List[Union[str, torch.device]]] = None)
def __init__(model_name_or_path: str = "google/pegasus-xsum",
model_version: Optional[str] = None,
tokenizer: Optional[str] = None,
max_length: int = 200,
min_length: int = 5,
use_gpu: bool = True,
clean_up_tokenization_spaces: bool = True,
separator_for_single_summary: str = " ",
generate_single_summary: bool = False,
batch_size: int = 16,
progress_bar: bool = True,
use_auth_token: Optional[Union[str, bool]] = None,
devices: Optional[List[Union[str, torch.device]]] = None)
```
Load a Summarization model from Transformers.
@ -129,7 +142,8 @@ parameter is not used and a single cpu device is used for inference.
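A minimal instantiation sketch (the default Pegasus checkpoint is shown explicitly; any Hugging Face summarization model should work):

```python
from haystack.nodes import TransformersSummarizer

# google/pegasus-xsum is the documented default; set use_gpu=False on CPU-only machines.
summarizer = TransformersSummarizer(model_name_or_path="google/pegasus-xsum", use_gpu=False)
```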
#### TransformersSummarizer.predict
```python
def predict(documents: List[Document], generate_single_summary: Optional[bool] = None) -> List[Document]
def predict(documents: List[Document],
generate_single_summary: Optional[bool] = None) -> List[Document]
```
Produce the summarization from the supplied documents.
@ -154,7 +168,11 @@ the original, not summarized text
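Continuing the sketch above, a predict call might look like this:

```python
from haystack import Document

docs = [Document(content="PG&E stated it scheduled the blackouts in response to forecasts "
                         "for high winds amid dry conditions.")]
summary = summarizer.predict(documents=docs)
# The returned documents carry the generated summary; depending on the Haystack
# version, the original text is kept in the document's meta.
print(summary[0].content)
```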
#### TransformersSummarizer.predict\_batch
```python
def predict_batch(documents: Union[List[Document], List[List[Document]]], generate_single_summary: Optional[bool] = None, batch_size: Optional[int] = None) -> Union[List[Document], List[List[Document]]]
def predict_batch(
documents: Union[List[Document], List[List[Document]]],
generate_single_summary: Optional[bool] = None,
batch_size: Optional[int] = None
) -> Union[List[Document], List[List[Document]]]
```
Produce the summarization from the supplied documents.
View File
@ -18,7 +18,13 @@ Abstract class for a Translator component that translates either a query or a do
```python
@abstractmethod
def translate(results: List[Dict[str, Any]] = None, query: Optional[str] = None, documents: Optional[Union[List[Document], List[Answer], List[str], List[Dict[str, Any]]]] = None, dict_key: Optional[str] = None) -> Union[str, List[Document], List[Answer], List[str], List[Dict[str, Any]]]
def translate(
results: List[Dict[str, Any]] = None,
query: Optional[str] = None,
documents: Optional[Union[List[Document], List[Answer], List[str],
List[Dict[str, Any]]]] = None,
dict_key: Optional[str] = None
) -> Union[str, List[Document], List[Answer], List[str], List[Dict[str, Any]]]
```
Translate the passed query or a list of documents from language A to B.
@ -28,7 +34,12 @@ Translate the passed query or a list of documents from language A to B.
#### BaseTranslator.run
```python
def run(results: List[Dict[str, Any]] = None, query: Optional[str] = None, documents: Optional[Union[List[Document], List[Answer], List[str], List[Dict[str, Any]]]] = None, answers: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None, dict_key: Optional[str] = None)
def run(results: List[Dict[str, Any]] = None,
query: Optional[str] = None,
documents: Optional[Union[List[Document], List[Answer], List[str],
List[Dict[str, Any]]]] = None,
answers: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
dict_key: Optional[str] = None)
```
Method that gets executed when this class is used as a Node in a Haystack Pipeline
@ -68,7 +79,14 @@ We currently recommend using OPUS models (see __init__() for details)
#### TransformersTranslator.\_\_init\_\_
```python
def __init__(model_name_or_path: str, tokenizer_name: Optional[str] = None, max_seq_len: Optional[int] = None, clean_up_tokenization_spaces: Optional[bool] = True, use_gpu: bool = True, progress_bar: bool = True, use_auth_token: Optional[Union[str, bool]] = None, devices: Optional[List[Union[str, torch.device]]] = None)
def __init__(model_name_or_path: str,
tokenizer_name: Optional[str] = None,
max_seq_len: Optional[int] = None,
clean_up_tokenization_spaces: Optional[bool] = True,
use_gpu: bool = True,
progress_bar: bool = True,
use_auth_token: Optional[Union[str, bool]] = None,
devices: Optional[List[Union[str, torch.device]]] = None)
```
Initialize the translator with a model that fits your targeted languages. While we support all seq2seq
@ -109,7 +127,13 @@ parameter is not used and a single cpu device is used for inference.
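A hedged sketch that also previews the translate call documented below (the OPUS English-to-German model is an example choice):

```python
from haystack import Document
from haystack.nodes import TransformersTranslator

translator = TransformersTranslator(model_name_or_path="Helsinki-NLP/opus-mt-en-de")
translated = translator.translate(documents=[Document(content="Berlin is a city in Germany.")])
print(translated[0].content)  # translated documents are returned as a list
```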
#### TransformersTranslator.translate
```python
def translate(results: Optional[List[Dict[str, Any]]] = None, query: Optional[str] = None, documents: Optional[Union[List[Document], List[Answer], List[str], List[Dict[str, Any]]]] = None, dict_key: Optional[str] = None) -> Union[str, List[Document], List[Answer], List[str], List[Dict[str, Any]]]
def translate(
results: Optional[List[Dict[str, Any]]] = None,
query: Optional[str] = None,
documents: Optional[Union[List[Document], List[Answer], List[str],
List[Dict[str, Any]]]] = None,
dict_key: Optional[str] = None
) -> Union[str, List[Document], List[Answer], List[str], List[Dict[str, Any]]]
```
Run the actual translation. You can supply a query or a list of documents. Whatever is supplied will be translated.
@ -126,7 +150,14 @@ Run the actual translation. You can supply a query or a list of documents. Whate
#### TransformersTranslator.translate\_batch
```python
def translate_batch(queries: Optional[List[str]] = None, documents: Optional[Union[List[Document], List[Answer], List[List[Document]], List[List[Answer]]]] = None, batch_size: Optional[int] = None) -> List[Union[str, List[Document], List[Answer], List[str], List[Dict[str, Any]]]]
def translate_batch(
queries: Optional[List[str]] = None,
documents: Optional[Union[List[Document], List[Answer],
List[List[Document]],
List[List[Answer]]]] = None,
batch_size: Optional[int] = None
) -> List[Union[str, List[Document], List[Answer], List[str], List[Dict[str, Any]]]]
```
Run the actual translation. You can supply a single query, a list of queries or a list (of lists) of documents.
View File
@ -7,7 +7,9 @@
#### print\_answers
```python
def print_answers(results: dict, details: str = "all", max_text_len: Optional[int] = None)
def print_answers(results: dict,
details: str = "all",
max_text_len: Optional[int] = None)
```
Utility function to print results of Haystack pipelines
@ -27,7 +29,10 @@ None
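A short usage sketch, assuming `pipe` is an already-built extractive QA pipeline:

```python
from haystack.utils import print_answers

# `pipe` is assumed to be an existing ExtractiveQAPipeline.
prediction = pipe.run(query="Who is the father of Arya Stark?",
                      params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}})
print_answers(prediction, details="minimum", max_text_len=200)
```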
#### print\_documents
```python
def print_documents(results: dict, max_text_len: Optional[int] = None, print_name: bool = True, print_meta: bool = False)
def print_documents(results: dict,
max_text_len: Optional[int] = None,
print_name: bool = True,
print_meta: bool = False)
```
Utility that prints a compressed representation of the documents returned by a pipeline.
@ -90,7 +95,12 @@ Convert the export from the labeling UI to the SQuAD format for training.
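Similarly, assuming `doc_pipe` is an existing document search pipeline:

```python
from haystack.utils import print_documents

# `doc_pipe` is assumed to be an existing DocumentSearchPipeline.
res = doc_pipe.run(query="climate change", params={"Retriever": {"top_k": 3}})
print_documents(res, max_text_len=100, print_name=True, print_meta=True)
```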
#### convert\_files\_to\_docs
```python
def convert_files_to_docs(dir_path: str, clean_func: Optional[Callable] = None, split_paragraphs: bool = False, encoding: Optional[str] = None, id_hash_keys: Optional[List[str]] = None) -> List[Document]
def convert_files_to_docs(
dir_path: str,
clean_func: Optional[Callable] = None,
split_paragraphs: bool = False,
encoding: Optional[str] = None,
id_hash_keys: Optional[List[str]] = None) -> List[Document]
```
Convert all files (.txt, .pdf, .docx) in the sub-directories of the given path to Documents that can be written to a
@ -114,7 +124,13 @@ If you do this, the Document ID will be generated by using the content and the d
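A minimal sketch (the directory is a placeholder; `clean_wiki_text` is one of the cleaning helpers shipped with Haystack):

```python
from haystack.utils import clean_wiki_text, convert_files_to_docs

docs = convert_files_to_docs(dir_path="data/my_files",  # placeholder directory
                             clean_func=clean_wiki_text,
                             split_paragraphs=True)
document_store.write_documents(docs)  # assumes an initialized document store
```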
#### tika\_convert\_files\_to\_docs
```python
def tika_convert_files_to_docs(dir_path: str, clean_func: Optional[Callable] = None, split_paragraphs: bool = False, merge_short: bool = True, merge_lowercase: bool = True, id_hash_keys: Optional[List[str]] = None) -> List[Document]
def tika_convert_files_to_docs(
dir_path: str,
clean_func: Optional[Callable] = None,
split_paragraphs: bool = False,
merge_short: bool = True,
merge_lowercase: bool = True,
id_hash_keys: Optional[List[str]] = None) -> List[Document]
```
Convert all files (.txt, .pdf) in the sub-directories of the given path to Documents that can be written to a
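The Tika-based variant additionally needs a reachable Tika server; a hedged sketch:

```python
from haystack.utils import tika_convert_files_to_docs

# Requires a running Tika server, e.g.:
#   docker run -d -p 9998:9998 apache/tika:1.28.4
docs = tika_convert_files_to_docs(dir_path="data/my_files",  # placeholder directory
                                  merge_short=True,
                                  merge_lowercase=True)
```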
@ -320,7 +336,13 @@ EarlyStopping class instead as long as it implements the method `check_stopping(
#### EarlyStopping.\_\_init\_\_
```python
def __init__(head: int = 0, metric: Union[str, Callable] = "loss", save_dir: Optional[str] = None, mode: Literal["min", "max"] = "min", patience: int = 0, min_delta: float = 0.001, min_evals: int = 0)
def __init__(head: int = 0,
metric: Union[str, Callable] = "loss",
save_dir: Optional[str] = None,
mode: Literal["min", "max"] = "min",
patience: int = 0,
min_delta: float = 0.001,
min_evals: int = 0)
```
**Arguments**:
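A hedged sketch of passing EarlyStopping to reader training (paths are placeholders, and the import location may differ between Haystack versions):

```python
from haystack.nodes import FARMReader
from haystack.utils.early_stopping import EarlyStopping  # assumption: location may vary by version

reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2")
reader.train(
    data_dir="data/squad",  # placeholder paths
    train_filename="train.json",
    dev_filename="dev.json",
    early_stopping=EarlyStopping(metric="top_n_accuracy", mode="max",
                                 patience=3, save_dir="models/early_stopping"),
)
```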
View File
@ -29,7 +29,7 @@ def launch_tika(sleep=15, delete_existing=False):
_ = subprocess.run([f"docker rm --force {TIKA_CONTAINER_NAME}"], shell=True, stdout=subprocess.DEVNULL)
status = subprocess.run(
[
f"docker start {TIKA_CONTAINER_NAME} > /dev/null 2>&1 || docker run -p 9998:9998 --name {TIKA_CONTAINER_NAME} apache/tika:1.24.1"
f"docker start {TIKA_CONTAINER_NAME} > /dev/null 2>&1 || docker run -p 9998:9998 --name {TIKA_CONTAINER_NAME} apache/tika:1.28.4"
],
shell=True,
)
@ -102,7 +102,7 @@ class TikaConverter(BaseConverter):
if ping.status_code != 200:
raise Exception(
f"Apache Tika server is not reachable at the URL '{tika_url}'. To run it locally"
f"with Docker, execute: 'docker run -p 9998:9998 apache/tika:1.24.1'"
f"with Docker, execute: 'docker run -p 9998:9998 apache/tika:1.28.4'"
)
self.tika_url = tika_url
super().__init__(remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages)
View File
@ -46,7 +46,7 @@ def launch_opensearch(sleep=15, delete_existing=False):
_ = subprocess.run([f"docker rm --force {OPENSEARCH_CONTAINER_NAME}"], shell=True, stdout=subprocess.DEVNULL)
status = subprocess.run(
[
f'docker start {OPENSEARCH_CONTAINER_NAME} > /dev/null 2>&1 || docker run -d -p 9201:9200 -p 9600:9600 -e "discovery.type=single-node" --name {OPENSEARCH_CONTAINER_NAME} opensearchproject/opensearch:1.2.4'
f'docker start {OPENSEARCH_CONTAINER_NAME} > /dev/null 2>&1 || docker run -d -p 9201:9200 -p 9600:9600 -e "discovery.type=single-node" --name {OPENSEARCH_CONTAINER_NAME} opensearchproject/opensearch:1.3.5'
],
shell=True,
)
@ -65,7 +65,7 @@ def launch_weaviate(sleep=15):
logger.debug("Starting Weaviate ...")
status = subprocess.run(
[
f"docker start {WEAVIATE_CONTAINER_NAME} > /dev/null 2>&1 || docker run -d -p 8080:8080 --env AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED='true' --env PERSISTENCE_DATA_PATH='/var/lib/weaviate' --name {WEAVIATE_CONTAINER_NAME} semitechnologies/weaviate:1.11.0"
f"docker start {WEAVIATE_CONTAINER_NAME} > /dev/null 2>&1 || docker run -d -p 8080:8080 --env AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED='true' --env PERSISTENCE_DATA_PATH='/var/lib/weaviate' --name {WEAVIATE_CONTAINER_NAME} semitechnologies/weaviate:1.14.0"
],
shell=True,
)
View File
@ -62,9 +62,9 @@ dependencies = [
"mmh3", # fast hashing function (murmurhash3)
"quantulum3", # quantities extraction from text
"posthog", # telemetry
"azure-ai-formrecognizer==3.2.0b2", # forms reader
"azure-ai-formrecognizer>=3.2.0b2", # forms reader
# audio's espnet-model-zoo requires huggingface-hub version <0.8 while we need >=0.5 to be able to use create_repo in FARMReader
"huggingface-hub<0.8.0,>=0.5.0",
"huggingface-hub>=0.5.0",
# Preprocessing
"more_itertools", # for windowing
@ -168,9 +168,9 @@ preprocessing = [
"python-magic-bin; platform_system == 'Windows'", # Needs to be installed without python-magic, otherwise Windows CI gets stuck.
]
ocr = [
"pytesseract==0.3.7",
"pytesseract>0.3.7",
"pillow",
"pdf2image==1.14.0",
"pdf2image>1.14",
]
onnx = [
"onnxruntime",
@ -207,14 +207,7 @@ dev = [
# Code formatting
"black[jupyter]==22.6.0",
# Documentation
"pydoc-markdown==4.5.1", # FIXME Unpin!
# azure-core is a dependency of azure-ai-formrecognizer
# In order to stop malicious pip backtracking during pip install farm-haystack[all] documented in https://github.com/deepset-ai/haystack/issues/2280
# we have to resolve a dependency version conflict ourself.
# azure-core>=1.23 conflicts with pydoc-markdown's dependency on databind>=1.5.0 which itself requires typing-extensions<4.0.0
# azure-core>=1.23 needs typing-extensions>=4.0.1
# pip unfortunately backtracks into the databind direction ultimately getting lost.
"azure-core<1.23",
"pydoc-markdown",
"mkdocs",
"jupytercontrib",
"watchdog", # ==1.0.2
View File
@ -423,7 +423,7 @@ def weaviate_fixture():
print("Starting Weaviate servers ...")
status = subprocess.run(["docker rm haystack_test_weaviate"], shell=True)
status = subprocess.run(
["docker run -d --name haystack_test_weaviate -p 8080:8080 semitechnologies/weaviate:1.11.0"], shell=True
["docker run -d --name haystack_test_weaviate -p 8080:8080 semitechnologies/weaviate:1.14.1"], shell=True
)
if status.returncode:
raise Exception("Failed to launch Weaviate. Please check docker container logs.")
@ -460,7 +460,7 @@ def tika_fixture():
raise Exception("Unable to connect Tika. Please check tika endpoint {0}.".format(tika_url))
except:
print("Starting Tika ...")
status = subprocess.run(["docker run -d --name tika -p 9998:9998 apache/tika:1.24.1"], shell=True)
status = subprocess.run(["docker run -d --name tika -p 9998:9998 apache/tika:1.28.4"], shell=True)
if status.returncode:
raise Exception("Failed to launch Tika. Please check docker container logs.")
time.sleep(30)