Mirror of https://github.com/deepset-ai/haystack.git (synced 2025-12-26 22:48:29 +00:00)
refactor: update dependencies and remove pins (#3147)
* refactor: remove azure-core, pydoc and hf-hub pins
* fix: remove extra-comma
* fix: force minimum version of azure forms recognizer
* refactor: allow newer ocr libs
* refactor: update more dependencies and container versions
* refactor: remove extra comment
* docs: pre-commit manual run
* refactor: remove unnecessary dependency
* tests: update weaviate container image version
This commit is contained in:
parent b07fcb7185
commit e1f399284f
.github/utils/tutorials.sh (vendored): 2 changes
@@ -39,7 +39,7 @@ done

 # Run the containers
 docker run -d -p 9200:9200 --name elasticsearch -e "discovery.type=single-node" -e "ES_JAVA_OPTS=-Xms128m -Xmx256m" elasticsearch:7.9.2
-docker run -d -p 9998:9998 --name tika -e "TIKA_CHILD_JAVA_OPTS=-JXms128m" -e "TIKA_CHILD_JAVA_OPTS=-JXmx128m" apache/tika:1.24.1
+docker run -d -p 9998:9998 --name tika -e "TIKA_CHILD_JAVA_OPTS=-JXms128m" -e "TIKA_CHILD_JAVA_OPTS=-JXmx128m" apache/tika:1.28.4

 failed=""
.github/workflows/tests.yml (vendored): 4 changes
@@ -656,7 +656,7 @@ jobs:

       - name: Run Opensearch
         run: |
-          docker run -d -p 9201:9200 -p 9600:9600 -e "discovery.type=single-node" opensearchproject/opensearch:1.2.4
+          docker run -d -p 9201:9200 -p 9600:9600 -e "discovery.type=single-node" opensearchproject/opensearch:1.3.5

       - name: Run Milvus
         run: |

@@ -672,7 +672,7 @@ jobs:
         run: docker run -d -p 7200:7200 --name haystack_test_graphdb deepset/graphdb-free:9.4.1-adoptopenjdk11

       - name: Run Apache Tika
-        run: docker run -d -p 9998:9998 -e "TIKA_CHILD_JAVA_OPTS=-JXms128m" -e "TIKA_CHILD_JAVA_OPTS=-JXmx128m" apache/tika:1.24.1
+        run: docker run -d -p 9998:9998 -e "TIKA_CHILD_JAVA_OPTS=-JXms128m" -e "TIKA_CHILD_JAVA_OPTS=-JXmx128m" apache/tika:1.28.4

       - name: Run Parsr
         run: docker run -d -p 3001:3001 axarev/parsr:v1.2.2
.github/workflows/tutorials_nightly.yml (vendored): 2 changes
@@ -28,7 +28,7 @@ jobs:
         run: docker run -d -p 9200:9200 -e "discovery.type=single-node" -e "ES_JAVA_OPTS=-Xms128m -Xmx256m" elasticsearch:7.9.2

       - name: Run Apache Tika
-        run: docker run -d -p 9998:9998 -e "TIKA_CHILD_JAVA_OPTS=-JXms128m" -e "TIKA_CHILD_JAVA_OPTS=-JXmx128m" apache/tika:1.24.1
+        run: docker run -d -p 9998:9998 -e "TIKA_CHILD_JAVA_OPTS=-JXms128m" -e "TIKA_CHILD_JAVA_OPTS=-JXmx128m" apache/tika:1.28.4

       - name: Run GraphDB
         run: docker run -d -p 7200:7200 --name graphdb-instance-tutorial docker-registry.ontotext.com/graphdb-free:9.4.1-adoptopenjdk11
@@ -199,7 +199,7 @@ docker run -d -p 8080:8080 --name haystack_test_weaviate --env AUTHENTICATION_AN
 docker run -d -p 7200:7200 --name haystack_test_graphdb deepset/graphdb-free:9.4.1-adoptopenjdk11

 # Tika
-docker run -d -p 9998:9998 -e "TIKA_CHILD_JAVA_OPTS=-JXms128m" -e "TIKA_CHILD_JAVA_OPTS=-JXmx128m" apache/tika:1.24.1
+docker run -d -p 9998:9998 -e "TIKA_CHILD_JAVA_OPTS=-JXms128m" -e "TIKA_CHILD_JAVA_OPTS=-JXmx128m" apache/tika:1.28.4
 ```

 Tests can be also run **individually**:
@@ -27,7 +27,17 @@ Crawl texts from a website so that we can use them later in Haystack as a corpus

 #### Crawler.\_\_init\_\_

 ```python
-def __init__(output_dir: str, urls: Optional[List[str]] = None, crawler_depth: int = 1, filter_urls: Optional[List] = None, overwrite_existing_files=True, id_hash_keys: Optional[List[str]] = None, extract_hidden_text=True, loading_wait_time: Optional[int] = None, crawler_naming_function: Optional[Callable[[str, str], str]] = None, webdriver_options: Optional[List[str]] = None)
+def __init__(output_dir: str,
+             urls: Optional[List[str]] = None,
+             crawler_depth: int = 1,
+             filter_urls: Optional[List] = None,
+             overwrite_existing_files=True,
+             id_hash_keys: Optional[List[str]] = None,
+             extract_hidden_text=True,
+             loading_wait_time: Optional[int] = None,
+             crawler_naming_function: Optional[Callable[[str, str],
+                                                        str]] = None,
+             webdriver_options: Optional[List[str]] = None)
 ```

 Init object with basic params for crawling (can be overwritten later).

@@ -73,7 +83,17 @@ See [Chrome Web Driver Options](https://selenium-python.readthedocs.io/api.html#

 #### Crawler.crawl

 ```python
-def crawl(output_dir: Union[str, Path, None] = None, urls: Optional[List[str]] = None, crawler_depth: Optional[int] = None, filter_urls: Optional[List] = None, overwrite_existing_files: Optional[bool] = None, id_hash_keys: Optional[List[str]] = None, extract_hidden_text: Optional[bool] = None, loading_wait_time: Optional[int] = None, crawler_naming_function: Optional[Callable[[str, str], str]] = None) -> List[Path]
+def crawl(
+    output_dir: Union[str, Path, None] = None,
+    urls: Optional[List[str]] = None,
+    crawler_depth: Optional[int] = None,
+    filter_urls: Optional[List] = None,
+    overwrite_existing_files: Optional[bool] = None,
+    id_hash_keys: Optional[List[str]] = None,
+    extract_hidden_text: Optional[bool] = None,
+    loading_wait_time: Optional[int] = None,
+    crawler_naming_function: Optional[Callable[[str, str], str]] = None
+) -> List[Path]
 ```

 Craw URL(s), extract the text from the HTML, create a Haystack Document object out of it and save it (one JSON

@@ -116,7 +136,18 @@ List of paths where the crawled webpages got stored

 #### Crawler.run

 ```python
-def run(output_dir: Union[str, Path, None] = None, urls: Optional[List[str]] = None, crawler_depth: Optional[int] = None, filter_urls: Optional[List] = None, overwrite_existing_files: Optional[bool] = None, return_documents: Optional[bool] = False, id_hash_keys: Optional[List[str]] = None, extract_hidden_text: Optional[bool] = True, loading_wait_time: Optional[int] = None, crawler_naming_function: Optional[Callable[[str, str], str]] = None) -> Tuple[Dict[str, Union[List[Document], List[Path]]], str]
+def run(
+    output_dir: Union[str, Path, None] = None,
+    urls: Optional[List[str]] = None,
+    crawler_depth: Optional[int] = None,
+    filter_urls: Optional[List] = None,
+    overwrite_existing_files: Optional[bool] = None,
+    return_documents: Optional[bool] = False,
+    id_hash_keys: Optional[List[str]] = None,
+    extract_hidden_text: Optional[bool] = True,
+    loading_wait_time: Optional[int] = None,
+    crawler_naming_function: Optional[Callable[[str, str], str]] = None
+) -> Tuple[Dict[str, Union[List[Document], List[Path]]], str]
 ```

 Method to be executed when the Crawler is used as a Node within a Haystack pipeline.
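Not part of the diff: a minimal usage sketch for the `Crawler` signatures reformatted above, assuming Haystack v1's `haystack.nodes` layout and a locally installed Chrome WebDriver; the URL is illustrative.

```python
from haystack.nodes import Crawler

# Crawl one site, one level deep; each page is stored as a JSON file.
crawler = Crawler(output_dir="crawled_files", crawler_depth=1)
file_paths = crawler.crawl(urls=["https://haystack.deepset.ai/overview/intro"])
print(file_paths)  # list of Path objects pointing at the stored pages
```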
@@ -84,7 +84,19 @@ With this document_classifier, you can directly get predictions via predict()

 #### TransformersDocumentClassifier.\_\_init\_\_

 ```python
-def __init__(model_name_or_path: str = "bhadresh-savani/distilbert-base-uncased-emotion", model_version: Optional[str] = None, tokenizer: Optional[str] = None, use_gpu: bool = True, return_all_scores: bool = False, task: str = "text-classification", labels: Optional[List[str]] = None, batch_size: int = 16, classification_field: str = None, progress_bar: bool = True, use_auth_token: Optional[Union[str, bool]] = None, devices: Optional[List[Union[str, torch.device]]] = None)
+def __init__(model_name_or_path:
+             str = "bhadresh-savani/distilbert-base-uncased-emotion",
+             model_version: Optional[str] = None,
+             tokenizer: Optional[str] = None,
+             use_gpu: bool = True,
+             return_all_scores: bool = False,
+             task: str = "text-classification",
+             labels: Optional[List[str]] = None,
+             batch_size: int = 16,
+             classification_field: str = None,
+             progress_bar: bool = True,
+             use_auth_token: Optional[Union[str, bool]] = None,
+             devices: Optional[List[Union[str, torch.device]]] = None)
 ```

 Load a text classification model from Transformers.

@@ -132,7 +144,8 @@ parameter is not used and a single cpu device is used for inference.

 #### TransformersDocumentClassifier.predict

 ```python
-def predict(documents: List[Document], batch_size: Optional[int] = None) -> List[Document]
+def predict(documents: List[Document],
+            batch_size: Optional[int] = None) -> List[Document]
 ```

 Returns documents containing classification result in a meta field.

@@ -153,7 +166,10 @@ A list of Documents enriched with meta information.

 #### TransformersDocumentClassifier.predict\_batch

 ```python
-def predict_batch(documents: Union[List[Document], List[List[Document]]], batch_size: Optional[int] = None) -> Union[List[Document], List[List[Document]]]
+def predict_batch(
+    documents: Union[List[Document], List[List[Document]]],
+    batch_size: Optional[int] = None
+) -> Union[List[Document], List[List[Document]]]
 ```

 Returns documents containing classification result in meta field.
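Not part of the diff: a usage sketch for `TransformersDocumentClassifier` as documented above; the exact layout of the `classification` meta entry is an assumption based on the class's description.

```python
from haystack import Document
from haystack.nodes import TransformersDocumentClassifier

classifier = TransformersDocumentClassifier(
    model_name_or_path="bhadresh-savani/distilbert-base-uncased-emotion",
    batch_size=16,
)
docs = [Document(content="I love this framework!"),
        Document(content="This bug is infuriating.")]

# predict() returns the same documents, enriched with a classification meta field.
for doc in classifier.predict(documents=docs):
    print(doc.meta["classification"])
```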
File diff suppressed because it is too large.
@@ -40,7 +40,9 @@ When False, correct retrieval is evaluated based on document_id.

 #### EvalDocuments.run

 ```python
-def run(documents: List[Document], labels: List[Label], top_k: Optional[int] = None)
+def run(documents: List[Document],
+        labels: List[Label],
+        top_k: Optional[int] = None)
 ```

 Run this node on one sample and its labels

@@ -78,7 +80,10 @@ Please use pipeline.eval() instead.

 #### EvalAnswers.\_\_init\_\_

 ```python
-def __init__(skip_incorrect_retrieval: bool = True, open_domain: bool = True, sas_model: str = None, debug: bool = False)
+def __init__(skip_incorrect_retrieval: bool = True,
+             open_domain: bool = True,
+             sas_model: str = None,
+             debug: bool = False)
 ```

 **Arguments**:

@@ -123,7 +128,15 @@ Print the evaluation results

 #### semantic\_answer\_similarity

 ```python
-def semantic_answer_similarity(predictions: List[List[str]], gold_labels: List[List[str]], sas_model_name_or_path: str = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2", batch_size: int = 32, use_gpu: bool = True, use_auth_token: Optional[Union[str, bool]] = None) -> Tuple[List[float], List[float], List[List[float]]]
+def semantic_answer_similarity(
+    predictions: List[List[str]],
+    gold_labels: List[List[str]],
+    sas_model_name_or_path:
+    str = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
+    batch_size: int = 32,
+    use_gpu: bool = True,
+    use_auth_token: Optional[Union[str, bool]] = None
+) -> Tuple[List[float], List[float], List[List[float]]]
 ```

 Computes Transformer-based similarity of predicted answer to gold labels to derive a more meaningful metric than EM or F1.
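Not part of the diff: a sketch of calling `semantic_answer_similarity` directly, assuming it is importable from the module these docs are generated from (`haystack.nodes.evaluator.evaluator`).

```python
from haystack.nodes.evaluator.evaluator import semantic_answer_similarity

# Per the signature above: returns top-1 SAS, top-k SAS, and the full
# prediction-to-label similarity matrix, one entry per query.
top1_sas, topk_sas, matrix = semantic_answer_similarity(
    predictions=[["Berlin"]],
    gold_labels=[["Berlin", "capital of Germany"]],
    use_gpu=False,
)
print(top1_sas, topk_sas)  # similarity scores in [0, 1]
```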
@@ -39,7 +39,9 @@ parameter is not used and a single cpu device is used for inference.

 #### EntityExtractor.run

 ```python
-def run(documents: Optional[Union[List[Document], List[dict]]] = None) -> Tuple[Dict, str]
+def run(
+    documents: Optional[Union[List[Document], List[dict]]] = None
+) -> Tuple[Dict, str]
 ```

 This is the method called when this node is used in a pipeline

@@ -59,7 +61,8 @@ This function can be called to perform entity extraction when using the node in

 #### EntityExtractor.extract\_batch

 ```python
-def extract_batch(texts: Union[List[str], List[List[str]]], batch_size: Optional[int] = None)
+def extract_batch(texts: Union[List[str], List[List[str]]],
+                  batch_size: Optional[int] = None)
 ```

 This function allows to extract entities out of a list of strings or a list of lists of strings.
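Not part of the diff: a sketch for `EntityExtractor.extract_batch`, assuming the node's default NER model; the shape of the returned entities is not shown in this diff, so the print is illustrative.

```python
from haystack.nodes import EntityExtractor

extractor = EntityExtractor()
# extract_batch accepts a list of strings or a list of lists of strings.
entities = extractor.extract_batch(
    texts=["Haystack is developed by deepset in Berlin."],
    batch_size=8,
)
print(entities)
```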
@@ -33,7 +33,8 @@ Lists with duplicate elements are not allowed.

 #### FileTypeClassifier.run

 ```python
-def run(file_paths: Union[Path, List[Path], str, List[str], List[Union[Path, str]]])
+def run(file_paths: Union[Path, List[Path], str, List[str], List[Union[Path,
+                                                                       str]]])
 ```

 Sends out files on a different output edge depending on their extension.
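Not part of the diff: a sketch for `FileTypeClassifier.run`; which output edge a given extension maps to depends on the node's configured supported types.

```python
from haystack.nodes import FileTypeClassifier

classifier = FileTypeClassifier()
# All paths in one call must share an extension; the edge name encodes the type.
output, edge = classifier.run(file_paths=["report.pdf", "appendix.pdf"])
print(edge)  # e.g. "output_2" if PDF is the second supported extension
```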
@@ -17,7 +17,10 @@ Base class for implementing file converts to transform input documents to text f

 #### BaseConverter.\_\_init\_\_

 ```python
-def __init__(remove_numeric_tables: bool = False, valid_languages: Optional[List[str]] = None, id_hash_keys: Optional[List[str]] = None, progress_bar: bool = True)
+def __init__(remove_numeric_tables: bool = False,
+             valid_languages: Optional[List[str]] = None,
+             id_hash_keys: Optional[List[str]] = None,
+             progress_bar: bool = True)
 ```

 **Arguments**:

@@ -44,7 +47,12 @@ In this case the id will be generated by using the content and the defined metad

 ```python
 @abstractmethod
-def convert(file_path: Path, meta: Optional[Dict[str, Any]], remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = "UTF-8", id_hash_keys: Optional[List[str]] = None) -> List[Document]
+def convert(file_path: Path,
+            meta: Optional[Dict[str, Any]],
+            remove_numeric_tables: Optional[bool] = None,
+            valid_languages: Optional[List[str]] = None,
+            encoding: Optional[str] = "UTF-8",
+            id_hash_keys: Optional[List[str]] = None) -> List[Document]
 ```

 Convert a file to a dictionary containing the text and any associated meta data.

@@ -77,7 +85,8 @@ In this case the id will be generated by using the content and the defined metad

 #### BaseConverter.validate\_language

 ```python
-def validate_language(text: str, valid_languages: Optional[List[str]] = None) -> bool
+def validate_language(text: str,
+                      valid_languages: Optional[List[str]] = None) -> bool
 ```

 Validate if the language of the text is one of valid languages.

@@ -87,7 +96,14 @@ Validate if the language of the text is one of valid languages.

 #### BaseConverter.run

 ```python
-def run(file_paths: Union[Path, List[Path]], meta: Optional[Union[Dict[str, str], List[Optional[Dict[str, str]]]]] = None, remove_numeric_tables: Optional[bool] = None, known_ligatures: Dict[str, str] = KNOWN_LIGATURES, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = "UTF-8", id_hash_keys: Optional[List[str]] = None)
+def run(file_paths: Union[Path, List[Path]],
+        meta: Optional[Union[Dict[str, str],
+                             List[Optional[Dict[str, str]]]]] = None,
+        remove_numeric_tables: Optional[bool] = None,
+        known_ligatures: Dict[str, str] = KNOWN_LIGATURES,
+        valid_languages: Optional[List[str]] = None,
+        encoding: Optional[str] = "UTF-8",
+        id_hash_keys: Optional[List[str]] = None)
 ```

 Extract text from a file.
@@ -137,7 +153,12 @@ class DocxToTextConverter(BaseConverter)

 #### DocxToTextConverter.convert

 ```python
-def convert(file_path: Path, meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = None, id_hash_keys: Optional[List[str]] = None) -> List[Document]
+def convert(file_path: Path,
+            meta: Optional[Dict[str, str]] = None,
+            remove_numeric_tables: Optional[bool] = None,
+            valid_languages: Optional[List[str]] = None,
+            encoding: Optional[str] = None,
+            id_hash_keys: Optional[List[str]] = None) -> List[Document]
 ```

 Extract text from a .docx file.

@@ -182,7 +203,9 @@ class ImageToTextConverter(BaseConverter)

 #### ImageToTextConverter.\_\_init\_\_

 ```python
-def __init__(remove_numeric_tables: bool = False, valid_languages: Optional[List[str]] = ["eng"], id_hash_keys: Optional[List[str]] = None)
+def __init__(remove_numeric_tables: bool = False,
+             valid_languages: Optional[List[str]] = ["eng"],
+             id_hash_keys: Optional[List[str]] = None)
 ```

 **Arguments**:

@@ -209,7 +232,12 @@ In this case the id will be generated by using the content and the defined metad

 #### ImageToTextConverter.convert

 ```python
-def convert(file_path: Union[Path, str], meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = None, id_hash_keys: Optional[List[str]] = None) -> List[Document]
+def convert(file_path: Union[Path, str],
+            meta: Optional[Dict[str, str]] = None,
+            remove_numeric_tables: Optional[bool] = None,
+            valid_languages: Optional[List[str]] = None,
+            encoding: Optional[str] = None,
+            id_hash_keys: Optional[List[str]] = None) -> List[Document]
 ```

 Extract text from image file using the pytesseract library (https://github.com/madmaze/pytesseract)

@@ -252,7 +280,12 @@ class MarkdownConverter(BaseConverter)

 #### MarkdownConverter.convert

 ```python
-def convert(file_path: Path, meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = "utf-8", id_hash_keys: Optional[List[str]] = None) -> List[Document]
+def convert(file_path: Path,
+            meta: Optional[Dict[str, str]] = None,
+            remove_numeric_tables: Optional[bool] = None,
+            valid_languages: Optional[List[str]] = None,
+            encoding: Optional[str] = "utf-8",
+            id_hash_keys: Optional[List[str]] = None) -> List[Document]
 ```

 Reads text from a txt file and executes optional preprocessing steps.
@@ -301,7 +334,10 @@ class PDFToTextConverter(BaseConverter)

 #### PDFToTextConverter.\_\_init\_\_

 ```python
-def __init__(remove_numeric_tables: bool = False, valid_languages: Optional[List[str]] = None, id_hash_keys: Optional[List[str]] = None, encoding: Optional[str] = "UTF-8")
+def __init__(remove_numeric_tables: bool = False,
+             valid_languages: Optional[List[str]] = None,
+             id_hash_keys: Optional[List[str]] = None,
+             encoding: Optional[str] = "UTF-8")
 ```

 **Arguments**:

@@ -329,7 +365,12 @@ Defaults to "UTF-8" in order to support special characters (e.g. German Umlauts,

 #### PDFToTextConverter.convert

 ```python
-def convert(file_path: Path, meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = None, id_hash_keys: Optional[List[str]] = None) -> List[Document]
+def convert(file_path: Path,
+            meta: Optional[Dict[str, str]] = None,
+            remove_numeric_tables: Optional[bool] = None,
+            valid_languages: Optional[List[str]] = None,
+            encoding: Optional[str] = None,
+            id_hash_keys: Optional[List[str]] = None) -> List[Document]
 ```

 Extract text from a .pdf file using the pdftotext library (https://www.xpdfreader.com/pdftotext-man.html)

@@ -369,7 +410,9 @@ class PDFToTextOCRConverter(BaseConverter)

 #### PDFToTextOCRConverter.\_\_init\_\_

 ```python
-def __init__(remove_numeric_tables: bool = False, valid_languages: Optional[List[str]] = ["eng"], id_hash_keys: Optional[List[str]] = None)
+def __init__(remove_numeric_tables: bool = False,
+             valid_languages: Optional[List[str]] = ["eng"],
+             id_hash_keys: Optional[List[str]] = None)
 ```

 Extract text from image file using the pytesseract library (https://github.com/madmaze/pytesseract)

@@ -396,7 +439,12 @@ In this case the id will be generated by using the content and the defined metad

 #### PDFToTextOCRConverter.convert

 ```python
-def convert(file_path: Path, meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = None, id_hash_keys: Optional[List[str]] = None) -> List[Document]
+def convert(file_path: Path,
+            meta: Optional[Dict[str, str]] = None,
+            remove_numeric_tables: Optional[bool] = None,
+            valid_languages: Optional[List[str]] = None,
+            encoding: Optional[str] = None,
+            id_hash_keys: Optional[List[str]] = None) -> List[Document]
 ```

 Convert a file to a dictionary containing the text and any associated meta data.
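Not part of the diff: a sketch for `PDFToTextConverter.convert`, assuming the xpdf `pdftotext` binary is installed (see the link above) and that `sample.pdf` exists locally.

```python
from pathlib import Path

from haystack.nodes import PDFToTextConverter

converter = PDFToTextConverter(remove_numeric_tables=True, valid_languages=["en"])
docs = converter.convert(file_path=Path("sample.pdf"), meta={"source": "sample.pdf"})
print(docs[0].content[:200])
```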
@@ -446,7 +494,17 @@ Supported file formats are: PDF, DOCX

 #### ParsrConverter.\_\_init\_\_

 ```python
-def __init__(parsr_url: str = "http://localhost:3001", extractor: Literal["pdfminer", "pdfjs"] = "pdfminer", table_detection_mode: Literal["lattice", "stream"] = "lattice", preceding_context_len: int = 3, following_context_len: int = 3, remove_page_headers: bool = False, remove_page_footers: bool = False, remove_table_of_contents: bool = False, valid_languages: Optional[List[str]] = None, id_hash_keys: Optional[List[str]] = None, add_page_number: bool = True)
+def __init__(parsr_url: str = "http://localhost:3001",
+             extractor: Literal["pdfminer", "pdfjs"] = "pdfminer",
+             table_detection_mode: Literal["lattice", "stream"] = "lattice",
+             preceding_context_len: int = 3,
+             following_context_len: int = 3,
+             remove_page_headers: bool = False,
+             remove_page_footers: bool = False,
+             remove_table_of_contents: bool = False,
+             valid_languages: Optional[List[str]] = None,
+             id_hash_keys: Optional[List[str]] = None,
+             add_page_number: bool = True)
 ```

 **Arguments**:

@@ -480,7 +538,12 @@ In this case the id will be generated by using the content and the defined metad

 #### ParsrConverter.convert

 ```python
-def convert(file_path: Path, meta: Optional[Dict[str, Any]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = "utf-8", id_hash_keys: Optional[List[str]] = None) -> List[Document]
+def convert(file_path: Path,
+            meta: Optional[Dict[str, Any]] = None,
+            remove_numeric_tables: Optional[bool] = None,
+            valid_languages: Optional[List[str]] = None,
+            encoding: Optional[str] = "utf-8",
+            id_hash_keys: Optional[List[str]] = None) -> List[Document]
 ```

 Extract text and tables from a PDF or DOCX using the open-source Parsr tool.
@@ -529,7 +592,16 @@ https://docs.microsoft.com/en-us/azure/applied-ai-services/form-recognizer/quick

 #### AzureConverter.\_\_init\_\_

 ```python
-def __init__(endpoint: str, credential_key: str, model_id: str = "prebuilt-document", valid_languages: Optional[List[str]] = None, save_json: bool = False, preceding_context_len: int = 3, following_context_len: int = 3, merge_multiple_column_headers: bool = True, id_hash_keys: Optional[List[str]] = None, add_page_number: bool = True)
+def __init__(endpoint: str,
+             credential_key: str,
+             model_id: str = "prebuilt-document",
+             valid_languages: Optional[List[str]] = None,
+             save_json: bool = False,
+             preceding_context_len: int = 3,
+             following_context_len: int = 3,
+             merge_multiple_column_headers: bool = True,
+             id_hash_keys: Optional[List[str]] = None,
+             add_page_number: bool = True)
 ```

 **Arguments**:

@@ -564,7 +636,14 @@ In this case the id will be generated by using the content and the defined metad

 #### AzureConverter.convert

 ```python
-def convert(file_path: Path, meta: Optional[Dict[str, Any]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = "utf-8", id_hash_keys: Optional[List[str]] = None, pages: Optional[str] = None, known_language: Optional[str] = None) -> List[Document]
+def convert(file_path: Path,
+            meta: Optional[Dict[str, Any]] = None,
+            remove_numeric_tables: Optional[bool] = None,
+            valid_languages: Optional[List[str]] = None,
+            encoding: Optional[str] = "utf-8",
+            id_hash_keys: Optional[List[str]] = None,
+            pages: Optional[str] = None,
+            known_language: Optional[str] = None) -> List[Document]
 ```

 Extract text and tables from a PDF, JPEG, PNG, BMP or TIFF file using Azure's Form Recognizer service.

@@ -596,7 +675,11 @@ See supported locales here: https://aka.ms/azsdk/formrecognizer/supportedlocales

 #### AzureConverter.convert\_azure\_json

 ```python
-def convert_azure_json(file_path: Path, meta: Optional[Dict[str, Any]] = None, valid_languages: Optional[List[str]] = None, id_hash_keys: Optional[List[str]] = None) -> List[Document]
+def convert_azure_json(
+        file_path: Path,
+        meta: Optional[Dict[str, Any]] = None,
+        valid_languages: Optional[List[str]] = None,
+        id_hash_keys: Optional[List[str]] = None) -> List[Document]
 ```

 Extract text and tables from the JSON output of Azure's Form Recognizer service.
@@ -633,7 +716,10 @@ class TikaConverter(BaseConverter)

 #### TikaConverter.\_\_init\_\_

 ```python
-def __init__(tika_url: str = "http://localhost:9998/tika", remove_numeric_tables: bool = False, valid_languages: Optional[List[str]] = None, id_hash_keys: Optional[List[str]] = None)
+def __init__(tika_url: str = "http://localhost:9998/tika",
+             remove_numeric_tables: bool = False,
+             valid_languages: Optional[List[str]] = None,
+             id_hash_keys: Optional[List[str]] = None)
 ```

 **Arguments**:

@@ -659,7 +745,12 @@ In this case the id will be generated by using the content and the defined metad

 #### TikaConverter.convert

 ```python
-def convert(file_path: Path, meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = None, id_hash_keys: Optional[List[str]] = None) -> List[Document]
+def convert(file_path: Path,
+            meta: Optional[Dict[str, str]] = None,
+            remove_numeric_tables: Optional[bool] = None,
+            valid_languages: Optional[List[str]] = None,
+            encoding: Optional[str] = None,
+            id_hash_keys: Optional[List[str]] = None) -> List[Document]
 ```

 **Arguments**:

@@ -703,7 +794,12 @@ class TextConverter(BaseConverter)

 #### TextConverter.convert

 ```python
-def convert(file_path: Path, meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = "utf-8", id_hash_keys: Optional[List[str]] = None) -> List[Document]
+def convert(file_path: Path,
+            meta: Optional[Dict[str, str]] = None,
+            remove_numeric_tables: Optional[bool] = None,
+            valid_languages: Optional[List[str]] = None,
+            encoding: Optional[str] = "utf-8",
+            id_hash_keys: Optional[List[str]] = None) -> List[Document]
 ```

 Reads text from a txt file and executes optional preprocessing steps.
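Not part of the diff: a sketch for `TikaConverter`, pointed at the Tika container this commit bumps; assumes `contract.docx` exists locally.

```python
from pathlib import Path

from haystack.nodes import TikaConverter

# Matches the updated container from this commit:
#   docker run -d -p 9998:9998 apache/tika:1.28.4
converter = TikaConverter(tika_url="http://localhost:9998/tika")
docs = converter.convert(file_path=Path("contract.docx"), meta=None)
print(docs[0].content[:200])
```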
@@ -18,7 +18,8 @@ Abstract class for Generators

 ```python
 @abstractmethod
-def predict(query: str, documents: List[Document], top_k: Optional[int]) -> Dict
+def predict(query: str, documents: List[Document],
+            top_k: Optional[int]) -> Dict
 ```

 Abstract method to generate answers.

@@ -38,7 +39,10 @@ Generated answers plus additional infos in a dict

 #### BaseGenerator.predict\_batch

 ```python
-def predict_batch(queries: List[str], documents: Union[List[Document], List[List[Document]]], top_k: Optional[int] = None, batch_size: Optional[int] = None)
+def predict_batch(queries: List[str],
+                  documents: Union[List[Document], List[List[Document]]],
+                  top_k: Optional[int] = None,
+                  batch_size: Optional[int] = None)
 ```

 Generate the answer to the input queries. The generation will be conditioned on the supplied documents.

@@ -138,7 +142,20 @@ i.e. the model can easily adjust to domain documents even after training has fin

 #### RAGenerator.\_\_init\_\_

 ```python
-def __init__(model_name_or_path: str = "facebook/rag-token-nq", model_version: Optional[str] = None, retriever: Optional[DensePassageRetriever] = None, generator_type: str = "token", top_k: int = 2, max_length: int = 200, min_length: int = 2, num_beams: int = 2, embed_title: bool = True, prefix: Optional[str] = None, use_gpu: bool = True, progress_bar: bool = True, use_auth_token: Optional[Union[str, bool]] = None, devices: Optional[List[Union[str, torch.device]]] = None)
+def __init__(model_name_or_path: str = "facebook/rag-token-nq",
+             model_version: Optional[str] = None,
+             retriever: Optional[DensePassageRetriever] = None,
+             generator_type: str = "token",
+             top_k: int = 2,
+             max_length: int = 200,
+             min_length: int = 2,
+             num_beams: int = 2,
+             embed_title: bool = True,
+             prefix: Optional[str] = None,
+             use_gpu: bool = True,
+             progress_bar: bool = True,
+             use_auth_token: Optional[Union[str, bool]] = None,
+             devices: Optional[List[Union[str, torch.device]]] = None)
 ```

 Load a RAG model from Transformers along with passage_embedding_model.

@@ -176,7 +193,9 @@ parameter is not used and a single cpu device is used for inference.

 #### RAGenerator.predict

 ```python
-def predict(query: str, documents: List[Document], top_k: Optional[int] = None) -> Dict
+def predict(query: str,
+            documents: List[Document],
+            top_k: Optional[int] = None) -> Dict
 ```

 Generate the answer to the input query. The generation will be conditioned on the supplied documents.

@@ -266,7 +285,16 @@ the [Hugging Face Model Hub](https://huggingface.co/models?pipeline_tag=text2tex

 #### Seq2SeqGenerator.\_\_init\_\_

 ```python
-def __init__(model_name_or_path: str, input_converter: Optional[Callable] = None, top_k: int = 1, max_length: int = 200, min_length: int = 2, num_beams: int = 8, use_gpu: bool = True, progress_bar: bool = True, use_auth_token: Optional[Union[str, bool]] = None, devices: Optional[List[Union[str, torch.device]]] = None)
+def __init__(model_name_or_path: str,
+             input_converter: Optional[Callable] = None,
+             top_k: int = 1,
+             max_length: int = 200,
+             min_length: int = 2,
+             num_beams: int = 8,
+             use_gpu: bool = True,
+             progress_bar: bool = True,
+             use_auth_token: Optional[Union[str, bool]] = None,
+             devices: Optional[List[Union[str, torch.device]]] = None)
 ```

 **Arguments**:

@@ -298,7 +326,9 @@ parameter is not used and a single cpu device is used for inference.

 #### Seq2SeqGenerator.predict

 ```python
-def predict(query: str, documents: List[Document], top_k: Optional[int] = None) -> Dict
+def predict(query: str,
+            documents: List[Document],
+            top_k: Optional[int] = None) -> Dict
 ```

 Generate the answer to the input query. The generation will be conditioned on the supplied documents.

@@ -338,7 +368,17 @@ on the [OpenAI API website](https://openai.com/api/).

 #### OpenAIAnswerGenerator.\_\_init\_\_

 ```python
-def __init__(api_key: str, model: str = "text-curie-001", max_tokens: int = 7, top_k: int = 5, temperature: int = 0, presence_penalty: float = -2.0, frequency_penalty: float = -2.0, examples_context: Optional[str] = None, examples: Optional[List] = None, stop_words: Optional[List] = None, progress_bar: bool = True)
+def __init__(api_key: str,
+             model: str = "text-curie-001",
+             max_tokens: int = 7,
+             top_k: int = 5,
+             temperature: int = 0,
+             presence_penalty: float = -2.0,
+             frequency_penalty: float = -2.0,
+             examples_context: Optional[str] = None,
+             examples: Optional[List] = None,
+             stop_words: Optional[List] = None,
+             progress_bar: bool = True)
 ```

 **Arguments**:

@@ -374,7 +414,9 @@ If you don't provide it, the default from OpenAPI docs is used: ["\n", "<|endoft

 #### OpenAIAnswerGenerator.predict

 ```python
-def predict(query: str, documents: List[Document], top_k: Optional[int] = None)
+def predict(query: str,
+            documents: List[Document],
+            top_k: Optional[int] = None)
 ```

 Use the loaded QA model to generate Answers for a query based on the Documents it receives.
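Not part of the diff: a sketch for `OpenAIAnswerGenerator.predict` per the signature above; the API key is a placeholder, and the result layout (an `answers` list) is an assumption consistent with other Haystack generators.

```python
from haystack import Document
from haystack.nodes import OpenAIAnswerGenerator

generator = OpenAIAnswerGenerator(api_key="YOUR_OPENAI_KEY", top_k=1)
result = generator.predict(
    query="Who invented the World Wide Web?",
    documents=[Document(content="Tim Berners-Lee invented the World Wide Web in 1989.")],
)
print(result["answers"])
```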
@@ -45,7 +45,10 @@ The node allows multiple join modes:

 #### JoinDocuments.\_\_init\_\_

 ```python
-def __init__(join_mode: str = "concatenate", weights: Optional[List[float]] = None, top_k_join: Optional[int] = None, sort_by_score: bool = True)
+def __init__(join_mode: str = "concatenate",
+             weights: Optional[List[float]] = None,
+             top_k_join: Optional[int] = None,
+             sort_by_score: bool = True)
 ```

 **Arguments**:

@@ -79,7 +82,10 @@ A node to join `Answer`s produced by multiple `Reader` nodes.

 #### JoinAnswers.\_\_init\_\_

 ```python
-def __init__(join_mode: str = "concatenate", weights: Optional[List[float]] = None, top_k_join: Optional[int] = None, sort_by_score: bool = True)
+def __init__(join_mode: str = "concatenate",
+             weights: Optional[List[float]] = None,
+             top_k_join: Optional[int] = None,
+             sort_by_score: bool = True)
 ```

 **Arguments**:

@@ -114,7 +120,8 @@ different nodes.

 #### RouteDocuments.\_\_init\_\_

 ```python
-def __init__(split_by: str = "content_type", metadata_values: Optional[List[str]] = None)
+def __init__(split_by: str = "content_type",
+             metadata_values: Optional[List[str]] = None)
 ```

 **Arguments**:
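Not part of the diff: a sketch showing where `JoinDocuments` sits in a pipeline; `sparse_retriever` and `dense_retriever` are assumed to be already-constructed retriever nodes.

```python
from haystack import Pipeline
from haystack.nodes import JoinDocuments

pipe = Pipeline()
pipe.add_node(component=sparse_retriever, name="Sparse", inputs=["Query"])
pipe.add_node(component=dense_retriever, name="Dense", inputs=["Query"])
# Concatenate both result lists and keep the 10 highest-scoring documents.
pipe.add_node(component=JoinDocuments(join_mode="concatenate", top_k_join=10),
              name="Join", inputs=["Sparse", "Dense"])
result = pipe.run(query="What is Haystack?")
```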
@@ -42,7 +42,9 @@ Note that this also includes such components that are being utilized by other co

 #### Pipeline.to\_code

 ```python
-def to_code(pipeline_variable_name: str = "pipeline", generate_imports: bool = True, add_comment: bool = False) -> str
+def to_code(pipeline_variable_name: str = "pipeline",
+            generate_imports: bool = True,
+            add_comment: bool = False) -> str
 ```

 Returns the code to create this pipeline as string.

@@ -61,7 +63,9 @@ Default value is False.

 #### Pipeline.to\_notebook\_cell

 ```python
-def to_notebook_cell(pipeline_variable_name: str = "pipeline", generate_imports: bool = True, add_comment: bool = True)
+def to_notebook_cell(pipeline_variable_name: str = "pipeline",
+                     generate_imports: bool = True,
+                     add_comment: bool = True)
 ```

 Creates a new notebook cell with the code to create this pipeline.

@@ -81,7 +85,13 @@ Default value is True.

 ```python
 @classmethod
-def load_from_deepset_cloud(cls, pipeline_config_name: str, pipeline_name: str = "query", workspace: str = "default", api_key: Optional[str] = None, api_endpoint: Optional[str] = None, overwrite_with_env_variables: bool = False)
+def load_from_deepset_cloud(cls,
+                            pipeline_config_name: str,
+                            pipeline_name: str = "query",
+                            workspace: str = "default",
+                            api_key: Optional[str] = None,
+                            api_endpoint: Optional[str] = None,
+                            overwrite_with_env_variables: bool = False)
 ```

 Load Pipeline from Deepset Cloud defining the individual components and how they're tied together to form

@@ -114,7 +124,11 @@ variable 'READER_PARAMS_RETURN_NO_ANSWER=False' can be set. Note that an

 ```python
 @classmethod
-def list_pipelines_on_deepset_cloud(cls, workspace: str = "default", api_key: Optional[str] = None, api_endpoint: Optional[str] = None) -> List[dict]
+def list_pipelines_on_deepset_cloud(
+        cls,
+        workspace: str = "default",
+        api_key: Optional[str] = None,
+        api_endpoint: Optional[str] = None) -> List[dict]
 ```

 Lists all pipeline configs available on Deepset Cloud.
@@ -150,7 +164,14 @@ Returns:

 ```python
 @classmethod
-def save_to_deepset_cloud(cls, query_pipeline: Pipeline, index_pipeline: Pipeline, pipeline_config_name: str, workspace: str = "default", api_key: Optional[str] = None, api_endpoint: Optional[str] = None, overwrite: bool = False)
+def save_to_deepset_cloud(cls,
+                          query_pipeline: Pipeline,
+                          index_pipeline: Pipeline,
+                          pipeline_config_name: str,
+                          workspace: str = "default",
+                          api_key: Optional[str] = None,
+                          api_endpoint: Optional[str] = None,
+                          overwrite: bool = False)
 ```

 Saves a Pipeline config to Deepset Cloud defining the individual components and how they're tied together to form

@@ -175,7 +196,13 @@ If not specified, will be read from DEEPSET_CLOUD_API_ENDPOINT environment varia

 ```python
 @classmethod
-def deploy_on_deepset_cloud(cls, pipeline_config_name: str, workspace: str = "default", api_key: Optional[str] = None, api_endpoint: Optional[str] = None, timeout: int = 60, show_curl_message: bool = True)
+def deploy_on_deepset_cloud(cls,
+                            pipeline_config_name: str,
+                            workspace: str = "default",
+                            api_key: Optional[str] = None,
+                            api_endpoint: Optional[str] = None,
+                            timeout: int = 60,
+                            show_curl_message: bool = True)
 ```

 Deploys the pipelines of a pipeline config on Deepset Cloud.

@@ -205,7 +232,12 @@ If the timeout is exceeded an error will be raised.

 ```python
 @classmethod
-def undeploy_on_deepset_cloud(cls, pipeline_config_name: str, workspace: str = "default", api_key: Optional[str] = None, api_endpoint: Optional[str] = None, timeout: int = 60)
+def undeploy_on_deepset_cloud(cls,
+                              pipeline_config_name: str,
+                              workspace: str = "default",
+                              api_key: Optional[str] = None,
+                              api_endpoint: Optional[str] = None,
+                              timeout: int = 60)
 ```

 Undeploys the pipelines of a pipeline config on Deepset Cloud.
@@ -285,7 +317,13 @@ Set the component for a node in the Pipeline.

 #### Pipeline.run

 ```python
-def run(query: Optional[str] = None, file_paths: Optional[List[str]] = None, labels: Optional[MultiLabel] = None, documents: Optional[List[Document]] = None, meta: Optional[Union[dict, List[dict]]] = None, params: Optional[dict] = None, debug: Optional[bool] = None)
+def run(query: Optional[str] = None,
+        file_paths: Optional[List[str]] = None,
+        labels: Optional[MultiLabel] = None,
+        documents: Optional[List[Document]] = None,
+        meta: Optional[Union[dict, List[dict]]] = None,
+        params: Optional[dict] = None,
+        debug: Optional[bool] = None)
 ```

 Runs the Pipeline, one node at a time.

@@ -310,7 +348,15 @@ the Nodes received and the output they generated. You can then find all debug in

 #### Pipeline.run\_batch

 ```python
-def run_batch(queries: List[str] = None, file_paths: Optional[List[str]] = None, labels: Optional[Union[MultiLabel, List[MultiLabel]]] = None, documents: Optional[Union[List[Document], List[List[Document]]]] = None, meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None, params: Optional[dict] = None, debug: Optional[bool] = None)
+def run_batch(queries: List[str] = None,
+              file_paths: Optional[List[str]] = None,
+              labels: Optional[Union[MultiLabel, List[MultiLabel]]] = None,
+              documents: Optional[Union[List[Document],
+                                        List[List[Document]]]] = None,
+              meta: Optional[Union[Dict[str, Any], List[Dict[str,
+                                                             Any]]]] = None,
+              params: Optional[dict] = None,
+              debug: Optional[bool] = None)
 ```

 Runs the Pipeline in a batch mode, one node at a time. The batch mode means that the Pipeline can take more than one query as input. You can use this method for query pipelines only. When used with an indexing pipeline, it calls the pipeline `run()` method.
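Not part of the diff: a runnable sketch for `Pipeline.run` with per-node `params` and `debug` output, using an in-memory store and a TF-IDF retriever to stay self-contained.

```python
from haystack import Pipeline
from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import TfidfRetriever

document_store = InMemoryDocumentStore()
document_store.write_documents([{"content": "Arya Stark is the daughter of Ned Stark."}])

pipeline = Pipeline()
pipeline.add_node(component=TfidfRetriever(document_store=document_store),
                  name="Retriever", inputs=["Query"])

# params are addressed by node name; debug=True attaches per-node debug info.
output = pipeline.run(query="Who is Arya Stark's father?",
                      params={"Retriever": {"top_k": 3}},
                      debug=True)
print(output["documents"], output.get("_debug"))
```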
@@ -346,7 +392,18 @@ the Nodes received and the output they generated. You can then find all debug in

 ```python
 @classmethod
-def eval_beir(cls, index_pipeline: Pipeline, query_pipeline: Pipeline, index_params: dict = {}, query_params: dict = {}, dataset: str = "scifact", dataset_dir: Path = Path("."), top_k_values: List[int] = [1, 3, 5, 10, 100, 1000], keep_index: bool = False) -> Tuple[Dict[str, float], Dict[str, float], Dict[str, float], Dict[str, float]]
+def eval_beir(
+    cls,
+    index_pipeline: Pipeline,
+    query_pipeline: Pipeline,
+    index_params: dict = {},
+    query_params: dict = {},
+    dataset: str = "scifact",
+    dataset_dir: Path = Path("."),
+    top_k_values: List[int] = [1, 3, 5, 10, 100, 1000],
+    keep_index: bool = False
+) -> Tuple[Dict[str, float], Dict[str, float], Dict[str, float], Dict[str,
+                                                                       float]]
 ```

 Runs information retrieval evaluation of a pipeline using BEIR on a specified BEIR dataset.

@@ -375,14 +432,38 @@ Each metric is represented by a dictionary containing the scores for each top_k

 ```python
 @classmethod
-def execute_eval_run(cls, index_pipeline: Pipeline, query_pipeline: Pipeline, evaluation_set_labels: List[MultiLabel], corpus_file_paths: List[str], experiment_name: str, experiment_run_name: str, experiment_tracking_tool: Literal["mlflow", None] = None, experiment_tracking_uri: Optional[str] = None, corpus_file_metas: List[Dict[str, Any]] = None, corpus_meta: Dict[str, Any] = {}, evaluation_set_meta: Dict[str, Any] = {}, pipeline_meta: Dict[str, Any] = {}, index_params: dict = {}, query_params: dict = {}, sas_model_name_or_path: str = None, sas_batch_size: int = 32, sas_use_gpu: bool = True, use_batch_mode: bool = False, add_isolated_node_eval: bool = False, reuse_index: bool = False, custom_document_id_field: Optional[str] = None, document_scope: Literal[
-    "document_id",
-    "context",
-    "document_id_and_context",
-    "document_id_or_context",
-    "answer",
-    "document_id_or_answer",
-] = "document_id_or_answer", answer_scope: Literal["any", "context", "document_id", "document_id_and_context"] = "any", context_matching_min_length: int = 100, context_matching_boost_split_overlaps: bool = True, context_matching_threshold: float = 65.0) -> EvaluationResult
+def execute_eval_run(
+        cls,
+        index_pipeline: Pipeline,
+        query_pipeline: Pipeline,
+        evaluation_set_labels: List[MultiLabel],
+        corpus_file_paths: List[str],
+        experiment_name: str,
+        experiment_run_name: str,
+        experiment_tracking_tool: Literal["mlflow", None] = None,
+        experiment_tracking_uri: Optional[str] = None,
+        corpus_file_metas: List[Dict[str, Any]] = None,
+        corpus_meta: Dict[str, Any] = {},
+        evaluation_set_meta: Dict[str, Any] = {},
+        pipeline_meta: Dict[str, Any] = {},
+        index_params: dict = {},
+        query_params: dict = {},
+        sas_model_name_or_path: str = None,
+        sas_batch_size: int = 32,
+        sas_use_gpu: bool = True,
+        use_batch_mode: bool = False,
+        add_isolated_node_eval: bool = False,
+        reuse_index: bool = False,
+        custom_document_id_field: Optional[str] = None,
+        document_scope: Literal[
+            "document_id", "context", "document_id_and_context",
+            "document_id_or_context", "answer",
+            "document_id_or_answer", ] = "document_id_or_answer",
+        answer_scope: Literal["any", "context", "document_id",
+                              "document_id_and_context"] = "any",
+        context_matching_min_length: int = 100,
+        context_matching_boost_split_overlaps: bool = True,
+        context_matching_threshold: float = 65.0) -> EvaluationResult
 ```

 Starts an experiment run that first indexes the specified files (forming a corpus) using the index pipeline
@@ -510,7 +591,19 @@ Thus [AB] <-> [BC] (score ~50) gets recalculated with B <-> B (score ~100) scori

 ```python
 @send_event
-def eval(labels: List[MultiLabel], documents: Optional[List[List[Document]]] = None, params: Optional[dict] = None, sas_model_name_or_path: Optional[str] = None, sas_batch_size: int = 32, sas_use_gpu: bool = True, add_isolated_node_eval: bool = False, custom_document_id_field: Optional[str] = None, context_matching_min_length: int = 100, context_matching_boost_split_overlaps: bool = True, context_matching_threshold: float = 65.0, use_auth_token: Optional[Union[str, bool]] = None) -> EvaluationResult
+def eval(
+    labels: List[MultiLabel],
+    documents: Optional[List[List[Document]]] = None,
+    params: Optional[dict] = None,
+    sas_model_name_or_path: Optional[str] = None,
+    sas_batch_size: int = 32,
+    sas_use_gpu: bool = True,
+    add_isolated_node_eval: bool = False,
+    custom_document_id_field: Optional[str] = None,
+    context_matching_min_length: int = 100,
+    context_matching_boost_split_overlaps: bool = True,
+    context_matching_threshold: float = 65.0,
+    use_auth_token: Optional[Union[str, bool]] = None) -> EvaluationResult
 ```

 Evaluates the pipeline by running the pipeline once per query in debug mode

@@ -576,7 +669,19 @@ https://huggingface.co/transformers/main_classes/model.html#transformers.PreTrai

 ```python
 @send_event
-def eval_batch(labels: List[MultiLabel], documents: Optional[List[List[Document]]] = None, params: Optional[dict] = None, sas_model_name_or_path: Optional[str] = None, sas_batch_size: int = 32, sas_use_gpu: bool = True, add_isolated_node_eval: bool = False, custom_document_id_field: Optional[str] = None, context_matching_min_length: int = 100, context_matching_boost_split_overlaps: bool = True, context_matching_threshold: float = 65.0, use_auth_token: Optional[Union[str, bool]] = None) -> EvaluationResult
+def eval_batch(
+    labels: List[MultiLabel],
+    documents: Optional[List[List[Document]]] = None,
+    params: Optional[dict] = None,
+    sas_model_name_or_path: Optional[str] = None,
+    sas_batch_size: int = 32,
+    sas_use_gpu: bool = True,
+    add_isolated_node_eval: bool = False,
+    custom_document_id_field: Optional[str] = None,
+    context_matching_min_length: int = 100,
+    context_matching_boost_split_overlaps: bool = True,
+    context_matching_threshold: float = 65.0,
+    use_auth_token: Optional[Union[str, bool]] = None) -> EvaluationResult
 ```

 Evaluates the pipeline by running it in batches in the debug mode
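Not part of the diff: a sketch for `Pipeline.eval`, assuming a query pipeline and a list of `MultiLabel` objects (`eval_labels`) already exist.

```python
# eval() runs the pipeline once per label in debug mode and aggregates metrics.
eval_result = pipeline.eval(
    labels=eval_labels,
    params={"Retriever": {"top_k": 10}},
    sas_model_name_or_path="sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
)
print(eval_result.calculate_metrics())
```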
@@ -690,7 +795,11 @@ Create a Graphviz visualization of the pipeline.

 ```python
 @classmethod
-def load_from_yaml(cls, path: Path, pipeline_name: Optional[str] = None, overwrite_with_env_variables: bool = True, strict_version_check: bool = False)
+def load_from_yaml(cls,
+                   path: Path,
+                   pipeline_name: Optional[str] = None,
+                   overwrite_with_env_variables: bool = True,
+                   strict_version_check: bool = False)
 ```

 Load Pipeline from a YAML file defining the individual components and how they're tied together to form

@@ -747,7 +856,11 @@ variable 'MYDOCSTORE_PARAMS_INDEX=documents-2021' can be set. Note that an

 ```python
 @classmethod
-def load_from_config(cls, pipeline_config: Dict, pipeline_name: Optional[str] = None, overwrite_with_env_variables: bool = True, strict_version_check: bool = False)
+def load_from_config(cls,
+                     pipeline_config: Dict,
+                     pipeline_name: Optional[str] = None,
+                     overwrite_with_env_variables: bool = True,
+                     strict_version_check: bool = False)
 ```

 Load Pipeline from a config dict defining the individual components and how they're tied together to form

@@ -832,14 +945,19 @@ Returns a configuration for the Pipeline that can be used with `Pipeline.load_fr

 #### Pipeline.print\_eval\_report

 ```python
-def print_eval_report(eval_result: EvaluationResult, n_wrong_examples: int = 3, metrics_filter: Optional[Dict[str, List[str]]] = None, document_scope: Literal[
-    "document_id",
-    "context",
-    "document_id_and_context",
-    "document_id_or_context",
-    "answer",
-    "document_id_or_answer",
-] = "document_id_or_answer", answer_scope: Literal["any", "context", "document_id", "document_id_and_context"] = "any", wrong_examples_fields: List[str] = ["answer", "context", "document_id"], max_characters_per_field: int = 150)
+def print_eval_report(eval_result: EvaluationResult,
+                      n_wrong_examples: int = 3,
+                      metrics_filter: Optional[Dict[str, List[str]]] = None,
+                      document_scope: Literal[
+                          "document_id", "context", "document_id_and_context",
+                          "document_id_or_context", "answer",
+                          "document_id_or_answer", ] = "document_id_or_answer",
+                      answer_scope: Literal["any", "context", "document_id",
+                                            "document_id_and_context"] = "any",
+                      wrong_examples_fields: List[str] = [
+                          "answer", "context", "document_id"
+                      ],
+                      max_characters_per_field: int = 150)
 ```

 Prints evaluation report containing a metrics funnel and worst queries for further analysis.
@@ -892,7 +1010,8 @@ class _HaystackBeirRetrieverAdapter()

 #### \_HaystackBeirRetrieverAdapter.\_\_init\_\_

 ```python
-def __init__(index_pipeline: Pipeline, query_pipeline: Pipeline, index_params: dict, query_params: dict)
+def __init__(index_pipeline: Pipeline, query_pipeline: Pipeline,
+             index_params: dict, query_params: dict)
 ```

 Adapter mimicking a BEIR retriever used by BEIR's EvaluateRetrieval class to run BEIR evaluations on Haystack Pipelines.

@@ -959,7 +1078,9 @@ YAML definitions of Ray pipelines are validated at load. For more information, s

 #### RayPipeline.\_\_init\_\_

 ```python
-def __init__(address: str = None, ray_args: Optional[Dict[str, Any]] = None, serve_args: Optional[Dict[str, Any]] = None)
+def __init__(address: str = None,
+             ray_args: Optional[Dict[str, Any]] = None,
+             serve_args: Optional[Dict[str, Any]] = None)
 ```

 **Arguments**:

@@ -974,7 +1095,14 @@ def __init__(address: str = None, ray_args: Optional[Dict[str, Any]] = None, ser

 ```python
 @classmethod
-def load_from_yaml(cls, path: Path, pipeline_name: Optional[str] = None, overwrite_with_env_variables: bool = True, address: Optional[str] = None, strict_version_check: bool = False, ray_args: Optional[Dict[str, Any]] = None, serve_args: Optional[Dict[str, Any]] = None)
+def load_from_yaml(cls,
+                   path: Path,
+                   pipeline_name: Optional[str] = None,
+                   overwrite_with_env_variables: bool = True,
+                   address: Optional[str] = None,
+                   strict_version_check: bool = False,
+                   ray_args: Optional[Dict[str, Any]] = None,
+                   serve_args: Optional[Dict[str, Any]] = None)
 ```

 Load Pipeline from a YAML file defining the individual components and how they're tied together to form
@@ -1189,7 +1317,10 @@ Save a YAML configuration for the Pipeline that can be used with `Pipeline.load_

 ```python
 @classmethod
-def load_from_yaml(cls, path: Path, pipeline_name: Optional[str] = None, overwrite_with_env_variables: bool = True)
+def load_from_yaml(cls,
+                   path: Path,
+                   pipeline_name: Optional[str] = None,
+                   overwrite_with_env_variables: bool = True)
 ```

 Load Pipeline from a YAML file defining the individual components and how they're tied together to form

@@ -1277,7 +1408,16 @@ Instance of DocumentStore or None

 #### BaseStandardPipeline.eval

 ```python
-def eval(labels: List[MultiLabel], params: Optional[dict] = None, sas_model_name_or_path: Optional[str] = None, sas_batch_size: int = 32, sas_use_gpu: bool = True, add_isolated_node_eval: bool = False, custom_document_id_field: Optional[str] = None, context_matching_min_length: int = 100, context_matching_boost_split_overlaps: bool = True, context_matching_threshold: float = 65.0) -> EvaluationResult
+def eval(labels: List[MultiLabel],
+         params: Optional[dict] = None,
+         sas_model_name_or_path: Optional[str] = None,
+         sas_batch_size: int = 32,
+         sas_use_gpu: bool = True,
+         add_isolated_node_eval: bool = False,
+         custom_document_id_field: Optional[str] = None,
+         context_matching_min_length: int = 100,
+         context_matching_boost_split_overlaps: bool = True,
+         context_matching_threshold: float = 65.0) -> EvaluationResult
 ```

 Evaluates the pipeline by running the pipeline once per query in debug mode

@@ -1318,7 +1458,16 @@ Thus [AB] <-> [BC] (score ~50) gets recalculated with B <-> B (score ~100) scori

 #### BaseStandardPipeline.eval\_batch

 ```python
-def eval_batch(labels: List[MultiLabel], params: Optional[dict] = None, sas_model_name_or_path: Optional[str] = None, sas_batch_size: int = 32, sas_use_gpu: bool = True, add_isolated_node_eval: bool = False, custom_document_id_field: Optional[str] = None, context_matching_min_length: int = 100, context_matching_boost_split_overlaps: bool = True, context_matching_threshold: float = 65.0) -> EvaluationResult
+def eval_batch(labels: List[MultiLabel],
+               params: Optional[dict] = None,
+               sas_model_name_or_path: Optional[str] = None,
+               sas_batch_size: int = 32,
+               sas_use_gpu: bool = True,
+               add_isolated_node_eval: bool = False,
+               custom_document_id_field: Optional[str] = None,
+               context_matching_min_length: int = 100,
+               context_matching_boost_split_overlaps: bool = True,
+               context_matching_threshold: float = 65.0) -> EvaluationResult
 ```

 Evaluates the pipeline by running the pipeline once per query in the debug mode

@@ -1358,14 +1507,19 @@ To calculate SAS (Semantic Answer Similarity) metrics, specify `sas_model_name_o

 #### BaseStandardPipeline.print\_eval\_report

 ```python
-def print_eval_report(eval_result: EvaluationResult, n_wrong_examples: int = 3, metrics_filter: Optional[Dict[str, List[str]]] = None, document_scope: Literal[
-    "document_id",
-    "context",
-    "document_id_and_context",
-    "document_id_or_context",
-    "answer",
-    "document_id_or_answer",
-] = "document_id_or_answer", answer_scope: Literal["any", "context", "document_id", "document_id_and_context"] = "any", wrong_examples_fields: List[str] = ["answer", "context", "document_id"], max_characters_per_field: int = 150)
+def print_eval_report(eval_result: EvaluationResult,
+                      n_wrong_examples: int = 3,
+                      metrics_filter: Optional[Dict[str, List[str]]] = None,
+                      document_scope: Literal[
+                          "document_id", "context", "document_id_and_context",
+                          "document_id_or_context", "answer",
+                          "document_id_or_answer", ] = "document_id_or_answer",
+                      answer_scope: Literal["any", "context", "document_id",
+                                            "document_id_and_context"] = "any",
+                      wrong_examples_fields: List[str] = [
+                          "answer", "context", "document_id"
+                      ],
+                      max_characters_per_field: int = 150)
 ```

 Prints evaluation report containing a metrics funnel and worst queries for further analysis.
@ -1410,7 +1564,9 @@ In Question Answering, to enforce that the retrieved document is considered corr
|
||||
#### BaseStandardPipeline.run\_batch
|
||||
|
||||
```python
|
||||
def run_batch(queries: List[str], params: Optional[dict] = None, debug: Optional[bool] = None)
|
||||
def run_batch(queries: List[str],
|
||||
params: Optional[dict] = None,
|
||||
debug: Optional[bool] = None)
|
||||
```
|
||||
|
||||
Run a batch of queries through the pipeline.
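A hedged sketch of a typical batch call (the pipeline, queries, and output keys here are assumptions, not taken from this diff):

```python
from haystack.pipelines import DocumentSearchPipeline

# `retriever` is assumed to be an already-initialized retriever node.
pipeline = DocumentSearchPipeline(retriever=retriever)

results = pipeline.run_batch(
    queries=["Who lives in Berlin?", "Who lives in Paris?"],
    params={"Retriever": {"top_k": 3}},
)
# Output keys assumed: "queries" and "documents" (one entry per input query).
for query, docs in zip(results["queries"], results["documents"]):
    print(query, [d.content for d in docs])
```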
@ -1454,7 +1610,9 @@ def __init__(reader: BaseReader, retriever: BaseRetriever)
#### ExtractiveQAPipeline.run

```python
def run(query: str, params: Optional[dict] = None, debug: Optional[bool] = None)
def run(query: str,
        params: Optional[dict] = None,
        debug: Optional[bool] = None)
```

**Arguments**:
@ -1495,7 +1653,9 @@ def __init__(retriever: BaseRetriever)
#### DocumentSearchPipeline.run

```python
def run(query: str, params: Optional[dict] = None, debug: Optional[bool] = None)
def run(query: str,
        params: Optional[dict] = None,
        debug: Optional[bool] = None)
```

**Arguments**:
@ -1536,7 +1696,9 @@ def __init__(generator: BaseGenerator, retriever: BaseRetriever)
#### GenerativeQAPipeline.run

```python
def run(query: str, params: Optional[dict] = None, debug: Optional[bool] = None)
def run(query: str,
        params: Optional[dict] = None,
        debug: Optional[bool] = None)
```

**Arguments**:
@ -1565,7 +1727,9 @@ Pipeline that retrieves documents for a query and then summarizes those document
#### SearchSummarizationPipeline.\_\_init\_\_

```python
def __init__(summarizer: BaseSummarizer, retriever: BaseRetriever, return_in_answer_format: bool = False)
def __init__(summarizer: BaseSummarizer,
             retriever: BaseRetriever,
             return_in_answer_format: bool = False)
```

**Arguments**:
@ -1581,7 +1745,9 @@ pipeline as a "drop-in replacement" for other QA pipelines.
#### SearchSummarizationPipeline.run

```python
def run(query: str, params: Optional[dict] = None, debug: Optional[bool] = None)
def run(query: str,
        params: Optional[dict] = None,
        debug: Optional[bool] = None)
```

**Arguments**:
@ -1600,7 +1766,9 @@ by this method under the key "_debug"
#### SearchSummarizationPipeline.run\_batch

```python
def run_batch(queries: List[str], params: Optional[dict] = None, debug: Optional[bool] = None)
def run_batch(queries: List[str],
              params: Optional[dict] = None,
              debug: Optional[bool] = None)
```

Run a batch of queries through the pipeline.
@ -1643,7 +1811,9 @@ def __init__(retriever: BaseRetriever)
#### FAQPipeline.run

```python
def run(query: str, params: Optional[dict] = None, debug: Optional[bool] = None)
def run(query: str,
        params: Optional[dict] = None,
        debug: Optional[bool] = None)
```

**Arguments**:
@ -1672,7 +1842,9 @@ Takes an existing search pipeline and adds one "input translation node" after th
#### TranslationWrapperPipeline.\_\_init\_\_

```python
def __init__(input_translator: BaseTranslator, output_translator: BaseTranslator, pipeline: BaseStandardPipeline)
def __init__(input_translator: BaseTranslator,
             output_translator: BaseTranslator,
             pipeline: BaseStandardPipeline)
```

Wrap a given `pipeline` with the `input_translator` and `output_translator`.
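As a hedged sketch (the translator class and model names are assumptions for illustration; any `BaseTranslator` pair works):

```python
from haystack.nodes import TransformersTranslator
from haystack.pipelines import TranslationWrapperPipeline

# Translate German queries to English and English answers back to German.
in_translator = TransformersTranslator(model_name_or_path="Helsinki-NLP/opus-mt-de-en")
out_translator = TransformersTranslator(model_name_or_path="Helsinki-NLP/opus-mt-en-de")

# `qa_pipeline` is assumed to be an existing BaseStandardPipeline instance.
wrapped = TranslationWrapperPipeline(input_translator=in_translator,
                                     output_translator=out_translator,
                                     pipeline=qa_pipeline)
result = wrapped.run(query="Wer lebt in Berlin?")
```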

@ -16,7 +16,16 @@ class BasePreProcessor(BaseComponent)

```python
@abstractmethod
def process(documents: Union[dict, Document, List[Union[dict, Document]]], clean_whitespace: Optional[bool] = True, clean_header_footer: Optional[bool] = False, clean_empty_lines: Optional[bool] = True, remove_substrings: List[str] = [], split_by: Optional[str] = "word", split_length: Optional[int] = 1000, split_overlap: Optional[int] = None, split_respect_sentence_boundary: Optional[bool] = True, id_hash_keys: Optional[List[str]] = None) -> List[Document]
def process(documents: Union[dict, Document, List[Union[dict, Document]]],
            clean_whitespace: Optional[bool] = True,
            clean_header_footer: Optional[bool] = False,
            clean_empty_lines: Optional[bool] = True,
            remove_substrings: List[str] = [],
            split_by: Optional[str] = "word",
            split_length: Optional[int] = 1000,
            split_overlap: Optional[int] = None,
            split_respect_sentence_boundary: Optional[bool] = True,
            id_hash_keys: Optional[List[str]] = None) -> List[Document]
```

Perform document cleaning and splitting. Takes a single Document or a List of Documents as input and returns a
@ -39,7 +48,19 @@ class PreProcessor(BasePreProcessor)
#### PreProcessor.\_\_init\_\_

```python
def __init__(clean_whitespace: bool = True, clean_header_footer: bool = False, clean_empty_lines: bool = True, remove_substrings: List[str] = [], split_by: str = "word", split_length: int = 200, split_overlap: int = 0, split_respect_sentence_boundary: bool = True, tokenizer_model_folder: Optional[Union[str, Path]] = None, language: str = "en", id_hash_keys: Optional[List[str]] = None, progress_bar: bool = True, add_page_number: bool = False)
def __init__(clean_whitespace: bool = True,
             clean_header_footer: bool = False,
             clean_empty_lines: bool = True,
             remove_substrings: List[str] = [],
             split_by: str = "word",
             split_length: int = 200,
             split_overlap: int = 0,
             split_respect_sentence_boundary: bool = True,
             tokenizer_model_folder: Optional[Union[str, Path]] = None,
             language: str = "en",
             id_hash_keys: Optional[List[str]] = None,
             progress_bar: bool = True,
             add_page_number: bool = False)
```

**Arguments**:
@ -80,7 +101,16 @@ in between pages by `PDFToTextConverter`, `TikaConverter`, `ParsrConverter` and
#### PreProcessor.process

```python
def process(documents: Union[dict, Document, List[Union[dict, Document]]], clean_whitespace: Optional[bool] = None, clean_header_footer: Optional[bool] = None, clean_empty_lines: Optional[bool] = None, remove_substrings: List[str] = [], split_by: Optional[str] = None, split_length: Optional[int] = None, split_overlap: Optional[int] = None, split_respect_sentence_boundary: Optional[bool] = None, id_hash_keys: Optional[List[str]] = None) -> List[Document]
def process(documents: Union[dict, Document, List[Union[dict, Document]]],
            clean_whitespace: Optional[bool] = None,
            clean_header_footer: Optional[bool] = None,
            clean_empty_lines: Optional[bool] = None,
            remove_substrings: List[str] = [],
            split_by: Optional[str] = None,
            split_length: Optional[int] = None,
            split_overlap: Optional[int] = None,
            split_respect_sentence_boundary: Optional[bool] = None,
            id_hash_keys: Optional[List[str]] = None) -> List[Document]
```

Perform document cleaning and splitting. Takes a single document or a list of documents as input and returns a list of documents.
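A minimal sketch built only from the parameters shown above (the input text is illustrative):

```python
from haystack.nodes import PreProcessor
from haystack.schema import Document

preprocessor = PreProcessor(
    clean_whitespace=True,
    clean_empty_lines=True,
    split_by="word",
    split_length=200,
    split_overlap=20,
    split_respect_sentence_boundary=True,
)
# Returns roughly one Document per 200-word window, with 20 words of overlap.
docs = preprocessor.process(documents=[Document(content="Some long text ...")])
print(len(docs))
```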
@ -90,7 +120,12 @@ Perform document cleaning and splitting. Can take a single document or a list of
#### PreProcessor.clean

```python
def clean(document: Union[dict, Document], clean_whitespace: bool, clean_header_footer: bool, clean_empty_lines: bool, remove_substrings: List[str], id_hash_keys: Optional[List[str]] = None) -> Document
def clean(document: Union[dict, Document],
          clean_whitespace: bool,
          clean_header_footer: bool,
          clean_empty_lines: bool,
          remove_substrings: List[str],
          id_hash_keys: Optional[List[str]] = None) -> Document
```

Perform document cleaning on a single document and return a single document. This method will deal with whitespace, headers, footers
@ -101,7 +136,12 @@ and empty lines. Its exact functionality is defined by the parameters passed int
#### PreProcessor.split

```python
def split(document: Union[dict, Document], split_by: str, split_length: int, split_overlap: int, split_respect_sentence_boundary: bool, id_hash_keys: Optional[List[str]] = None) -> List[Document]
def split(document: Union[dict, Document],
          split_by: str,
          split_length: int,
          split_overlap: int,
          split_respect_sentence_boundary: bool,
          id_hash_keys: Optional[List[str]] = None) -> List[Document]
```

Perform document splitting on a single document. This method can split on different units, at different lengths,

@ -16,7 +16,13 @@ class Document()
#### Document.\_\_init\_\_

```python
def __init__(content: Union[str, pd.DataFrame], content_type: Literal["text", "table", "image", "audio"] = "text", id: Optional[str] = None, score: Optional[float] = None, meta: Dict[str, Any] = None, embedding: Optional[np.ndarray] = None, id_hash_keys: Optional[List[str]] = None)
def __init__(content: Union[str, pd.DataFrame],
             content_type: Literal["text", "table", "image", "audio"] = "text",
             id: Optional[str] = None,
             score: Optional[float] = None,
             meta: Dict[str, Any] = None,
             embedding: Optional[np.ndarray] = None,
             id_hash_keys: Optional[List[str]] = None)
```

One of the core data classes in Haystack. It's used to represent documents / passages in a standardized way within Haystack.
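For illustration, a minimal construction using only the parameters listed above (the metadata values are assumptions):

```python
from haystack.schema import Document

doc = Document(
    content="Berlin is the capital of Germany.",
    content_type="text",
    meta={"source": "wiki", "year": 2022},
)
print(doc.id)  # by default, a stable hash derived from the content
```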
@ -79,7 +85,10 @@ dict with content of the Document

```python
@classmethod
def from_dict(cls, dict: Dict[str, Any], field_map: Dict[str, Any] = {}, id_hash_keys: Optional[List[str]] = None) -> Document
def from_dict(cls,
              dict: Dict[str, Any],
              field_map: Dict[str, Any] = {},
              id_hash_keys: Optional[List[str]] = None) -> Document
```

Create Document from dict. An optional field_map can be supplied to adjust for custom names of the keys in the
@ -229,7 +238,19 @@ class Label()
#### Label.\_\_init\_\_

```python
def __init__(query: str, document: Document, is_correct_answer: bool, is_correct_document: bool, origin: Literal["user-feedback", "gold-label"], answer: Optional[Answer], id: Optional[str] = None, no_answer: Optional[bool] = None, pipeline_id: Optional[str] = None, created_at: Optional[str] = None, updated_at: Optional[str] = None, meta: Optional[dict] = None, filters: Optional[dict] = None)
def __init__(query: str,
             document: Document,
             is_correct_answer: bool,
             is_correct_document: bool,
             origin: Literal["user-feedback", "gold-label"],
             answer: Optional[Answer],
             id: Optional[str] = None,
             no_answer: Optional[bool] = None,
             pipeline_id: Optional[str] = None,
             created_at: Optional[str] = None,
             updated_at: Optional[str] = None,
             meta: Optional[dict] = None,
             filters: Optional[dict] = None)
```

Object used to represent label/feedback in a standardized way within Haystack.
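A sketch of a user-feedback label, assuming the data shown; it flags a returned answer as wrong while confirming the document was relevant:

```python
from haystack.schema import Answer, Document, Label

feedback = Label(
    query="Who lives in Berlin?",
    document=Document(content="My name is Carla and I live in Berlin."),
    answer=Answer(answer="Berlin"),   # the answer the user rejected
    is_correct_answer=False,
    is_correct_document=True,
    origin="user-feedback",
)
```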
@ -272,7 +293,10 @@ class MultiLabel()
#### MultiLabel.\_\_init\_\_

```python
def __init__(labels: List[Label], drop_negative_labels=False, drop_no_answers=False, **kwargs)
def __init__(labels: List[Label],
             drop_negative_labels=False,
             drop_no_answers=False,
             **kwargs)
```

There are often multiple `Labels` associated with a single query. For example, there can be multiple annotated
@ -382,14 +406,17 @@ The DataFrames have the following schema:
#### EvaluationResult.calculate\_metrics

```python
def calculate_metrics(simulated_top_k_reader: int = -1, simulated_top_k_retriever: int = -1, document_scope: Literal[
    "document_id",
    "context",
    "document_id_and_context",
    "document_id_or_context",
    "answer",
    "document_id_or_answer",
] = "document_id_or_answer", eval_mode: Literal["integrated", "isolated"] = "integrated", answer_scope: Literal["any", "context", "document_id", "document_id_and_context"] = "any") -> Dict[str, Dict[str, float]]
def calculate_metrics(
        simulated_top_k_reader: int = -1,
        simulated_top_k_retriever: int = -1,
        document_scope: Literal[
            "document_id", "context", "document_id_and_context",
            "document_id_or_context", "answer",
            "document_id_or_answer", ] = "document_id_or_answer",
        eval_mode: Literal["integrated", "isolated"] = "integrated",
        answer_scope: Literal["any", "context", "document_id",
                              "document_id_and_context"] = "any"
) -> Dict[str, Dict[str, float]]
```

Calculates proper metrics for each node.
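A hedged sketch of reading the per-node metrics dict; the node names and metric keys below are assumptions that depend on the evaluated pipeline:

```python
# `eval_result` is an EvaluationResult, e.g. as returned by pipeline.eval().
metrics = eval_result.calculate_metrics(document_scope="document_id")

# Keys are illustrative; which metrics exist depends on the pipeline's nodes.
print(metrics["Retriever"]["recall_single_hit"])
print(metrics["Reader"]["f1"])
```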
@ -457,14 +484,23 @@ In Question Answering, to enforce that the retrieved document is considered corr
#### EvaluationResult.wrong\_examples

```python
def wrong_examples(node: str, n: int = 3, simulated_top_k_reader: int = -1, simulated_top_k_retriever: int = -1, document_scope: Literal[
    "document_id",
    "context",
    "document_id_and_context",
    "document_id_or_context",
    "answer",
    "document_id_or_answer",
] = "document_id_or_answer", document_metric: str = "recall_single_hit", answer_metric: str = "f1", document_metric_threshold: float = 0.5, answer_metric_threshold: float = 0.5, eval_mode: Literal["integrated", "isolated"] = "integrated", answer_scope: Literal["any", "context", "document_id", "document_id_and_context"] = "any") -> List[Dict]
def wrong_examples(
        node: str,
        n: int = 3,
        simulated_top_k_reader: int = -1,
        simulated_top_k_retriever: int = -1,
        document_scope: Literal[
            "document_id", "context", "document_id_and_context",
            "document_id_or_context", "answer",
            "document_id_or_answer", ] = "document_id_or_answer",
        document_metric: str = "recall_single_hit",
        answer_metric: str = "f1",
        document_metric_threshold: float = 0.5,
        answer_metric_threshold: float = 0.5,
        eval_mode: Literal["integrated", "isolated"] = "integrated",
        answer_scope: Literal["any", "context", "document_id",
                              "document_id_and_context"] = "any"
) -> List[Dict]
```

Returns the worst performing queries.
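A short sketch; the node name "Reader" is an assumption about the evaluated pipeline's graph:

```python
# Show the three queries where the Reader scored worst on answer F1.
worst = eval_result.wrong_examples(node="Reader", n=3, answer_metric="f1")
for example in worst:
    print(example)
```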

@ -53,7 +53,17 @@ For example:
#### PseudoLabelGenerator.\_\_init\_\_

```python
def __init__(question_producer: Union[QuestionGenerator, List[Dict[str, str]]], retriever: BaseRetriever, cross_encoder_model_name_or_path: str = "cross-encoder/ms-marco-MiniLM-L-6-v2", max_questions_per_document: int = 3, top_k: int = 50, batch_size: int = 16, progress_bar: bool = True, use_auth_token: Optional[Union[str, bool]] = None, use_gpu: bool = True, devices: Optional[List[Union[str, torch.device]]] = None)
def __init__(question_producer: Union[QuestionGenerator, List[Dict[str, str]]],
             retriever: BaseRetriever,
             cross_encoder_model_name_or_path:
             str = "cross-encoder/ms-marco-MiniLM-L-6-v2",
             max_questions_per_document: int = 3,
             top_k: int = 50,
             batch_size: int = 16,
             progress_bar: bool = True,
             use_auth_token: Optional[Union[str, bool]] = None,
             use_gpu: bool = True,
             devices: Optional[List[Union[str, torch.device]]] = None)
```

Loads the cross-encoder model and prepares PseudoLabelGenerator.
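A hedged end-to-end sketch; `retriever`, `docs`, and the output key name are assumptions about the GPL workflow, not taken from this diff:

```python
from haystack.nodes import PseudoLabelGenerator, QuestionGenerator

# `retriever` is an already-initialized dense retriever; `docs` is a
# list of Document objects to generate pseudo labels from.
plg = PseudoLabelGenerator(question_producer=QuestionGenerator(),
                           retriever=retriever)
output, _ = plg.generate_pseudo_labels(documents=docs)
gpl_labels = output["gpl_labels"]  # key name assumed from the GPL workflow
```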
@ -84,7 +94,9 @@ parameter is not used and a single cpu device is used for inference.
#### PseudoLabelGenerator.generate\_questions

```python
def generate_questions(documents: List[Document], batch_size: Optional[int] = None) -> List[Dict[str, str]]
def generate_questions(
        documents: List[Document],
        batch_size: Optional[int] = None) -> List[Dict[str, str]]
```

It takes a list of documents and generates a list of question-document pairs.
@ -103,7 +115,8 @@ A list of question-document pairs.
#### PseudoLabelGenerator.mine\_negatives

```python
def mine_negatives(question_doc_pairs: List[Dict[str, str]], batch_size: Optional[int] = None) -> List[Dict[str, str]]
def mine_negatives(question_doc_pairs: List[Dict[str, str]],
                   batch_size: Optional[int] = None) -> List[Dict[str, str]]
```

Given a list of question and positive document pairs, this function returns a list of question/positive document/negative document
@ -125,7 +138,8 @@ and negative document.
#### PseudoLabelGenerator.generate\_margin\_scores

```python
def generate_margin_scores(mined_negatives: List[Dict[str, str]], batch_size: Optional[int] = None) -> List[Dict]
def generate_margin_scores(mined_negatives: List[Dict[str, str]],
                           batch_size: Optional[int] = None) -> List[Dict]
```

Given a list of mined negatives, this function predicts the score margin between the positive and negative document using
@ -157,7 +171,9 @@ A list of dictionaries, each of which has the following keys:
#### PseudoLabelGenerator.generate\_pseudo\_labels

```python
def generate_pseudo_labels(documents: List[Document], batch_size: Optional[int] = None) -> Tuple[dict, str]
def generate_pseudo_labels(
        documents: List[Document],
        batch_size: Optional[int] = None) -> Tuple[dict, str]
```

Given a list of documents, this function generates a list of question-document pairs, mines for negatives, and

@ -69,11 +69,17 @@ and the further processing can be customized. You can define this by connecting
#### SklearnQueryClassifier.\_\_init\_\_

```python
def __init__(model_name_or_path: Union[
    str, Any
] = "https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier/model.pickle", vectorizer_name_or_path: Union[
    str, Any
] = "https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier/vectorizer.pickle", batch_size: Optional[int] = None, progress_bar: bool = True)
def __init__(
    model_name_or_path:
    Union[
        str,
        Any] = "https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier/model.pickle",
    vectorizer_name_or_path:
    Union[
        str,
        Any] = "https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier/vectorizer.pickle",
    batch_size: Optional[int] = None,
    progress_bar: bool = True)
```

**Arguments**:
@ -144,7 +150,17 @@ This node also supports zero-shot-classification.
#### TransformersQueryClassifier.\_\_init\_\_

```python
def __init__(model_name_or_path: Union[Path, str] = "shahrukhx01/bert-mini-finetune-question-detection", model_version: Optional[str] = None, tokenizer: Optional[str] = None, use_gpu: bool = True, task: str = "text-classification", labels: List[str] = DEFAULT_LABELS, batch_size: int = 16, progress_bar: bool = True, use_auth_token: Optional[Union[str, bool]] = None, devices: Optional[List[Union[str, torch.device]]] = None)
def __init__(model_name_or_path: Union[
    Path, str] = "shahrukhx01/bert-mini-finetune-question-detection",
             model_version: Optional[str] = None,
             tokenizer: Optional[str] = None,
             use_gpu: bool = True,
             task: str = "text-classification",
             labels: List[str] = DEFAULT_LABELS,
             batch_size: int = 16,
             progress_bar: bool = True,
             use_auth_token: Optional[Union[str, bool]] = None,
             devices: Optional[List[Union[str, torch.device]]] = None)
```

**Arguments**:

@ -23,7 +23,23 @@ come from earlier in the document.
#### QuestionGenerator.\_\_init\_\_

```python
def __init__(model_name_or_path="valhalla/t5-base-e2e-qg", model_version=None, num_beams=4, max_length=256, no_repeat_ngram_size=3, length_penalty=1.5, early_stopping=True, split_length=50, split_overlap=10, use_gpu=True, prompt="generate questions:", num_queries_per_doc=1, sep_token: str = "<sep>", batch_size: int = 16, progress_bar: bool = True, use_auth_token: Optional[Union[str, bool]] = None, devices: Optional[List[Union[str, torch.device]]] = None)
def __init__(model_name_or_path="valhalla/t5-base-e2e-qg",
             model_version=None,
             num_beams=4,
             max_length=256,
             no_repeat_ngram_size=3,
             length_penalty=1.5,
             early_stopping=True,
             split_length=50,
             split_overlap=10,
             use_gpu=True,
             prompt="generate questions:",
             num_queries_per_doc=1,
             sep_token: str = "<sep>",
             batch_size: int = 16,
             progress_bar: bool = True,
             use_auth_token: Optional[Union[str, bool]] = None,
             devices: Optional[List[Union[str, torch.device]]] = None)
```

Uses the valhalla/t5-base-e2e-qg model by default. This class supports any question generation model that is
@ -55,7 +71,10 @@ parameter is not used and a single cpu device is used for inference.
#### QuestionGenerator.generate\_batch

```python
def generate_batch(texts: Union[List[str], List[List[str]]], batch_size: Optional[int] = None) -> Union[List[List[str]], List[List[List[str]]]]
def generate_batch(
    texts: Union[List[str], List[List[str]]],
    batch_size: Optional[int] = None
) -> Union[List[List[str]], List[List[List[str]]]]
```

Generates questions for a list of strings or a list of lists of strings.
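A minimal sketch using the default model (the input text is illustrative):

```python
from haystack.nodes import QuestionGenerator

qg = QuestionGenerator()  # downloads valhalla/t5-base-e2e-qg by default
questions = qg.generate_batch(
    texts=["Python is a programming language created by Guido van Rossum."]
)
print(questions[0])  # list of generated questions for the first text
```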

@ -25,7 +25,12 @@ Wrapper method used to time functions.
#### BaseRanker.eval

```python
def eval(label_index: str = "label", doc_index: str = "eval_document", label_origin: str = "gold_label", top_k: int = 10, open_domain: bool = False, return_preds: bool = False) -> dict
def eval(label_index: str = "label",
         doc_index: str = "eval_document",
         label_origin: str = "gold_label",
         top_k: int = 10,
         open_domain: bool = False,
         return_preds: bool = False) -> dict
```

Performs evaluation of the Ranker.
@ -94,7 +99,15 @@ Usage example:
#### SentenceTransformersRanker.\_\_init\_\_

```python
def __init__(model_name_or_path: Union[str, Path], model_version: Optional[str] = None, top_k: int = 10, use_gpu: bool = True, devices: Optional[List[Union[str, torch.device]]] = None, batch_size: int = 16, scale_score: bool = True, progress_bar: bool = True, use_auth_token: Optional[Union[str, bool]] = None)
def __init__(model_name_or_path: Union[str, Path],
             model_version: Optional[str] = None,
             top_k: int = 10,
             use_gpu: bool = True,
             devices: Optional[List[Union[str, torch.device]]] = None,
             batch_size: int = 16,
             scale_score: bool = True,
             progress_bar: bool = True,
             use_auth_token: Optional[Union[str, bool]] = None)
```

**Arguments**:
@ -125,7 +138,9 @@ parameter is not used and a single cpu device is used for inference.
#### SentenceTransformersRanker.predict

```python
def predict(query: str, documents: List[Document], top_k: Optional[int] = None) -> List[Document]
def predict(query: str,
            documents: List[Document],
            top_k: Optional[int] = None) -> List[Document]
```

Use loaded ranker model to re-rank the supplied list of Document.
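A short sketch with an illustrative cross-encoder model and two toy documents:

```python
from haystack.nodes import SentenceTransformersRanker
from haystack.schema import Document

ranker = SentenceTransformersRanker(
    model_name_or_path="cross-encoder/ms-marco-MiniLM-L-12-v2")
docs = [Document(content="Berlin is the capital of Germany."),
        Document(content="Paris is the capital of France.")]
# Returns the documents re-ordered by cross-encoder relevance score.
reranked = ranker.predict(query="What is the capital of Germany?",
                          documents=docs, top_k=1)
```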
@ -147,7 +162,12 @@ List of Document
#### SentenceTransformersRanker.predict\_batch

```python
def predict_batch(queries: List[str], documents: Union[List[Document], List[List[Document]]], top_k: Optional[int] = None, batch_size: Optional[int] = None) -> Union[List[Document], List[List[Document]]]
def predict_batch(
        queries: List[str],
        documents: Union[List[Document], List[List[Document]]],
        top_k: Optional[int] = None,
        batch_size: Optional[int] = None
) -> Union[List[Document], List[List[Document]]]
```

Use loaded ranker model to re-rank the supplied lists of Documents.

@ -45,7 +45,28 @@ While the underlying model can vary (BERT, Roberta, DistilBERT, ...), the interf
#### FARMReader.\_\_init\_\_

```python
def __init__(model_name_or_path: str, model_version: Optional[str] = None, context_window_size: int = 150, batch_size: int = 50, use_gpu: bool = True, devices: Optional[List[Union[str, torch.device]]] = None, no_ans_boost: float = 0.0, return_no_answer: bool = False, top_k: int = 10, top_k_per_candidate: int = 3, top_k_per_sample: int = 1, num_processes: Optional[int] = None, max_seq_len: int = 256, doc_stride: int = 128, progress_bar: bool = True, duplicate_filtering: int = 0, use_confidence_scores: bool = True, confidence_threshold: Optional[float] = None, proxies: Optional[Dict[str, str]] = None, local_files_only=False, force_download=False, use_auth_token: Optional[Union[str, bool]] = None)
def __init__(model_name_or_path: str,
             model_version: Optional[str] = None,
             context_window_size: int = 150,
             batch_size: int = 50,
             use_gpu: bool = True,
             devices: Optional[List[Union[str, torch.device]]] = None,
             no_ans_boost: float = 0.0,
             return_no_answer: bool = False,
             top_k: int = 10,
             top_k_per_candidate: int = 3,
             top_k_per_sample: int = 1,
             num_processes: Optional[int] = None,
             max_seq_len: int = 256,
             doc_stride: int = 128,
             progress_bar: bool = True,
             duplicate_filtering: int = 0,
             use_confidence_scores: bool = True,
             confidence_threshold: Optional[float] = None,
             proxies: Optional[Dict[str, str]] = None,
             local_files_only=False,
             force_download=False,
             use_auth_token: Optional[Union[str, bool]] = None)
```

**Arguments**:
@ -113,7 +134,29 @@ https://huggingface.co/transformers/main_classes/model.html#transformers.PreTrai
#### FARMReader.train

```python
def train(data_dir: str, train_filename: str, dev_filename: Optional[str] = None, test_filename: Optional[str] = None, use_gpu: Optional[bool] = None, devices: List[torch.device] = [], batch_size: int = 10, n_epochs: int = 2, learning_rate: float = 1e-5, max_seq_len: Optional[int] = None, warmup_proportion: float = 0.2, dev_split: float = 0, evaluate_every: int = 300, save_dir: Optional[str] = None, num_processes: Optional[int] = None, use_amp: str = None, checkpoint_root_dir: Path = Path("model_checkpoints"), checkpoint_every: Optional[int] = None, checkpoints_to_keep: int = 3, caching: bool = False, cache_path: Path = Path("cache/data_silo"), grad_acc_steps: int = 1, early_stopping: Optional[EarlyStopping] = None)
def train(data_dir: str,
          train_filename: str,
          dev_filename: Optional[str] = None,
          test_filename: Optional[str] = None,
          use_gpu: Optional[bool] = None,
          devices: List[torch.device] = [],
          batch_size: int = 10,
          n_epochs: int = 2,
          learning_rate: float = 1e-5,
          max_seq_len: Optional[int] = None,
          warmup_proportion: float = 0.2,
          dev_split: float = 0,
          evaluate_every: int = 300,
          save_dir: Optional[str] = None,
          num_processes: Optional[int] = None,
          use_amp: str = None,
          checkpoint_root_dir: Path = Path("model_checkpoints"),
          checkpoint_every: Optional[int] = None,
          checkpoints_to_keep: int = 3,
          caching: bool = False,
          cache_path: Path = Path("cache/data_silo"),
          grad_acc_steps: int = 1,
          early_stopping: Optional[EarlyStopping] = None)
```

Fine-tune a model on a QA dataset. Options:
@ -176,7 +219,36 @@ None
#### FARMReader.distil\_prediction\_layer\_from

```python
def distil_prediction_layer_from(teacher_model: "FARMReader", data_dir: str, train_filename: str, dev_filename: Optional[str] = None, test_filename: Optional[str] = None, use_gpu: Optional[bool] = None, devices: List[torch.device] = [], student_batch_size: int = 10, teacher_batch_size: Optional[int] = None, n_epochs: int = 2, learning_rate: float = 3e-5, max_seq_len: Optional[int] = None, warmup_proportion: float = 0.2, dev_split: float = 0, evaluate_every: int = 300, save_dir: Optional[str] = None, num_processes: Optional[int] = None, use_amp: str = None, checkpoint_root_dir: Path = Path("model_checkpoints"), checkpoint_every: Optional[int] = None, checkpoints_to_keep: int = 3, caching: bool = False, cache_path: Path = Path("cache/data_silo"), distillation_loss_weight: float = 0.5, distillation_loss: Union[str, Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = "kl_div", temperature: float = 1.0, grad_acc_steps: int = 1, early_stopping: Optional[EarlyStopping] = None)
def distil_prediction_layer_from(
        teacher_model: "FARMReader",
        data_dir: str,
        train_filename: str,
        dev_filename: Optional[str] = None,
        test_filename: Optional[str] = None,
        use_gpu: Optional[bool] = None,
        devices: List[torch.device] = [],
        student_batch_size: int = 10,
        teacher_batch_size: Optional[int] = None,
        n_epochs: int = 2,
        learning_rate: float = 3e-5,
        max_seq_len: Optional[int] = None,
        warmup_proportion: float = 0.2,
        dev_split: float = 0,
        evaluate_every: int = 300,
        save_dir: Optional[str] = None,
        num_processes: Optional[int] = None,
        use_amp: str = None,
        checkpoint_root_dir: Path = Path("model_checkpoints"),
        checkpoint_every: Optional[int] = None,
        checkpoints_to_keep: int = 3,
        caching: bool = False,
        cache_path: Path = Path("cache/data_silo"),
        distillation_loss_weight: float = 0.5,
        distillation_loss: Union[str, Callable[[torch.Tensor, torch.Tensor],
                                               torch.Tensor]] = "kl_div",
        temperature: float = 1.0,
        grad_acc_steps: int = 1,
        early_stopping: Optional[EarlyStopping] = None)
```

Fine-tune a model on a QA dataset using logit-based distillation. You need to provide a teacher model that is already finetuned on the dataset
@ -258,7 +330,35 @@ None
#### FARMReader.distil\_intermediate\_layers\_from

```python
def distil_intermediate_layers_from(teacher_model: "FARMReader", data_dir: str, train_filename: str, dev_filename: Optional[str] = None, test_filename: Optional[str] = None, use_gpu: Optional[bool] = None, devices: List[torch.device] = [], batch_size: int = 10, n_epochs: int = 5, learning_rate: float = 5e-5, max_seq_len: Optional[int] = None, warmup_proportion: float = 0.2, dev_split: float = 0, evaluate_every: int = 300, save_dir: Optional[str] = None, num_processes: Optional[int] = None, use_amp: str = None, checkpoint_root_dir: Path = Path("model_checkpoints"), checkpoint_every: Optional[int] = None, checkpoints_to_keep: int = 3, caching: bool = False, cache_path: Path = Path("cache/data_silo"), distillation_loss: Union[str, Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = "mse", temperature: float = 1.0, processor: Optional[Processor] = None, grad_acc_steps: int = 1, early_stopping: Optional[EarlyStopping] = None)
def distil_intermediate_layers_from(
        teacher_model: "FARMReader",
        data_dir: str,
        train_filename: str,
        dev_filename: Optional[str] = None,
        test_filename: Optional[str] = None,
        use_gpu: Optional[bool] = None,
        devices: List[torch.device] = [],
        batch_size: int = 10,
        n_epochs: int = 5,
        learning_rate: float = 5e-5,
        max_seq_len: Optional[int] = None,
        warmup_proportion: float = 0.2,
        dev_split: float = 0,
        evaluate_every: int = 300,
        save_dir: Optional[str] = None,
        num_processes: Optional[int] = None,
        use_amp: str = None,
        checkpoint_root_dir: Path = Path("model_checkpoints"),
        checkpoint_every: Optional[int] = None,
        checkpoints_to_keep: int = 3,
        caching: bool = False,
        cache_path: Path = Path("cache/data_silo"),
        distillation_loss: Union[str, Callable[[torch.Tensor, torch.Tensor],
                                               torch.Tensor]] = "mse",
        temperature: float = 1.0,
        processor: Optional[Processor] = None,
        grad_acc_steps: int = 1,
        early_stopping: Optional[EarlyStopping] = None)
```

The first stage of distillation finetuning as described in the TinyBERT paper:
@ -332,7 +432,11 @@ None
#### FARMReader.update\_parameters

```python
def update_parameters(context_window_size: Optional[int] = None, no_ans_boost: Optional[float] = None, return_no_answer: Optional[bool] = None, max_seq_len: Optional[int] = None, doc_stride: Optional[int] = None)
def update_parameters(context_window_size: Optional[int] = None,
                      no_ans_boost: Optional[float] = None,
                      return_no_answer: Optional[bool] = None,
                      max_seq_len: Optional[int] = None,
                      doc_stride: Optional[int] = None)
```

Hot update parameters of a loaded Reader. It may not be safe when processing concurrent requests.
@ -356,7 +460,9 @@ Saves the Reader model so that it can be reused at a later point in time.
#### FARMReader.save\_to\_remote

```python
def save_to_remote(repo_id: str, private: Optional[bool] = None, commit_message: str = "Add new model to Hugging Face.")
def save_to_remote(repo_id: str,
                   private: Optional[bool] = None,
                   commit_message: str = "Add new model to Hugging Face.")
```

Saves the Reader model to Hugging Face Model Hub with the given model_name. For this to work:
@ -375,7 +481,10 @@ Saves the Reader model to Hugging Face Model Hub with the given model_name. For
#### FARMReader.predict\_batch

```python
def predict_batch(queries: List[str], documents: Union[List[Document], List[List[Document]]], top_k: Optional[int] = None, batch_size: Optional[int] = None)
def predict_batch(queries: List[str],
                  documents: Union[List[Document], List[List[Document]]],
                  top_k: Optional[int] = None,
                  batch_size: Optional[int] = None)
```

Use loaded QA model to find answers for the queries in the Documents.
@ -405,7 +514,9 @@ Can be a single list of Documents or a list of lists of Documents.
#### FARMReader.predict

```python
def predict(query: str, documents: List[Document], top_k: Optional[int] = None)
def predict(query: str,
            documents: List[Document],
            top_k: Optional[int] = None)
```

Use loaded QA model to find answers for a query in the supplied list of Document.
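A minimal sketch, assuming the model name and toy document below:

```python
from haystack.nodes import FARMReader
from haystack.schema import Document

reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2")
prediction = reader.predict(
    query="Who is the father of Arya Stark?",
    documents=[Document(content="Eddard Stark is the father of Arya Stark.")],
    top_k=1,
)
print(prediction["answers"][0].answer)
```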
@ -442,7 +553,10 @@ Dict containing query and answers
#### FARMReader.eval\_on\_file

```python
def eval_on_file(data_dir: Union[Path, str], test_filename: str, device: Optional[Union[str, torch.device]] = None, calibrate_conf_scores: bool = False)
def eval_on_file(data_dir: Union[Path, str],
                 test_filename: str,
                 device: Optional[Union[str, torch.device]] = None,
                 calibrate_conf_scores: bool = False)
```

Performs evaluation on a SQuAD-formatted file.
@ -466,7 +580,12 @@ or use the Reader's device by default.
#### FARMReader.eval

```python
def eval(document_store: BaseDocumentStore, device: Optional[Union[str, torch.device]] = None, label_index: str = "label", doc_index: str = "eval_document", label_origin: str = "gold-label", calibrate_conf_scores: bool = False)
def eval(document_store: BaseDocumentStore,
         device: Optional[Union[str, torch.device]] = None,
         label_index: str = "label",
         doc_index: str = "eval_document",
         label_origin: str = "gold-label",
         calibrate_conf_scores: bool = False)
```

Performs evaluation on evaluation documents in the DocumentStore.
@ -492,7 +611,12 @@ or use the Reader's device by default.
#### FARMReader.calibrate\_confidence\_scores

```python
def calibrate_confidence_scores(document_store: BaseDocumentStore, device: Optional[Union[str, torch.device]] = None, label_index: str = "label", doc_index: str = "eval_document", label_origin: str = "gold_label")
def calibrate_confidence_scores(document_store: BaseDocumentStore,
                                device: Optional[Union[str,
                                                       torch.device]] = None,
                                label_index: str = "label",
                                doc_index: str = "eval_document",
                                label_origin: str = "gold_label")
```

Calibrates confidence scores on evaluation documents in the DocumentStore.
@ -512,7 +636,9 @@ or use the Reader's device by default.
#### FARMReader.predict\_on\_texts

```python
def predict_on_texts(question: str, texts: List[str], top_k: Optional[int] = None)
def predict_on_texts(question: str,
                     texts: List[str],
                     top_k: Optional[int] = None)
```

Use loaded QA model to find answers for a question in the supplied list of Document.
@ -550,7 +676,13 @@ Dict containing question and answers

```python
@classmethod
def convert_to_onnx(cls, model_name: str, output_path: Path, convert_to_float16: bool = False, quantize: bool = False, task_type: str = "question_answering", opset_version: int = 11)
def convert_to_onnx(cls,
                    model_name: str,
                    output_path: Path,
                    convert_to_float16: bool = False,
                    quantize: bool = False,
                    task_type: str = "question_answering",
                    opset_version: int = 11)
```

Convert a PyTorch BERT model to ONNX format and write to ./onnx-export dir. The converted ONNX model
@ -598,7 +730,20 @@ With this reader, you can directly get predictions via predict()
#### TransformersReader.\_\_init\_\_

```python
def __init__(model_name_or_path: str = "distilbert-base-uncased-distilled-squad", model_version: Optional[str] = None, tokenizer: Optional[str] = None, context_window_size: int = 70, use_gpu: bool = True, top_k: int = 10, top_k_per_candidate: int = 3, return_no_answers: bool = False, max_seq_len: int = 256, doc_stride: int = 128, batch_size: int = 16, use_auth_token: Optional[Union[str, bool]] = None, devices: Optional[List[Union[str, torch.device]]] = None)
def __init__(
        model_name_or_path: str = "distilbert-base-uncased-distilled-squad",
        model_version: Optional[str] = None,
        tokenizer: Optional[str] = None,
        context_window_size: int = 70,
        use_gpu: bool = True,
        top_k: int = 10,
        top_k_per_candidate: int = 3,
        return_no_answers: bool = False,
        max_seq_len: int = 256,
        doc_stride: int = 128,
        batch_size: int = 16,
        use_auth_token: Optional[Union[str, bool]] = None,
        devices: Optional[List[Union[str, torch.device]]] = None)
```

Load a QA model from Transformers.
@ -647,7 +792,9 @@ parameter is not used and a single cpu device is used for inference.
#### TransformersReader.predict

```python
def predict(query: str, documents: List[Document], top_k: Optional[int] = None)
def predict(query: str,
            documents: List[Document],
            top_k: Optional[int] = None)
```

Use loaded QA model to find answers for a query in the supplied list of Document.
@ -685,7 +832,10 @@ Dict containing query and answers
#### TransformersReader.predict\_batch

```python
def predict_batch(queries: List[str], documents: Union[List[Document], List[List[Document]]], top_k: Optional[int] = None, batch_size: Optional[int] = None)
def predict_batch(queries: List[str],
                  documents: Union[List[Document], List[List[Document]]],
                  top_k: Optional[int] = None,
                  batch_size: Optional[int] = None)
```

Use loaded QA model to find answers for the queries in the Documents.
@ -752,7 +902,16 @@ answer = prediction["answers"][0].answer # "10 june 1996"
#### TableReader.\_\_init\_\_

```python
def __init__(model_name_or_path: str = "google/tapas-base-finetuned-wtq", model_version: Optional[str] = None, tokenizer: Optional[str] = None, use_gpu: bool = True, top_k: int = 10, top_k_per_candidate: int = 3, return_no_answer: bool = False, max_seq_len: int = 256, use_auth_token: Optional[Union[str, bool]] = None, devices: Optional[List[Union[str, torch.device]]] = None)
def __init__(model_name_or_path: str = "google/tapas-base-finetuned-wtq",
             model_version: Optional[str] = None,
             tokenizer: Optional[str] = None,
             use_gpu: bool = True,
             top_k: int = 10,
             top_k_per_candidate: int = 3,
             return_no_answer: bool = False,
             max_seq_len: int = 256,
             use_auth_token: Optional[Union[str, bool]] = None,
             devices: Optional[List[Union[str, torch.device]]] = None)
```

Load a TableQA model from Transformers.
@ -803,7 +962,9 @@ parameter is not used and a single cpu device is used for inference.
#### TableReader.predict

```python
def predict(query: str, documents: List[Document], top_k: Optional[int] = None) -> Dict
def predict(query: str,
            documents: List[Document],
            top_k: Optional[int] = None) -> Dict
```

Use loaded TableQA model to find answers for a query in the supplied list of Documents
@ -830,7 +991,10 @@ Dict containing query and answers
#### TableReader.predict\_batch

```python
def predict_batch(queries: List[str], documents: Union[List[Document], List[List[Document]]], top_k: Optional[int] = None, batch_size: Optional[int] = None)
def predict_batch(queries: List[str],
                  documents: Union[List[Document], List[List[Document]]],
                  top_k: Optional[int] = None,
                  batch_size: Optional[int] = None)
```

Use loaded TableQA model to find answers for the supplied queries in the supplied Documents
@ -890,7 +1054,18 @@ Pros and Cons of RCIReader compared to TableReader:
#### RCIReader.\_\_init\_\_

```python
def __init__(row_model_name_or_path: str = "michaelrglass/albert-base-rci-wikisql-row", column_model_name_or_path: str = "michaelrglass/albert-base-rci-wikisql-col", row_model_version: Optional[str] = None, column_model_version: Optional[str] = None, row_tokenizer: Optional[str] = None, column_tokenizer: Optional[str] = None, use_gpu: bool = True, top_k: int = 10, max_seq_len: int = 256, use_auth_token: Optional[Union[str, bool]] = None)
def __init__(row_model_name_or_path:
             str = "michaelrglass/albert-base-rci-wikisql-row",
             column_model_name_or_path:
             str = "michaelrglass/albert-base-rci-wikisql-col",
             row_model_version: Optional[str] = None,
             column_model_version: Optional[str] = None,
             row_tokenizer: Optional[str] = None,
             column_tokenizer: Optional[str] = None,
             use_gpu: bool = True,
             top_k: int = 10,
             max_seq_len: int = 256,
             use_auth_token: Optional[Union[str, bool]] = None)
```

Load an RCI model from Transformers.
@ -926,7 +1101,9 @@ https://huggingface.co/transformers/main_classes/model.html#transformers.PreTrai
#### RCIReader.predict

```python
def predict(query: str, documents: List[Document], top_k: Optional[int] = None) -> Dict
def predict(query: str,
            documents: List[Document],
            top_k: Optional[int] = None) -> Dict
```

Use loaded RCI models to find answers for a query in the supplied list of Documents

@ -28,7 +28,13 @@ Base class for regular retrievers.

```python
@abstractmethod
def retrieve(query: str, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: Optional[int] = None, index: str = None, headers: Optional[Dict[str, str]] = None, scale_score: bool = None) -> List[Document]
def retrieve(query: str,
             filters: Optional[Dict[str, Union[Dict, List, str, int, float,
                                               bool]]] = None,
             top_k: Optional[int] = None,
             index: str = None,
             headers: Optional[Dict[str, str]] = None,
             scale_score: bool = None) -> List[Document]
```

Scan through documents in DocumentStore and return a small number of documents
@ -61,7 +67,13 @@ Wrapper method used to time functions.
#### BaseRetriever.eval

```python
def eval(label_index: str = "label", doc_index: str = "eval_document", label_origin: str = "gold-label", top_k: int = 10, open_domain: bool = False, return_preds: bool = False, headers: Optional[Dict[str, str]] = None) -> dict
def eval(label_index: str = "label",
         doc_index: str = "eval_document",
         label_origin: str = "gold-label",
         top_k: int = 10,
         open_domain: bool = False,
         return_preds: bool = False,
         headers: Optional[Dict[str, str]] = None) -> dict
```

Performs evaluation on the Retriever.
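A hedged sketch; it assumes a local Elasticsearch (as started by this repo's docker commands) and a SQuAD-style annotation file named `eval_data.json`:

```python
from haystack.document_stores import ElasticsearchDocumentStore
from haystack.nodes import BM25Retriever

document_store = ElasticsearchDocumentStore()
# Load gold documents and labels into the default eval indices.
document_store.add_eval_data(filename="eval_data.json",
                             doc_index="eval_document",
                             label_index="label")
retriever = BM25Retriever(document_store=document_store)
metrics = retriever.eval(top_k=10)  # dict of recall, mAP, etc.
```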
@ -110,7 +122,11 @@ class BM25Retriever(BaseRetriever)
#### BM25Retriever.\_\_init\_\_

```python
def __init__(document_store: KeywordDocumentStore, top_k: int = 10, all_terms_must_match: bool = False, custom_query: Optional[str] = None, scale_score: bool = True)
def __init__(document_store: KeywordDocumentStore,
             top_k: int = 10,
             all_terms_must_match: bool = False,
             custom_query: Optional[str] = None,
             scale_score: bool = True)
```

**Arguments**:
@ -194,7 +210,13 @@ Otherwise raw similarity scores (e.g. cosine or dot_product) will be used.
#### BM25Retriever.retrieve

```python
def retrieve(query: str, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: Optional[int] = None, index: str = None, headers: Optional[Dict[str, str]] = None, scale_score: bool = None) -> List[Document]
def retrieve(query: str,
             filters: Optional[Dict[str, Union[Dict, List, str, int, float,
                                               bool]]] = None,
             top_k: Optional[int] = None,
             index: str = None,
             headers: Optional[Dict[str, str]] = None,
             scale_score: bool = None) -> List[Document]
```

Scan through documents in DocumentStore and return a small number of documents
@ -280,12 +302,18 @@ Otherwise raw similarity scores (e.g. cosine or dot_product) will be used.
#### BM25Retriever.retrieve\_batch

```python
def retrieve_batch(queries: List[str], filters: Optional[
    Union[
        Dict[str, Union[Dict, List, str, int, float, bool]],
        List[Dict[str, Union[Dict, List, str, int, float, bool]]],
    ]
] = None, top_k: Optional[int] = None, index: str = None, headers: Optional[Dict[str, str]] = None, batch_size: Optional[int] = None, scale_score: bool = None) -> List[List[Document]]
def retrieve_batch(queries: List[str],
                   filters: Optional[Union[Dict[str, Union[Dict, List, str,
                                                           int, float, bool]],
                                           List[Dict[str,
                                                     Union[Dict, List, str,
                                                           int, float,
                                                           bool]]], ]] = None,
                   top_k: Optional[int] = None,
                   index: str = None,
                   headers: Optional[Dict[str, str]] = None,
                   batch_size: Optional[int] = None,
                   scale_score: bool = None) -> List[List[Document]]
```

Scan through documents in DocumentStore and return a small number of documents
@ -386,7 +414,12 @@ Helpful for benchmarking, testing and if you want to do QA on small documents wi
#### FilterRetriever.retrieve

```python
def retrieve(query: str, filters: dict = None, top_k: Optional[int] = None, index: str = None, headers: Optional[Dict[str, str]] = None, scale_score: bool = None) -> List[Document]
def retrieve(query: str,
             filters: dict = None,
             top_k: Optional[int] = None,
             index: str = None,
             headers: Optional[Dict[str, str]] = None,
             scale_score: bool = None) -> List[Document]
```

Scan through documents in DocumentStore and return a small number of documents
@ -425,7 +458,9 @@ It uses sklearn's TfidfVectorizer to compute a tf-idf matrix.
#### TfidfRetriever.\_\_init\_\_

```python
def __init__(document_store: BaseDocumentStore, top_k: int = 10, auto_fit=True)
def __init__(document_store: BaseDocumentStore,
             top_k: int = 10,
             auto_fit=True)
```

**Arguments**:
@ -439,12 +474,16 @@ def __init__(document_store: BaseDocumentStore, top_k: int = 10, auto_fit=True)
#### TfidfRetriever.retrieve

```python
def retrieve(query: str, filters: Optional[
    Union[
        Dict[str, Union[Dict, List, str, int, float, bool]],
        List[Dict[str, Union[Dict, List, str, int, float, bool]]],
    ]
] = None, top_k: Optional[int] = None, index: str = None, headers: Optional[Dict[str, str]] = None, scale_score: bool = None) -> List[Document]
def retrieve(query: str,
             filters: Optional[Union[Dict[str, Union[Dict, List, str, int,
                                                     float, bool]],
                                     List[Dict[str,
                                               Union[Dict, List, str, int,
                                                     float, bool]]], ]] = None,
             top_k: Optional[int] = None,
             index: str = None,
             headers: Optional[Dict[str, str]] = None,
             scale_score: bool = None) -> List[Document]
```

Scan through documents in DocumentStore and return a small number of documents
@ -466,7 +505,14 @@ Otherwise raw similarity scores (e.g. cosine or dot_product) will be used.
#### TfidfRetriever.retrieve\_batch

```python
def retrieve_batch(queries: Union[str, List[str]], filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: Optional[int] = None, index: str = None, headers: Optional[Dict[str, str]] = None, batch_size: Optional[int] = None, scale_score: bool = None) -> List[List[Document]]
def retrieve_batch(queries: Union[str, List[str]],
                   filters: Optional[Dict[str, Union[Dict, List, str, int,
                                                     float, bool]]] = None,
                   top_k: Optional[int] = None,
                   index: str = None,
                   headers: Optional[Dict[str, str]] = None,
                   batch_size: Optional[int] = None,
                   scale_score: bool = None) -> List[List[Document]]
```

Scan through documents in DocumentStore and return a small number of documents
@ -519,7 +565,25 @@ Karpukhin, Vladimir, et al. (2020): "Dense Passage Retrieval for Open-Domain Que
#### DensePassageRetriever.\_\_init\_\_

```python
def __init__(document_store: BaseDocumentStore, query_embedding_model: Union[Path, str] = "facebook/dpr-question_encoder-single-nq-base", passage_embedding_model: Union[Path, str] = "facebook/dpr-ctx_encoder-single-nq-base", model_version: Optional[str] = None, max_seq_len_query: int = 64, max_seq_len_passage: int = 256, top_k: int = 10, use_gpu: bool = True, batch_size: int = 16, embed_title: bool = True, use_fast_tokenizers: bool = True, similarity_function: str = "dot_product", global_loss_buffer_size: int = 150000, progress_bar: bool = True, devices: Optional[List[Union[str, torch.device]]] = None, use_auth_token: Optional[Union[str, bool]] = None, scale_score: bool = True)
def __init__(document_store: BaseDocumentStore,
             query_embedding_model: Union[
                 Path, str] = "facebook/dpr-question_encoder-single-nq-base",
             passage_embedding_model: Union[
                 Path, str] = "facebook/dpr-ctx_encoder-single-nq-base",
             model_version: Optional[str] = None,
             max_seq_len_query: int = 64,
             max_seq_len_passage: int = 256,
             top_k: int = 10,
             use_gpu: bool = True,
             batch_size: int = 16,
             embed_title: bool = True,
             use_fast_tokenizers: bool = True,
             similarity_function: str = "dot_product",
             global_loss_buffer_size: int = 150000,
             progress_bar: bool = True,
             devices: Optional[List[Union[str, torch.device]]] = None,
             use_auth_token: Optional[Union[str, bool]] = None,
             scale_score: bool = True)
```

Init the Retriever incl. the two encoder models from a local or remote model checkpoint.
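A hedged end-to-end sketch with an in-memory store and the default DPR encoders (the toy document is an assumption):

```python
from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import DensePassageRetriever
from haystack.schema import Document

document_store = InMemoryDocumentStore()
document_store.write_documents([Document(content="Berlin is the capital of Germany.")])

retriever = DensePassageRetriever(
    document_store=document_store,
    query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
    passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
)
document_store.update_embeddings(retriever)  # embed the stored passages
docs = retriever.retrieve(query="What is the capital of Germany?", top_k=1)
```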
@ -587,7 +651,13 @@ Otherwise raw similarity scores (e.g. cosine or dot_product) will be used.
#### DensePassageRetriever.retrieve

```python
def retrieve(query: str, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: Optional[int] = None, index: str = None, headers: Optional[Dict[str, str]] = None, scale_score: bool = None) -> List[Document]
def retrieve(query: str,
             filters: Optional[Dict[str, Union[Dict, List, str, int, float,
                                               bool]]] = None,
             top_k: Optional[int] = None,
             index: str = None,
             headers: Optional[Dict[str, str]] = None,
             scale_score: bool = None) -> List[Document]
```

Scan through documents in DocumentStore and return a small number of documents
@ -671,12 +741,18 @@ Otherwise raw similarity scores (e.g. cosine or dot_product) will be used.
#### DensePassageRetriever.retrieve\_batch

```python
def retrieve_batch(queries: List[str], filters: Optional[
    Union[
        Dict[str, Union[Dict, List, str, int, float, bool]],
        List[Dict[str, Union[Dict, List, str, int, float, bool]]],
    ]
] = None, top_k: Optional[int] = None, index: str = None, headers: Optional[Dict[str, str]] = None, batch_size: Optional[int] = None, scale_score: bool = None) -> List[List[Document]]
def retrieve_batch(queries: List[str],
                   filters: Optional[Union[Dict[str, Union[Dict, List, str,
                                                           int, float, bool]],
                                           List[Dict[str,
                                                     Union[Dict, List, str,
                                                           int, float,
                                                           bool]]], ]] = None,
                   top_k: Optional[int] = None,
                   index: str = None,
                   headers: Optional[Dict[str, str]] = None,
                   batch_size: Optional[int] = None,
                   scale_score: bool = None) -> List[List[Document]]
```

Scan through documents in DocumentStore and return a small number of documents
@ -802,7 +878,36 @@ Embeddings of documents / passages shape (batch_size, embedding_dim)
#### DensePassageRetriever.train

```python
def train(data_dir: str, train_filename: str, dev_filename: str = None, test_filename: str = None, max_samples: int = None, max_processes: int = 128, multiprocessing_strategy: Optional[str] = None, dev_split: float = 0, batch_size: int = 2, embed_title: bool = True, num_hard_negatives: int = 1, num_positives: int = 1, n_epochs: int = 3, evaluate_every: int = 1000, n_gpu: int = 1, learning_rate: float = 1e-5, epsilon: float = 1e-08, weight_decay: float = 0.0, num_warmup_steps: int = 100, grad_acc_steps: int = 1, use_amp: str = None, optimizer_name: str = "AdamW", optimizer_correct_bias: bool = True, save_dir: str = "../saved_models/dpr", query_encoder_save_dir: str = "query_encoder", passage_encoder_save_dir: str = "passage_encoder", checkpoint_root_dir: Path = Path("model_checkpoints"), checkpoint_every: Optional[int] = None, checkpoints_to_keep: int = 3, early_stopping: Optional[EarlyStopping] = None)
def train(data_dir: str,
          train_filename: str,
          dev_filename: str = None,
          test_filename: str = None,
          max_samples: int = None,
          max_processes: int = 128,
          multiprocessing_strategy: Optional[str] = None,
          dev_split: float = 0,
          batch_size: int = 2,
          embed_title: bool = True,
          num_hard_negatives: int = 1,
          num_positives: int = 1,
          n_epochs: int = 3,
          evaluate_every: int = 1000,
          n_gpu: int = 1,
          learning_rate: float = 1e-5,
          epsilon: float = 1e-08,
          weight_decay: float = 0.0,
          num_warmup_steps: int = 100,
          grad_acc_steps: int = 1,
          use_amp: str = None,
          optimizer_name: str = "AdamW",
          optimizer_correct_bias: bool = True,
          save_dir: str = "../saved_models/dpr",
          query_encoder_save_dir: str = "query_encoder",
          passage_encoder_save_dir: str = "passage_encoder",
          checkpoint_root_dir: Path = Path("model_checkpoints"),
          checkpoint_every: Optional[int] = None,
          checkpoints_to_keep: int = 3,
          early_stopping: Optional[EarlyStopping] = None)
```

Train a DensePassageRetriever model
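A training sketch for the method above, continuing the retriever from the earlier sketches; the data directory and DPR-format file names are placeholders:

```python
retriever.train(
    data_dir="data/dpr_training",   # placeholder path
    train_filename="train.json",    # placeholder DPR-format training file
    dev_filename="dev.json",
    n_epochs=3,
    batch_size=2,
    grad_acc_steps=8,
    num_hard_negatives=1,
    save_dir="saved_models/dpr",
)
```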
@ -856,7 +961,9 @@ If any checkpoints are stored, a subsequent run of train() will resume training
#### DensePassageRetriever.save

```python
def save(save_dir: Union[Path, str], query_encoder_dir: str = "query_encoder", passage_encoder_dir: str = "passage_encoder")
def save(save_dir: Union[Path, str],
         query_encoder_dir: str = "query_encoder",
         passage_encoder_dir: str = "passage_encoder")
```

Save DensePassageRetriever to the specified directory.
@ -877,7 +984,18 @@ None

```python
@classmethod
def load(cls, load_dir: Union[Path, str], document_store: BaseDocumentStore, max_seq_len_query: int = 64, max_seq_len_passage: int = 256, use_gpu: bool = True, batch_size: int = 16, embed_title: bool = True, use_fast_tokenizers: bool = True, similarity_function: str = "dot_product", query_encoder_dir: str = "query_encoder", passage_encoder_dir: str = "passage_encoder")
def load(cls,
         load_dir: Union[Path, str],
         document_store: BaseDocumentStore,
         max_seq_len_query: int = 64,
         max_seq_len_passage: int = 256,
         use_gpu: bool = True,
         batch_size: int = 16,
         embed_title: bool = True,
         use_fast_tokenizers: bool = True,
         similarity_function: str = "dot_product",
         query_encoder_dir: str = "query_encoder",
         passage_encoder_dir: str = "passage_encoder")
```

Load DensePassageRetriever from the specified directory.
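`save()` and `load()` round-trip as follows, continuing the sketches above; the encoder subdirectory names fall back to the defaults shown in the signatures:

```python
# Persist both encoders under dpr_model/{query_encoder,passage_encoder}
retriever.save(save_dir="dpr_model")

# Restore them later against the same (or a compatible) document store
retriever = DensePassageRetriever.load(
    load_dir="dpr_model",
    document_store=document_store,
)
```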
@ -901,7 +1019,30 @@ Kostić, Bogdan, et al. (2021): "Multi-modal Retrieval of Tables and Texts Using
#### TableTextRetriever.\_\_init\_\_

```python
def __init__(document_store: BaseDocumentStore, query_embedding_model: Union[Path, str] = "deepset/bert-small-mm_retrieval-question_encoder", passage_embedding_model: Union[Path, str] = "deepset/bert-small-mm_retrieval-passage_encoder", table_embedding_model: Union[Path, str] = "deepset/bert-small-mm_retrieval-table_encoder", model_version: Optional[str] = None, max_seq_len_query: int = 64, max_seq_len_passage: int = 256, max_seq_len_table: int = 256, top_k: int = 10, use_gpu: bool = True, batch_size: int = 16, embed_meta_fields: List[str] = ["name", "section_title", "caption"], use_fast_tokenizers: bool = True, similarity_function: str = "dot_product", global_loss_buffer_size: int = 150000, progress_bar: bool = True, devices: Optional[List[Union[str, torch.device]]] = None, use_auth_token: Optional[Union[str, bool]] = None, scale_score: bool = True, use_fast: bool = True)
def __init__(
        document_store: BaseDocumentStore,
        query_embedding_model: Union[
            Path, str] = "deepset/bert-small-mm_retrieval-question_encoder",
        passage_embedding_model: Union[
            Path, str] = "deepset/bert-small-mm_retrieval-passage_encoder",
        table_embedding_model: Union[
            Path, str] = "deepset/bert-small-mm_retrieval-table_encoder",
        model_version: Optional[str] = None,
        max_seq_len_query: int = 64,
        max_seq_len_passage: int = 256,
        max_seq_len_table: int = 256,
        top_k: int = 10,
        use_gpu: bool = True,
        batch_size: int = 16,
        embed_meta_fields: List[str] = ["name", "section_title", "caption"],
        use_fast_tokenizers: bool = True,
        similarity_function: str = "dot_product",
        global_loss_buffer_size: int = 150000,
        progress_bar: bool = True,
        devices: Optional[List[Union[str, torch.device]]] = None,
        use_auth_token: Optional[Union[str, bool]] = None,
        scale_score: bool = True,
        use_fast: bool = True)
```

Init the Retriever incl. the three encoder models from a local or remote model checkpoint.
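A construction sketch: unlike `DensePassageRetriever`, three encoders are configured (query, text passage, and table), and `embed_meta_fields` controls which metadata is embedded alongside the content. The document store is assumed to hold both text and table documents:

```python
from haystack.nodes import TableTextRetriever

retriever = TableTextRetriever(
    document_store=document_store,  # assumed to contain text and table documents
    query_embedding_model="deepset/bert-small-mm_retrieval-question_encoder",
    passage_embedding_model="deepset/bert-small-mm_retrieval-passage_encoder",
    table_embedding_model="deepset/bert-small-mm_retrieval-table_encoder",
    embed_meta_fields=["name", "section_title", "caption"],
)
```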
@ -956,12 +1097,18 @@ Otherwise raw similarity scores (e.g. cosine or dot_product) will be used.
#### TableTextRetriever.retrieve\_batch

```python
def retrieve_batch(queries: List[str], filters: Optional[
    Union[
        Dict[str, Union[Dict, List, str, int, float, bool]],
        List[Dict[str, Union[Dict, List, str, int, float, bool]]],
    ]
] = None, top_k: Optional[int] = None, index: str = None, headers: Optional[Dict[str, str]] = None, batch_size: Optional[int] = None, scale_score: bool = None) -> List[List[Document]]
def retrieve_batch(queries: List[str],
                   filters: Optional[Union[Dict[str, Union[Dict, List, str,
                                                           int, float, bool]],
                                           List[Dict[str,
                                                     Union[Dict, List, str,
                                                           int, float,
                                                           bool]]], ]] = None,
                   top_k: Optional[int] = None,
                   index: str = None,
                   headers: Optional[Dict[str, str]] = None,
                   batch_size: Optional[int] = None,
                   scale_score: bool = None) -> List[List[Document]]
```

Scan through documents in DocumentStore and return a small number of documents
@ -1090,7 +1237,38 @@ Embeddings of documents / passages. Shape: (batch_size, embedding_dim)
#### TableTextRetriever.train

```python
def train(data_dir: str, train_filename: str, dev_filename: str = None, test_filename: str = None, max_samples: int = None, max_processes: int = 128, dev_split: float = 0, batch_size: int = 2, embed_meta_fields: List[str] = ["page_title", "section_title", "caption"], num_hard_negatives: int = 1, num_positives: int = 1, n_epochs: int = 3, evaluate_every: int = 1000, n_gpu: int = 1, learning_rate: float = 1e-5, epsilon: float = 1e-08, weight_decay: float = 0.0, num_warmup_steps: int = 100, grad_acc_steps: int = 1, use_amp: str = None, optimizer_name: str = "AdamW", optimizer_correct_bias: bool = True, save_dir: str = "../saved_models/mm_retrieval", query_encoder_save_dir: str = "query_encoder", passage_encoder_save_dir: str = "passage_encoder", table_encoder_save_dir: str = "table_encoder", checkpoint_root_dir: Path = Path("model_checkpoints"), checkpoint_every: Optional[int] = None, checkpoints_to_keep: int = 3, early_stopping: Optional[EarlyStopping] = None)
def train(data_dir: str,
          train_filename: str,
          dev_filename: str = None,
          test_filename: str = None,
          max_samples: int = None,
          max_processes: int = 128,
          dev_split: float = 0,
          batch_size: int = 2,
          embed_meta_fields: List[str] = [
              "page_title", "section_title", "caption"
          ],
          num_hard_negatives: int = 1,
          num_positives: int = 1,
          n_epochs: int = 3,
          evaluate_every: int = 1000,
          n_gpu: int = 1,
          learning_rate: float = 1e-5,
          epsilon: float = 1e-08,
          weight_decay: float = 0.0,
          num_warmup_steps: int = 100,
          grad_acc_steps: int = 1,
          use_amp: str = None,
          optimizer_name: str = "AdamW",
          optimizer_correct_bias: bool = True,
          save_dir: str = "../saved_models/mm_retrieval",
          query_encoder_save_dir: str = "query_encoder",
          passage_encoder_save_dir: str = "passage_encoder",
          table_encoder_save_dir: str = "table_encoder",
          checkpoint_root_dir: Path = Path("model_checkpoints"),
          checkpoint_every: Optional[int] = None,
          checkpoints_to_keep: int = 3,
          early_stopping: Optional[EarlyStopping] = None)
```

Train a TableTextRetriever model.
@ -1144,7 +1322,10 @@ checkpoint, a subdirectory with the name epoch_{epoch_num}_step_{step_num} is cr
#### TableTextRetriever.save

```python
def save(save_dir: Union[Path, str], query_encoder_dir: str = "query_encoder", passage_encoder_dir: str = "passage_encoder", table_encoder_dir: str = "table_encoder")
def save(save_dir: Union[Path, str],
         query_encoder_dir: str = "query_encoder",
         passage_encoder_dir: str = "passage_encoder",
         table_encoder_dir: str = "table_encoder")
```

Save TableTextRetriever to the specified directory.
@ -1166,7 +1347,20 @@ None

```python
@classmethod
def load(cls, load_dir: Union[Path, str], document_store: BaseDocumentStore, max_seq_len_query: int = 64, max_seq_len_passage: int = 256, max_seq_len_table: int = 256, use_gpu: bool = True, batch_size: int = 16, embed_meta_fields: List[str] = ["name", "section_title", "caption"], use_fast_tokenizers: bool = True, similarity_function: str = "dot_product", query_encoder_dir: str = "query_encoder", passage_encoder_dir: str = "passage_encoder", table_encoder_dir: str = "table_encoder")
def load(cls,
         load_dir: Union[Path, str],
         document_store: BaseDocumentStore,
         max_seq_len_query: int = 64,
         max_seq_len_passage: int = 256,
         max_seq_len_table: int = 256,
         use_gpu: bool = True,
         batch_size: int = 16,
         embed_meta_fields: List[str] = ["name", "section_title", "caption"],
         use_fast_tokenizers: bool = True,
         similarity_function: str = "dot_product",
         query_encoder_dir: str = "query_encoder",
         passage_encoder_dir: str = "passage_encoder",
         table_encoder_dir: str = "table_encoder")
```

Load TableTextRetriever from the specified directory.
@ -1184,7 +1378,21 @@ class EmbeddingRetriever(BaseRetriever)
#### EmbeddingRetriever.\_\_init\_\_

```python
def __init__(document_store: BaseDocumentStore, embedding_model: str, model_version: Optional[str] = None, use_gpu: bool = True, batch_size: int = 32, max_seq_len: int = 512, model_format: Optional[str] = None, pooling_strategy: str = "reduce_mean", emb_extraction_layer: int = -1, top_k: int = 10, progress_bar: bool = True, devices: Optional[List[Union[str, torch.device]]] = None, use_auth_token: Optional[Union[str, bool]] = None, scale_score: bool = True, embed_meta_fields: List[str] = [])
def __init__(document_store: BaseDocumentStore,
             embedding_model: str,
             model_version: Optional[str] = None,
             use_gpu: bool = True,
             batch_size: int = 32,
             max_seq_len: int = 512,
             model_format: Optional[str] = None,
             pooling_strategy: str = "reduce_mean",
             emb_extraction_layer: int = -1,
             top_k: int = 10,
             progress_bar: bool = True,
             devices: Optional[List[Union[str, torch.device]]] = None,
             use_auth_token: Optional[Union[str, bool]] = None,
             scale_score: bool = True,
             embed_meta_fields: List[str] = [])
```

**Arguments**:
@ -1239,7 +1447,13 @@ performance if your titles contain meaningful information for retrieval
#### EmbeddingRetriever.retrieve

```python
def retrieve(query: str, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: Optional[int] = None, index: str = None, headers: Optional[Dict[str, str]] = None, scale_score: bool = None) -> List[Document]
def retrieve(query: str,
             filters: Optional[Dict[str, Union[Dict, List, str, int, float,
                                               bool]]] = None,
             top_k: Optional[int] = None,
             index: str = None,
             headers: Optional[Dict[str, str]] = None,
             scale_score: bool = None) -> List[Document]
```

Scan through documents in DocumentStore and return a small number of documents
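Taken together with `__init__` above, a typical flow looks as follows; the sentence-transformers model name is one common choice, and `model_format` is inferred when not given:

```python
from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import EmbeddingRetriever

document_store = InMemoryDocumentStore(embedding_dim=768)
retriever = EmbeddingRetriever(
    document_store=document_store,
    embedding_model="sentence-transformers/multi-qa-mpnet-base-dot-v1",
    top_k=10,
)
document_store.update_embeddings(retriever)  # precompute document embeddings
docs = retriever.retrieve(query="How does dense retrieval work?")
```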
@ -1323,12 +1537,18 @@ Otherwise raw similarity scores (e.g. cosine or dot_product) will be used.
#### EmbeddingRetriever.retrieve\_batch

```python
def retrieve_batch(queries: List[str], filters: Optional[
    Union[
        Dict[str, Union[Dict, List, str, int, float, bool]],
        List[Dict[str, Union[Dict, List, str, int, float, bool]]],
    ]
] = None, top_k: Optional[int] = None, index: str = None, headers: Optional[Dict[str, str]] = None, batch_size: Optional[int] = None, scale_score: bool = None) -> List[List[Document]]
def retrieve_batch(queries: List[str],
                   filters: Optional[Union[Dict[str, Union[Dict, List, str,
                                                           int, float, bool]],
                                           List[Dict[str,
                                                     Union[Dict, List, str,
                                                           int, float,
                                                           bool]]], ]] = None,
                   top_k: Optional[int] = None,
                   index: str = None,
                   headers: Optional[Dict[str, str]] = None,
                   batch_size: Optional[int] = None,
                   scale_score: bool = None) -> List[List[Document]]
```

Scan through documents in DocumentStore and return a small number of documents
@ -1454,7 +1674,11 @@ Embeddings, one per input document
#### EmbeddingRetriever.train

```python
def train(training_data: List[Dict[str, Any]], learning_rate: float = 2e-5, n_epochs: int = 1, num_warmup_steps: int = None, batch_size: int = 16) -> None
def train(training_data: List[Dict[str, Any]],
          learning_rate: float = 2e-5,
          n_epochs: int = 1,
          num_warmup_steps: int = None,
          batch_size: int = 16) -> None
```

Trains/adapts the underlying embedding model.
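A training sketch for the method above, reusing the `retriever` from the earlier sketch. The dictionary keys used here (`question`, `pos_doc`, `neg_doc`, `score`) are an assumption for illustration; consult the argument documentation for the exact expected format:

```python
# Each entry pairs a question with a positive and a negative document;
# the keys are assumed for illustration.
training_data = [
    {
        "question": "What is dense retrieval?",
        "pos_doc": "Dense retrieval encodes queries and documents as vectors ...",
        "neg_doc": "The weather in Berlin is usually mild in spring.",
        "score": 1.0,
    },
]

retriever.train(training_data=training_data, n_epochs=1, batch_size=16)
```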
@ -1507,7 +1731,22 @@ Xiong, Wenhan, et. al. (2020): "Answering complex open-domain questions with mul
#### MultihopEmbeddingRetriever.\_\_init\_\_

```python
def __init__(document_store: BaseDocumentStore, embedding_model: str, model_version: Optional[str] = None, num_iterations: int = 2, use_gpu: bool = True, batch_size: int = 32, max_seq_len: int = 512, model_format: str = "farm", pooling_strategy: str = "reduce_mean", emb_extraction_layer: int = -1, top_k: int = 10, progress_bar: bool = True, devices: Optional[List[Union[str, torch.device]]] = None, use_auth_token: Optional[Union[str, bool]] = None, scale_score: bool = True, embed_meta_fields: List[str] = [])
def __init__(document_store: BaseDocumentStore,
             embedding_model: str,
             model_version: Optional[str] = None,
             num_iterations: int = 2,
             use_gpu: bool = True,
             batch_size: int = 32,
             max_seq_len: int = 512,
             model_format: str = "farm",
             pooling_strategy: str = "reduce_mean",
             emb_extraction_layer: int = -1,
             top_k: int = 10,
             progress_bar: bool = True,
             devices: Optional[List[Union[str, torch.device]]] = None,
             use_auth_token: Optional[Union[str, bool]] = None,
             scale_score: bool = True,
             embed_meta_fields: List[str] = [])
```

**Arguments**:
@ -1563,7 +1802,13 @@ performance if your titles contain meaningful information for retrieval
#### MultihopEmbeddingRetriever.retrieve

```python
def retrieve(query: str, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: Optional[int] = None, index: str = None, headers: Optional[Dict[str, str]] = None, scale_score: bool = None) -> List[Document]
def retrieve(query: str,
             filters: Optional[Dict[str, Union[Dict, List, str, int, float,
                                               bool]]] = None,
             top_k: Optional[int] = None,
             index: str = None,
             headers: Optional[Dict[str, str]] = None,
             scale_score: bool = None) -> List[Document]
```

Scan through documents in DocumentStore and return a small number of documents
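Construction mirrors `EmbeddingRetriever`; the distinguishing knob is `num_iterations`, which sets how many retrieval hops are run, each hop conditioning the query representation on the documents found so far. The embedding model below is illustrative:

```python
from haystack.nodes import MultihopEmbeddingRetriever

retriever = MultihopEmbeddingRetriever(
    document_store=document_store,  # assumed to be populated and embedded
    embedding_model="sentence-transformers/multi-qa-mpnet-base-dot-v1",  # illustrative
    num_iterations=2,  # two retrieval hops
)
docs = retriever.retrieve(query="In which city was the inventor of the transistor born?")
```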
@ -1647,12 +1892,18 @@ Otherwise raw similarity scores (e.g. cosine or dot_product) will be used.
#### MultihopEmbeddingRetriever.retrieve\_batch

```python
def retrieve_batch(queries: List[str], filters: Optional[
    Union[
        Dict[str, Union[Dict, List, str, int, float, bool]],
        List[Dict[str, Union[Dict, List, str, int, float, bool]]],
    ]
] = None, top_k: Optional[int] = None, index: str = None, headers: Optional[Dict[str, str]] = None, batch_size: Optional[int] = None, scale_score: bool = None) -> List[List[Document]]
def retrieve_batch(queries: List[str],
                   filters: Optional[Union[Dict[str, Union[Dict, List, str,
                                                           int, float, bool]],
                                           List[Dict[str,
                                                     Union[Dict, List, str,
                                                           int, float,
                                                           bool]]], ]] = None,
                   top_k: Optional[int] = None,
                   index: str = None,
                   headers: Optional[Dict[str, str]] = None,
                   batch_size: Optional[int] = None,
                   scale_score: bool = None) -> List[List[Document]]
```

Scan through documents in DocumentStore and return a small number of documents
@ -1759,7 +2010,10 @@ The generated SPARQL query is executed on a knowledge graph.
#### Text2SparqlRetriever.\_\_init\_\_

```python
def __init__(knowledge_graph, model_name_or_path, top_k: int = 1, use_auth_token: Optional[Union[str, bool]] = None)
def __init__(knowledge_graph,
             model_name_or_path,
             top_k: int = 1,
             use_auth_token: Optional[Union[str, bool]] = None)
```

Init the Retriever by providing a knowledge graph and a pre-trained BART model

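A wiring sketch for the constructor above; the GraphDB index name and the BART model path are placeholders:

```python
from haystack.document_stores import GraphDBKnowledgeGraph
from haystack.nodes import Text2SparqlRetriever

kg = GraphDBKnowledgeGraph(index="tutorial_10_index")  # placeholder index name
retriever = Text2SparqlRetriever(
    knowledge_graph=kg,
    model_name_or_path="../saved_models/hp_v3.4",  # placeholder model path
    top_k=1,
)
result = retriever.retrieve(query="What is the hair color of Hermione?")
```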
@ -18,7 +18,8 @@ Abstract class for Summarizer

```python
@abstractmethod
def predict(documents: List[Document], generate_single_summary: Optional[bool] = None) -> List[Document]
def predict(documents: List[Document],
            generate_single_summary: Optional[bool] = None) -> List[Document]
```

Abstract method for creating a summary.
@ -87,7 +88,19 @@ See the up-to-date list of available models on
#### TransformersSummarizer.\_\_init\_\_

```python
def __init__(model_name_or_path: str = "google/pegasus-xsum", model_version: Optional[str] = None, tokenizer: Optional[str] = None, max_length: int = 200, min_length: int = 5, use_gpu: bool = True, clean_up_tokenization_spaces: bool = True, separator_for_single_summary: str = " ", generate_single_summary: bool = False, batch_size: int = 16, progress_bar: bool = True, use_auth_token: Optional[Union[str, bool]] = None, devices: Optional[List[Union[str, torch.device]]] = None)
def __init__(model_name_or_path: str = "google/pegasus-xsum",
             model_version: Optional[str] = None,
             tokenizer: Optional[str] = None,
             max_length: int = 200,
             min_length: int = 5,
             use_gpu: bool = True,
             clean_up_tokenization_spaces: bool = True,
             separator_for_single_summary: str = " ",
             generate_single_summary: bool = False,
             batch_size: int = 16,
             progress_bar: bool = True,
             use_auth_token: Optional[Union[str, bool]] = None,
             devices: Optional[List[Union[str, torch.device]]] = None)
```

Load a Summarization model from Transformers.
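A usage sketch combining the constructor with `predict()` (document content is illustrative):

```python
from haystack import Document
from haystack.nodes import TransformersSummarizer

summarizer = TransformersSummarizer(model_name_or_path="google/pegasus-xsum")

docs = [
    Document(
        content="PG&E stated it scheduled the blackouts in response to forecasts "
        "for high winds amid dry conditions. The aim is to reduce the risk of wildfires."
    )
]
summaries = summarizer.predict(documents=docs)
# Each returned Document carries a summary; the original, not summarized
# text is preserved alongside it (see the note above).
```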
@ -129,7 +142,8 @@ parameter is not used and a single cpu device is used for inference.
#### TransformersSummarizer.predict

```python
def predict(documents: List[Document], generate_single_summary: Optional[bool] = None) -> List[Document]
def predict(documents: List[Document],
            generate_single_summary: Optional[bool] = None) -> List[Document]
```

Produce the summarization from the supplied documents.
@ -154,7 +168,11 @@ the original, not summarized text
#### TransformersSummarizer.predict\_batch

```python
def predict_batch(documents: Union[List[Document], List[List[Document]]], generate_single_summary: Optional[bool] = None, batch_size: Optional[int] = None) -> Union[List[Document], List[List[Document]]]
def predict_batch(
    documents: Union[List[Document], List[List[Document]]],
    generate_single_summary: Optional[bool] = None,
    batch_size: Optional[int] = None
) -> Union[List[Document], List[List[Document]]]
```

Produce the summarization from the supplied documents.

@ -18,7 +18,13 @@ Abstract class for a Translator component that translates either a query or a do

```python
@abstractmethod
def translate(results: List[Dict[str, Any]] = None, query: Optional[str] = None, documents: Optional[Union[List[Document], List[Answer], List[str], List[Dict[str, Any]]]] = None, dict_key: Optional[str] = None) -> Union[str, List[Document], List[Answer], List[str], List[Dict[str, Any]]]
def translate(
    results: List[Dict[str, Any]] = None,
    query: Optional[str] = None,
    documents: Optional[Union[List[Document], List[Answer], List[str],
                              List[Dict[str, Any]]]] = None,
    dict_key: Optional[str] = None
) -> Union[str, List[Document], List[Answer], List[str], List[Dict[str, Any]]]
```

Translate the passed query or a list of documents from language A to B.
@ -28,7 +34,12 @@ Translate the passed query or a list of documents from language A to B.
#### BaseTranslator.run

```python
def run(results: List[Dict[str, Any]] = None, query: Optional[str] = None, documents: Optional[Union[List[Document], List[Answer], List[str], List[Dict[str, Any]]]] = None, answers: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None, dict_key: Optional[str] = None)
def run(results: List[Dict[str, Any]] = None,
        query: Optional[str] = None,
        documents: Optional[Union[List[Document], List[Answer], List[str],
                                  List[Dict[str, Any]]]] = None,
        answers: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
        dict_key: Optional[str] = None)
```

Method that gets executed when this class is used as a Node in a Haystack Pipeline
@ -68,7 +79,14 @@ We currently recommend using OPUS models (see __init__() for details)
#### TransformersTranslator.\_\_init\_\_

```python
def __init__(model_name_or_path: str, tokenizer_name: Optional[str] = None, max_seq_len: Optional[int] = None, clean_up_tokenization_spaces: Optional[bool] = True, use_gpu: bool = True, progress_bar: bool = True, use_auth_token: Optional[Union[str, bool]] = None, devices: Optional[List[Union[str, torch.device]]] = None)
def __init__(model_name_or_path: str,
             tokenizer_name: Optional[str] = None,
             max_seq_len: Optional[int] = None,
             clean_up_tokenization_spaces: Optional[bool] = True,
             use_gpu: bool = True,
             progress_bar: bool = True,
             use_auth_token: Optional[Union[str, bool]] = None,
             devices: Optional[List[Union[str, torch.device]]] = None)
```

Initialize the translator with a model that fits your targeted languages. While we support all seq2seq
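A sketch with an OPUS English-to-German model; any seq2seq translation checkpoint with a matching tokenizer should work the same way:

```python
from haystack.nodes import TransformersTranslator

translator = TransformersTranslator(model_name_or_path="Helsinki-NLP/opus-mt-en-de")
translated_query = translator.translate(query="What is the capital of Germany?")
```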
@ -109,7 +127,13 @@ parameter is not used and a single cpu device is used for inference.
#### TransformersTranslator.translate

```python
def translate(results: Optional[List[Dict[str, Any]]] = None, query: Optional[str] = None, documents: Optional[Union[List[Document], List[Answer], List[str], List[Dict[str, Any]]]] = None, dict_key: Optional[str] = None) -> Union[str, List[Document], List[Answer], List[str], List[Dict[str, Any]]]
def translate(
    results: Optional[List[Dict[str, Any]]] = None,
    query: Optional[str] = None,
    documents: Optional[Union[List[Document], List[Answer], List[str],
                              List[Dict[str, Any]]]] = None,
    dict_key: Optional[str] = None
) -> Union[str, List[Document], List[Answer], List[str], List[Dict[str, Any]]]
```

Run the actual translation. You can supply a query or a list of documents. Whatever is supplied will be translated.
@ -126,7 +150,14 @@ Run the actual translation. You can supply a query or a list of documents. Whate
#### TransformersTranslator.translate\_batch

```python
def translate_batch(queries: Optional[List[str]] = None, documents: Optional[Union[List[Document], List[Answer], List[List[Document]], List[List[Answer]]]] = None, batch_size: Optional[int] = None) -> List[Union[str, List[Document], List[Answer], List[str], List[Dict[str, Any]]]]
def translate_batch(
    queries: Optional[List[str]] = None,
    documents: Optional[Union[List[Document], List[Answer],
                              List[List[Document]],
                              List[List[Answer]]]] = None,
    batch_size: Optional[int] = None
) -> List[Union[str, List[Document], List[Answer], List[str], List[Dict[
        str, Any]]]]
```

Run the actual translation. You can supply a single query, a list of queries or a list (of lists) of documents.

@ -7,7 +7,9 @@
#### print\_answers

```python
def print_answers(results: dict, details: str = "all", max_text_len: Optional[int] = None)
def print_answers(results: dict,
                  details: str = "all",
                  max_text_len: Optional[int] = None)
```

Utility function to print results of Haystack pipelines
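A sketch of pretty-printing a pipeline result; the pipeline itself is assumed to have been built elsewhere:

```python
from haystack.utils import print_answers

# `pipe` is assumed to be e.g. an ExtractiveQAPipeline built elsewhere.
result = pipe.run(query="Who is the father of Arya Stark?")
print_answers(result, details="minimum")  # other detail levels: "medium", "all"
```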
@ -27,7 +29,10 @@ None
#### print\_documents

```python
def print_documents(results: dict, max_text_len: Optional[int] = None, print_name: bool = True, print_meta: bool = False)
def print_documents(results: dict,
                    max_text_len: Optional[int] = None,
                    print_name: bool = True,
                    print_meta: bool = False)
```

Utility that prints a compressed representation of the documents returned by a pipeline.
@ -90,7 +95,12 @@ Convert the export from the labeling UI to the SQuAD format for training.
#### convert\_files\_to\_docs

```python
def convert_files_to_docs(dir_path: str, clean_func: Optional[Callable] = None, split_paragraphs: bool = False, encoding: Optional[str] = None, id_hash_keys: Optional[List[str]] = None) -> List[Document]
def convert_files_to_docs(
    dir_path: str,
    clean_func: Optional[Callable] = None,
    split_paragraphs: bool = False,
    encoding: Optional[str] = None,
    id_hash_keys: Optional[List[str]] = None) -> List[Document]
```

Convert all files (.txt, .pdf, .docx) in the sub-directories of the given path to Documents that can be written to a
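A sketch of the helper in use; the directory path is a placeholder and `clean_wiki_text` is one of the cleaning functions shipped in `haystack.utils`:

```python
from haystack.utils import clean_wiki_text, convert_files_to_docs

docs = convert_files_to_docs(
    dir_path="data/my_corpus",   # placeholder directory of .txt/.pdf/.docx files
    clean_func=clean_wiki_text,
    split_paragraphs=True,
)
document_store.write_documents(docs)  # assumes a document store from earlier
```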
@ -114,7 +124,13 @@ If you do this, the Document ID will be generated by using the content and the d
#### tika\_convert\_files\_to\_docs

```python
def tika_convert_files_to_docs(dir_path: str, clean_func: Optional[Callable] = None, split_paragraphs: bool = False, merge_short: bool = True, merge_lowercase: bool = True, id_hash_keys: Optional[List[str]] = None) -> List[Document]
def tika_convert_files_to_docs(
    dir_path: str,
    clean_func: Optional[Callable] = None,
    split_paragraphs: bool = False,
    merge_short: bool = True,
    merge_lowercase: bool = True,
    id_hash_keys: Optional[List[str]] = None) -> List[Document]
```

Convert all files (.txt, .pdf) in the sub-directories of the given path to Documents that can be written to a
@ -320,7 +336,13 @@ EarlyStopping class instead as long as it implements the method `check_stopping(
#### EarlyStopping.\_\_init\_\_

```python
def __init__(head: int = 0, metric: Union[str, Callable] = "loss", save_dir: Optional[str] = None, mode: Literal["min", "max"] = "min", patience: int = 0, min_delta: float = 0.001, min_evals: int = 0)
def __init__(head: int = 0,
             metric: Union[str, Callable] = "loss",
             save_dir: Optional[str] = None,
             mode: Literal["min", "max"] = "min",
             patience: int = 0,
             min_delta: float = 0.001,
             min_evals: int = 0)
```

**Arguments**:

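Beyond the argument list (which the diff truncates here), a sketch of plugging `EarlyStopping` into a training run such as `DensePassageRetriever.train()` above; the import path is an assumption:

```python
from haystack.utils import EarlyStopping  # import path assumed

early_stopping = EarlyStopping(
    metric="loss",    # track the dev loss
    mode="min",
    patience=3,       # stop after 3 evaluations without improvement
    save_dir="saved_models/dpr_best",
)

retriever.train(
    data_dir="data/dpr_training",   # placeholder path
    train_filename="train.json",
    dev_filename="dev.json",
    early_stopping=early_stopping,
)
```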
@ -29,7 +29,7 @@ def launch_tika(sleep=15, delete_existing=False):
        _ = subprocess.run([f"docker rm --force {TIKA_CONTAINER_NAME}"], shell=True, stdout=subprocess.DEVNULL)
    status = subprocess.run(
        [
            f"docker start {TIKA_CONTAINER_NAME} > /dev/null 2>&1 || docker run -p 9998:9998 --name {TIKA_CONTAINER_NAME} apache/tika:1.24.1"
            f"docker start {TIKA_CONTAINER_NAME} > /dev/null 2>&1 || docker run -p 9998:9998 --name {TIKA_CONTAINER_NAME} apache/tika:1.28.4"
        ],
        shell=True,
    )
@ -102,7 +102,7 @@ class TikaConverter(BaseConverter):
        if ping.status_code != 200:
            raise Exception(
                f"Apache Tika server is not reachable at the URL '{tika_url}'. To run it locally "
                f"with Docker, execute: 'docker run -p 9998:9998 apache/tika:1.24.1'"
                f"with Docker, execute: 'docker run -p 9998:9998 apache/tika:1.28.4'"
            )
        self.tika_url = tika_url
        super().__init__(remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages)

@ -46,7 +46,7 @@ def launch_opensearch(sleep=15, delete_existing=False):
        _ = subprocess.run([f"docker rm --force {OPENSEARCH_CONTAINER_NAME}"], shell=True, stdout=subprocess.DEVNULL)
    status = subprocess.run(
        [
            f'docker start {OPENSEARCH_CONTAINER_NAME} > /dev/null 2>&1 || docker run -d -p 9201:9200 -p 9600:9600 -e "discovery.type=single-node" --name {OPENSEARCH_CONTAINER_NAME} opensearchproject/opensearch:1.2.4'
            f'docker start {OPENSEARCH_CONTAINER_NAME} > /dev/null 2>&1 || docker run -d -p 9201:9200 -p 9600:9600 -e "discovery.type=single-node" --name {OPENSEARCH_CONTAINER_NAME} opensearchproject/opensearch:1.3.5'
        ],
        shell=True,
    )
@ -65,7 +65,7 @@ def launch_weaviate(sleep=15):
    logger.debug("Starting Weaviate ...")
    status = subprocess.run(
        [
            f"docker start {WEAVIATE_CONTAINER_NAME} > /dev/null 2>&1 || docker run -d -p 8080:8080 --env AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED='true' --env PERSISTENCE_DATA_PATH='/var/lib/weaviate' --name {WEAVIATE_CONTAINER_NAME} semitechnologies/weaviate:1.11.0"
            f"docker start {WEAVIATE_CONTAINER_NAME} > /dev/null 2>&1 || docker run -d -p 8080:8080 --env AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED='true' --env PERSISTENCE_DATA_PATH='/var/lib/weaviate' --name {WEAVIATE_CONTAINER_NAME} semitechnologies/weaviate:1.14.0"
        ],
        shell=True,
    )

@ -62,9 +62,9 @@ dependencies = [
    "mmh3", # fast hashing function (murmurhash3)
    "quantulum3", # quantities extraction from text
    "posthog", # telemetry
    "azure-ai-formrecognizer==3.2.0b2", # forms reader
    "azure-ai-formrecognizer>=3.2.0b2", # forms reader
    # audio's espnet-model-zoo requires huggingface-hub version <0.8 while we need >=0.5 to be able to use create_repo in FARMReader
    "huggingface-hub<0.8.0,>=0.5.0",
    "huggingface-hub>=0.5.0",

    # Preprocessing
    "more_itertools", # for windowing
@ -168,9 +168,9 @@ preprocessing = [
    "python-magic-bin; platform_system == 'Windows'", # Needs to be installed without python-magic, otherwise Windows CI gets stuck.
]
ocr = [
    "pytesseract==0.3.7",
    "pytesseract>0.3.7",
    "pillow",
    "pdf2image==1.14.0",
    "pdf2image>1.14",
]
onnx = [
    "onnxruntime",
@ -207,14 +207,7 @@ dev = [
    # Code formatting
    "black[jupyter]==22.6.0",
    # Documentation
    "pydoc-markdown==4.5.1", # FIXME Unpin!
    # azure-core is a dependency of azure-ai-formrecognizer
    # In order to stop malicious pip backtracking during pip install farm-haystack[all] documented in https://github.com/deepset-ai/haystack/issues/2280
    # we have to resolve a dependency version conflict ourself.
    # azure-core>=1.23 conflicts with pydoc-markdown's dependency on databind>=1.5.0 which itself requires typing-extensions<4.0.0
    # azure-core>=1.23 needs typing-extensions>=4.0.1
    # pip unfortunately backtracks into the databind direction ultimately getting lost.
    "azure-core<1.23",
    "pydoc-markdown",
    "mkdocs",
    "jupytercontrib",
    "watchdog", # ==1.0.2

@ -423,7 +423,7 @@ def weaviate_fixture():
        print("Starting Weaviate servers ...")
        status = subprocess.run(["docker rm haystack_test_weaviate"], shell=True)
        status = subprocess.run(
            ["docker run -d --name haystack_test_weaviate -p 8080:8080 semitechnologies/weaviate:1.11.0"], shell=True
            ["docker run -d --name haystack_test_weaviate -p 8080:8080 semitechnologies/weaviate:1.14.1"], shell=True
        )
        if status.returncode:
            raise Exception("Failed to launch Weaviate. Please check docker container logs.")
@ -460,7 +460,7 @@ def tika_fixture():
            raise Exception("Unable to connect Tika. Please check tika endpoint {0}.".format(tika_url))
    except:
        print("Starting Tika ...")
        status = subprocess.run(["docker run -d --name tika -p 9998:9998 apache/tika:1.24.1"], shell=True)
        status = subprocess.run(["docker run -d --name tika -p 9998:9998 apache/tika:1.28.4"], shell=True)
        if status.returncode:
            raise Exception("Failed to launch Tika. Please check docker container logs.")
        time.sleep(30)