mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-12-24 13:38:53 +00:00
Automate updates docstrings tutorials (#1461)
* remove not needed githab actions and reactivate docstrings and tutorial generation * test workflow * update pydoc version * update python version * update watchdog * move to latest version pydoc-markdown * remove version check * Add latest docstring and tutorial changes * remove test workflow * test for param docstrings * pin pydoc-markdown version * add test workflow * pin watchdog version * Add latest docstring and tutorial changes * update original workflow and delete test Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
This commit is contained in:
parent
172de1c05f
commit
39845c0624
24
.github/workflows/deploy_website.yml
vendored
24
.github/workflows/deploy_website.yml
vendored
@ -1,24 +0,0 @@
|
||||
name: Deploy website
|
||||
|
||||
# Controls when the action will run. Triggers the workflow on push
|
||||
# events but only for the master branch
|
||||
on:
|
||||
push:
|
||||
branches: [ master, benchmarks ]
|
||||
|
||||
jobs:
|
||||
# This workflow contains a single job called "build"
|
||||
build:
|
||||
# The type of runner that the job will run on
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
|
||||
# Creates dispatch event for haystack-website repo
|
||||
- name: Repository Dispatch
|
||||
uses: peter-evans/repository-dispatch@v1
|
||||
with:
|
||||
token: ${{ secrets.PUBLIC_REPO_ACCESS_TOKEN }}
|
||||
repository: deepset-ai/haystack-website
|
||||
event-type: deploy-website
|
||||
client-payload: '{}'
|
||||
26
.github/workflows/deploy_website_staging.yml
vendored
26
.github/workflows/deploy_website_staging.yml
vendored
@ -1,26 +0,0 @@
|
||||
name: Deploy website
|
||||
|
||||
# Controls when the action will run. Triggers the workflow on push
|
||||
# events but only for the master branch
|
||||
on:
|
||||
push:
|
||||
branches-ignore:
|
||||
- master
|
||||
- benchmarks
|
||||
|
||||
jobs:
|
||||
# This workflow contains a single job called "build"
|
||||
build:
|
||||
# The type of runner that the job will run on
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
|
||||
# Creates dispatch event for haystack-website repo
|
||||
- name: Repository Dispatch
|
||||
uses: peter-evans/repository-dispatch@v1
|
||||
with:
|
||||
token: ${{ secrets.PUBLIC_REPO_ACCESS_TOKEN }}
|
||||
repository: deepset-ai/haystack-website
|
||||
event-type: deploy-website-staging
|
||||
client-payload: '{"ref": "${{ github.ref }}"}'
|
||||
7
.github/workflows/update_docs.yml
vendored
7
.github/workflows/update_docs.yml
vendored
@ -20,17 +20,18 @@ jobs:
|
||||
persist-credentials: false # otherwise, the token used is the GITHUB_TOKEN, instead of your personal token
|
||||
fetch-depth: 0 # otherwise, you will fail to push refs to dest repo
|
||||
|
||||
- name: Set up Python 3.7
|
||||
- name: Set up Python 3.8.10
|
||||
uses: actions/setup-python@v2
|
||||
with:
|
||||
python-version: 3.7
|
||||
python-version: 3.8.10
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
pip install 'pydoc-markdown>=3.0.0,<4.0.0'
|
||||
pip install pydoc-markdown==3.11.0
|
||||
pip install mkdocs
|
||||
pip install jupytercontrib
|
||||
pip install watchdog==1.0.2
|
||||
|
||||
# Generates the docstrings and tutorials so that we have the latest for the deployment
|
||||
- name: Generate Docstrings and Tutorials
|
||||
|
||||
@ -14,10 +14,9 @@ Crawl texts from a website so that we can use them later in Haystack as a corpus
|
||||
```python
|
||||
| from haystack.connector import Crawler
|
||||
|
|
||||
| crawler = Crawler()
|
||||
| # crawl Haystack docs, i.e. all pages that include haystack.deepset.ai/docs/
|
||||
| docs = crawler.crawl(urls=["https://haystack.deepset.ai/docs/latest/get_startedmd"],
|
||||
| output_dir="crawled_files",
|
||||
| crawler = Crawler(output_dir="crawled_files")
|
||||
| # crawl Haystack docs, i.e. all pages that include haystack.deepset.ai/overview/
|
||||
| docs = crawler.crawl(urls=["https://haystack.deepset.ai/overview/get-started"],
|
||||
| filter_urls= ["haystack\.deepset\.ai\/docs\/"])
|
||||
```
|
||||
|
||||
@ -35,10 +34,10 @@ Init object with basic params for crawling (can be overwritten later).
|
||||
- `output_dir`: Path for the directory to store files
|
||||
- `urls`: List of http(s) address(es) (can also be supplied later when calling crawl())
|
||||
- `crawler_depth`: How many sublinks to follow from the initial list of URLs. Current options:
|
||||
0: Only initial list of urls
|
||||
1: Follow links found on the initial URLs (but no further)
|
||||
0: Only initial list of urls
|
||||
1: Follow links found on the initial URLs (but no further)
|
||||
- `filter_urls`: Optional list of regular expressions that the crawled URLs must comply with.
|
||||
All URLs not matching at least one of the regular expressions will be dropped.
|
||||
All URLs not matching at least one of the regular expressions will be dropped.
|
||||
- `overwrite_existing_files`: Whether to overwrite existing files in output_dir with new content
|
||||
|
||||
<a name="crawler.Crawler.crawl"></a>
|
||||
@ -73,7 +72,7 @@ List of paths where the crawled webpages got stored
|
||||
#### run
|
||||
|
||||
```python
|
||||
| run(output_dir: Union[str, Path, None] = None, urls: Optional[List[str]] = None, crawler_depth: Optional[int] = None, filter_urls: Optional[List] = None, overwrite_existing_files: Optional[bool] = None, return_documents: Optional[bool] = False, **kwargs) -> Tuple[Dict, str]
|
||||
| run(output_dir: Union[str, Path, None] = None, urls: Optional[List[str]] = None, crawler_depth: Optional[int] = None, filter_urls: Optional[List] = None, overwrite_existing_files: Optional[bool] = None, return_documents: Optional[bool] = False) -> Tuple[Dict, str]
|
||||
```
|
||||
|
||||
Method to be executed when the Crawler is used as a Node within a Haystack pipeline.
|
||||
@ -88,7 +87,7 @@ Method to be executed when the Crawler is used as a Node within a Haystack pipel
|
||||
- `filter_urls`: Optional list of regular expressions that the crawled URLs must comply with.
|
||||
All URLs not matching at least one of the regular expressions will be dropped.
|
||||
- `overwrite_existing_files`: Whether to overwrite existing files in output_dir with new content
|
||||
- `return_documents`: Return json files content
|
||||
- `return_documents`: Return json files content
|
||||
|
||||
**Returns**:
|
||||
|
||||
|
||||
@ -131,7 +131,7 @@ class ElasticsearchDocumentStore(BaseDocumentStore)
|
||||
#### \_\_init\_\_
|
||||
|
||||
```python
|
||||
| __init__(host: Union[str, List[str]] = "localhost", port: Union[int, List[int]] = 9200, username: str = "", password: str = "", api_key_id: Optional[str] = None, api_key: Optional[str] = None, aws4auth=None, index: str = "document", label_index: str = "label", search_fields: Union[str, list] = "text", text_field: str = "text", name_field: str = "name", embedding_field: str = "embedding", embedding_dim: int = 768, custom_mapping: Optional[dict] = None, excluded_meta_data: Optional[list] = None, faq_question_field: Optional[str] = None, analyzer: str = "standard", scheme: str = "http", ca_certs: Optional[str] = None, verify_certs: bool = True, create_index: bool = True, refresh_type: str = "wait_for", similarity="dot_product", timeout=30, return_embedding: bool = False, duplicate_documents: str = 'overwrite')
|
||||
| __init__(host: Union[str, List[str]] = "localhost", port: Union[int, List[int]] = 9200, username: str = "", password: str = "", api_key_id: Optional[str] = None, api_key: Optional[str] = None, aws4auth=None, index: str = "document", label_index: str = "label", search_fields: Union[str, list] = "text", text_field: str = "text", name_field: str = "name", embedding_field: str = "embedding", embedding_dim: int = 768, custom_mapping: Optional[dict] = None, excluded_meta_data: Optional[list] = None, faq_question_field: Optional[str] = None, analyzer: str = "standard", scheme: str = "http", ca_certs: Optional[str] = None, verify_certs: bool = True, create_index: bool = True, refresh_type: str = "wait_for", similarity="dot_product", timeout=30, return_embedding: bool = False, duplicate_documents: str = 'overwrite', index_type: str = "flat")
|
||||
```
|
||||
|
||||
A DocumentStore using Elasticsearch to store and query the documents for our search.
|
||||
@ -181,6 +181,8 @@ A DocumentStore using Elasticsearch to store and query the documents for our sea
|
||||
overwrite: Update any existing documents with the same ID when adding documents.
|
||||
fail: an error is raised if the document ID of the document being added already
|
||||
exists.
|
||||
- `index_type`: The type of index to be created. Choose from 'flat' and 'hnsw'. Currently the
|
||||
ElasticsearchDocumentStore does not support HNSW but OpenDistroElasticsearchDocumentStore does.
|
||||
|
||||
<a name="elasticsearch.ElasticsearchDocumentStore.get_document_by_id"></a>
|
||||
#### get\_document\_by\_id
|
||||
@ -467,17 +469,48 @@ Delete documents in an index. All documents are deleted if no filters are passed
|
||||
|
||||
None
|
||||
|
||||
<a name="elasticsearch.OpenSearchDocumentStore"></a>
|
||||
## OpenSearchDocumentStore Objects
|
||||
|
||||
```python
|
||||
class OpenSearchDocumentStore(ElasticsearchDocumentStore)
|
||||
```
|
||||
|
||||
Document Store using OpenSearch (https://opensearch.org/). It is compatible with the AWS Elasticsearch Service.
|
||||
|
||||
In addition to native Elasticsearch query & filtering, it provides efficient vector similarity search using
|
||||
the KNN plugin that can scale to a large number of documents.
|
||||
|
||||
<a name="elasticsearch.OpenSearchDocumentStore.query_by_embedding"></a>
|
||||
#### query\_by\_embedding
|
||||
|
||||
```python
|
||||
| query_by_embedding(query_emb: np.ndarray, filters: Optional[Dict[str, List[str]]] = None, top_k: int = 10, index: Optional[str] = None, return_embedding: Optional[bool] = None) -> List[Document]
|
||||
```
|
||||
|
||||
Find the document that is most similar to the provided `query_emb` by using a vector similarity metric.
|
||||
|
||||
**Arguments**:
|
||||
|
||||
- `query_emb`: Embedding of the query (e.g. gathered from DPR)
|
||||
- `filters`: Optional filters to narrow down the search space.
|
||||
Example: {"name": ["some", "more"], "category": ["only_one"]}
|
||||
- `top_k`: How many documents to return
|
||||
- `index`: Index name for storing the docs and metadata
|
||||
- `return_embedding`: To return document embedding
|
||||
|
||||
**Returns**:
|
||||
|
||||
|
||||
|
||||
<a name="elasticsearch.OpenDistroElasticsearchDocumentStore"></a>
|
||||
## OpenDistroElasticsearchDocumentStore Objects
|
||||
|
||||
```python
|
||||
class OpenDistroElasticsearchDocumentStore(ElasticsearchDocumentStore)
|
||||
class OpenDistroElasticsearchDocumentStore(OpenSearchDocumentStore)
|
||||
```
|
||||
|
||||
Document Store using the Open Distro for Elasticsearch. It is compatible with the AWS Elasticsearch Service.
|
||||
|
||||
In addition to native Elasticsearch query & filtering, it provides efficient vector similarity search using
|
||||
the KNN plugin that can scale to a large number of documents.
|
||||
A DocumentStore which has an Open Distro for Elasticsearch service behind it.
|
||||
|
||||
<a name="memory"></a>
|
||||
# Module memory
|
||||
@ -1717,3 +1750,21 @@ Delete documents in an index. All documents are deleted if no filters are passed
|
||||
|
||||
None
|
||||
|
||||
<a name="weaviate.WeaviateDocumentStore.delete_documents"></a>
|
||||
#### delete\_documents
|
||||
|
||||
```python
|
||||
| delete_documents(index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None)
|
||||
```
|
||||
|
||||
Delete documents in an index. All documents are deleted if no filters are passed.
|
||||
|
||||
**Arguments**:
|
||||
|
||||
- `index`: Index name to delete the document from.
|
||||
- `filters`: Optional filters to narrow down the documents to be deleted.
|
||||
|
||||
**Returns**:
|
||||
|
||||
None
|
||||
|
||||
|
||||
@ -5,7 +5,7 @@
|
||||
## EvalDocuments Objects
|
||||
|
||||
```python
|
||||
class EvalDocuments()
|
||||
class EvalDocuments(BaseComponent)
|
||||
```
|
||||
|
||||
This is a pipeline node that should be placed after a node that returns a List of Document, e.g., Retriever or
|
||||
@ -13,13 +13,13 @@ Ranker, in order to assess its performance. Performance metrics are stored in th
|
||||
sample passes through it. To view the results of the evaluation, call EvalDocuments.print(). Note that results
|
||||
from this Node may differ from that when calling Retriever.eval() since that is a closed domain evaluation. Have
|
||||
a look at our evaluation tutorial for more info about open vs closed domain eval (
|
||||
https://haystack.deepset.ai/docs/latest/tutorial5md).
|
||||
https://haystack.deepset.ai/tutorials/evaluation).
|
||||
|
||||
<a name="eval.EvalDocuments.__init__"></a>
|
||||
#### \_\_init\_\_
|
||||
|
||||
```python
|
||||
| __init__(debug: bool = False, open_domain: bool = True, top_k_eval_documents: int = 10, name="EvalDocuments")
|
||||
| __init__(debug: bool = False, open_domain: bool = True, top_k: int = 10)
|
||||
```
|
||||
|
||||
**Arguments**:
|
||||
@ -33,7 +33,7 @@ https://haystack.deepset.ai/docs/latest/tutorial5md).
|
||||
#### run
|
||||
|
||||
```python
|
||||
| run(documents, labels: dict, top_k_eval_documents: Optional[int] = None, **kwargs)
|
||||
| run(documents: List[Document], labels: List[Label], top_k: Optional[int] = None)
|
||||
```
|
||||
|
||||
Run this node on one sample and its labels
|
||||
@ -51,7 +51,7 @@ Print the evaluation results
|
||||
## EvalAnswers Objects
|
||||
|
||||
```python
|
||||
class EvalAnswers()
|
||||
class EvalAnswers(BaseComponent)
|
||||
```
|
||||
|
||||
This is a pipeline node that should be placed after a Reader in order to assess the performance of the Reader
|
||||
@ -59,26 +59,37 @@ individually or to assess the extractive QA performance of the whole pipeline. P
|
||||
this class and updated as each sample passes through it. To view the results of the evaluation, call EvalAnswers.print().
|
||||
Note that results from this Node may differ from that when calling Reader.eval()
|
||||
since that is a closed domain evaluation. Have a look at our evaluation tutorial for more info about
|
||||
open vs closed domain eval (https://haystack.deepset.ai/docs/latest/tutorial5md).
|
||||
open vs closed domain eval (https://haystack.deepset.ai/tutorials/evaluation).
|
||||
|
||||
<a name="eval.EvalAnswers.__init__"></a>
|
||||
#### \_\_init\_\_
|
||||
|
||||
```python
|
||||
| __init__(skip_incorrect_retrieval: bool = True, open_domain: bool = True, debug: bool = False)
|
||||
| __init__(skip_incorrect_retrieval: bool = True, open_domain: bool = True, sas_model: str = None, debug: bool = False)
|
||||
```
|
||||
|
||||
**Arguments**:
|
||||
|
||||
- `skip_incorrect_retrieval`: When set to True, this eval will ignore the cases where the retriever returned no correct documents
|
||||
- `open_domain`: When True, extracted answers are evaluated purely on string similarity rather than the position of the extracted answer
|
||||
- `sas_model`: Name or path of "Semantic Answer Similarity (SAS) model". When set, the model will be used to calculate similarity between predictions and labels and generate the SAS metric.
|
||||
The SAS metric correlates better with human judgement of correct answers as it does not rely on string overlaps.
|
||||
Example: Prediction = "30%", Label = "thirty percent", EM and F1 would be overly pessimistic with both being 0, while SAS paints a more realistic picture.
|
||||
More info in the paper: https://arxiv.org/abs/2108.06130
|
||||
Models:
|
||||
- You can use Bi Encoders (sentence transformers) or cross encoders trained on Semantic Textual Similarity (STS) data.
|
||||
Not all cross encoders can be used because of different return types.
|
||||
If you use custom cross encoders please make sure they work with sentence_transformers.CrossEncoder class
|
||||
- Good default for multiple languages: "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
|
||||
- Large, powerful, but slow model for English only: "cross-encoder/stsb-roberta-large"
|
||||
- Large model for German only: "deepset/gbert-large-sts"
|
||||
- `debug`: When True, a record of each sample and its evaluation will be stored in EvalAnswers.log
|
||||
|
||||
<a name="eval.EvalAnswers.run"></a>
|
||||
#### run
|
||||
|
||||
```python
|
||||
| run(labels, answers, **kwargs)
|
||||
| run(labels: List[Label], answers: List[dict], correct_retrieval: bool)
|
||||
```
|
||||
|
||||
Run this node on one sample and its labels
|
||||
@ -92,3 +103,24 @@ Run this node on one sample and its labels
|
||||
|
||||
Print the evaluation results
|
||||
|
||||
<a name="eval.semantic_answer_similarity"></a>
|
||||
#### semantic\_answer\_similarity
|
||||
|
||||
```python
|
||||
semantic_answer_similarity(predictions: List[List[str]], gold_labels: List[List[str]], sas_model_name_or_path: str = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2") -> Tuple[List[float],List[float]]
|
||||
```
|
||||
|
||||
Computes Transformer-based similarity of predicted answer to gold labels to derive a more meaningful metric than EM or F1.
|
||||
Returns per QA pair a) the similarity of the most likely prediction (top 1) to all available gold labels
|
||||
b) the highest similarity of all predictions to gold labels
|
||||
|
||||
**Arguments**:
|
||||
|
||||
- `predictions`: Predicted answers as list of multiple preds per question
|
||||
- `gold_labels`: Labels as list of multiple possible answers per question
|
||||
- `sas_model_name_or_path`: SentenceTransformers semantic textual similarity model, should be path or string
|
||||
pointing to downloadable models.
|
||||
|
||||
|
||||
:return top_1_sas, top_k_sas
|
||||
|
||||
|
||||
@ -77,6 +77,15 @@ class FileTypeClassifier(BaseComponent)
|
||||
|
||||
Route files in an Indexing Pipeline to corresponding file converters.
|
||||
|
||||
<a name="base.FileTypeClassifier.run"></a>
|
||||
#### run
|
||||
|
||||
```python
|
||||
| run(file_paths: Union[Path, List[Path]])
|
||||
```
|
||||
|
||||
Return the output based on file extension
|
||||
|
||||
<a name="txt"></a>
|
||||
# Module txt
|
||||
|
||||
@ -272,3 +281,60 @@ Extract text from a .pdf file using the pdftotext library (https://www.xpdfreade
|
||||
"xef\xac\x81c" (see test cases). That's why we keep "Latin 1" as default here.
|
||||
(See list of available encodings by running `pdftotext -listencodings` in the terminal)
|
||||
|
||||
<a name="pdf.PDFToTextOCRConverter"></a>
|
||||
## PDFToTextOCRConverter Objects
|
||||
|
||||
```python
|
||||
class PDFToTextOCRConverter(BaseConverter)
|
||||
```
|
||||
|
||||
<a name="pdf.PDFToTextOCRConverter.__init__"></a>
|
||||
#### \_\_init\_\_
|
||||
|
||||
```python
|
||||
| __init__(remove_numeric_tables: bool = False, valid_languages: Optional[List[str]] = ["eng"])
|
||||
```
|
||||
|
||||
Extract text from image file using the pytesseract library (https://github.com/madmaze/pytesseract)
|
||||
|
||||
**Arguments**:
|
||||
|
||||
- `remove_numeric_tables`: This option uses heuristics to remove numeric rows from the tables.
|
||||
The tabular structures in documents might be noise for the reader model if it
|
||||
does not have table parsing capability for finding answers. However, tables
|
||||
may also have long strings that could be possible candidates for searching answers.
|
||||
The rows containing strings are thus retained in this option.
|
||||
- `valid_languages`: validate languages from a list of languages supported by tesseract
|
||||
(https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html).
|
||||
This option can be used to add test for encoding errors. If the extracted text is
|
||||
not one of the valid languages, then it might likely be encoding error resulting
|
||||
in garbled text.
|
||||
|
||||
<a name="pdf.PDFToTextOCRConverter.convert"></a>
|
||||
#### convert
|
||||
|
||||
```python
|
||||
| convert(file_path: Path, meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = "utf-8") -> Dict[str, Any]
|
||||
```
|
||||
|
||||
Convert a file to a dictionary containing the text and any associated meta data.
|
||||
|
||||
File converters may extract file meta like name or size. In addition to it, user
|
||||
supplied meta data like author, url, external IDs can be supplied as a dictionary.
|
||||
|
||||
**Arguments**:
|
||||
|
||||
- `file_path`: path of the file to convert
|
||||
- `meta`: dictionary of meta data key-value pairs to append in the returned document.
|
||||
- `remove_numeric_tables`: This option uses heuristics to remove numeric rows from the tables.
|
||||
The tabular structures in documents might be noise for the reader model if it
|
||||
does not have table parsing capability for finding answers. However, tables
|
||||
may also have long strings that could be possible candidates for searching answers.
|
||||
The rows containing strings are thus retained in this option.
|
||||
- `valid_languages`: validate languages from a list of languages specified in the ISO 639-1
|
||||
(https://en.wikipedia.org/wiki/ISO_639-1) format.
|
||||
This option can be used to add test for encoding errors. If the extracted text is
|
||||
not one of the valid languages, then it might likely be encoding error resulting
|
||||
in garbled text.
|
||||
- `encoding`: Select the file encoding (default is `utf-8`)
|
||||
|
||||
|
||||
@ -96,7 +96,7 @@ See https://huggingface.co/transformers/model_doc/rag.html for more details
|
||||
'facebook/rag-token-nq', 'facebook/rag-sequence-nq'.
|
||||
See https://huggingface.co/models for full list of available models.
|
||||
- `model_version`: The version of model to use from the HuggingFace model hub. Can be tag name, branch name, or commit hash.
|
||||
- `retriever`: `DensePassageRetriever` used to embedded passage
|
||||
- `retriever`: `DensePassageRetriever` used to embed passages for the docs passed to `predict()`. This is optional and is only needed if the docs you pass don't already contain embeddings in `Document.embedding`.
|
||||
- `generator_type`: Which RAG generator implementation to use? RAG-TOKEN or RAG-SEQUENCE
|
||||
- `top_k`: Number of independently generated texts to return
|
||||
- `max_length`: Maximum length of generated text
|
||||
|
||||
@ -1,11 +1,69 @@
|
||||
<a name="pipeline"></a>
|
||||
# Module pipeline
|
||||
|
||||
<a name="pipeline.BasePipeline"></a>
|
||||
## BasePipeline Objects
|
||||
|
||||
```python
|
||||
class BasePipeline()
|
||||
```
|
||||
|
||||
<a name="pipeline.BasePipeline.load_from_yaml"></a>
|
||||
#### load\_from\_yaml
|
||||
|
||||
```python
|
||||
| @classmethod
|
||||
| load_from_yaml(cls, path: Path, pipeline_name: Optional[str] = None, overwrite_with_env_variables: bool = True)
|
||||
```
|
||||
|
||||
Load Pipeline from a YAML file defining the individual components and how they're tied together to form
|
||||
a Pipeline. A single YAML can declare multiple Pipelines, in which case an explicit `pipeline_name` must
|
||||
be passed.
|
||||
|
||||
Here's a sample configuration:
|
||||
|
||||
```yaml
|
||||
| version: '0.8'
|
||||
|
|
||||
| components: # define all the building-blocks for Pipeline
|
||||
| - name: MyReader # custom-name for the component; helpful for visualization & debugging
|
||||
| type: FARMReader # Haystack Class name for the component
|
||||
| params:
|
||||
| no_ans_boost: -10
|
||||
| model_name_or_path: deepset/roberta-base-squad2
|
||||
| - name: MyESRetriever
|
||||
| type: ElasticsearchRetriever
|
||||
| params:
|
||||
| document_store: MyDocumentStore # params can reference other components defined in the YAML
|
||||
| custom_query: null
|
||||
| - name: MyDocumentStore
|
||||
| type: ElasticsearchDocumentStore
|
||||
| params:
|
||||
| index: haystack_test
|
||||
|
|
||||
| pipelines: # multiple Pipelines can be defined using the components from above
|
||||
| - name: my_query_pipeline # a simple extractive-qa Pipeline
|
||||
| nodes:
|
||||
| - name: MyESRetriever
|
||||
| inputs: [Query]
|
||||
| - name: MyReader
|
||||
| inputs: [MyESRetriever]
|
||||
```
|
||||
|
||||
**Arguments**:
|
||||
|
||||
- `path`: path of the YAML file.
|
||||
- `pipeline_name`: if the YAML contains multiple pipelines, the pipeline_name to load must be set.
|
||||
- `overwrite_with_env_variables`: Overwrite the YAML configuration with environment variables. For example,
|
||||
to change index name param for an ElasticsearchDocumentStore, an env
|
||||
variable 'MYDOCSTORE_PARAMS_INDEX=documents-2021' can be set. Note that an
|
||||
`_` sign must be used to specify nested hierarchical properties.
|
||||
|
||||
<a name="pipeline.Pipeline"></a>
|
||||
## Pipeline Objects
|
||||
|
||||
```python
|
||||
class Pipeline()
|
||||
class Pipeline(BasePipeline)
|
||||
```
|
||||
|
||||
Pipeline brings together building blocks to build a complex search pipeline with Haystack & user-defined components.
|
||||
@ -63,6 +121,37 @@ Set the component for a node in the Pipeline.
|
||||
- `name`: The name of the node.
|
||||
- `component`: The component object to be set at the node.
|
||||
|
||||
<a name="pipeline.Pipeline.get_nodes_by_class"></a>
|
||||
#### get\_nodes\_by\_class
|
||||
|
||||
```python
|
||||
| get_nodes_by_class(class_type) -> List[Any]
|
||||
```
|
||||
|
||||
Gets all nodes in the pipeline that are an instance of a certain class (incl. subclasses).
|
||||
This is for example helpful if you loaded a pipeline and then want to interact directly with the document store.
|
||||
Example:
|
||||
| from haystack.document_store.base import BaseDocumentStore
|
||||
| INDEXING_PIPELINE = Pipeline.load_from_yaml(Path(PIPELINE_YAML_PATH), pipeline_name=INDEXING_PIPELINE_NAME)
|
||||
| res = INDEXING_PIPELINE.get_nodes_by_class(class_type=BaseDocumentStore)
|
||||
|
||||
**Returns**:
|
||||
|
||||
List of components that are an instance of the requested class
|
||||
|
||||
<a name="pipeline.Pipeline.get_document_store"></a>
|
||||
#### get\_document\_store
|
||||
|
||||
```python
|
||||
| get_document_store() -> Optional[BaseDocumentStore]
|
||||
```
|
||||
|
||||
Return the document store object used in the current pipeline.
|
||||
|
||||
**Returns**:
|
||||
|
||||
Instance of DocumentStore or None
|
||||
|
||||
<a name="pipeline.Pipeline.draw"></a>
|
||||
#### draw
|
||||
|
||||
@ -231,6 +320,19 @@ Initialize a Pipeline for Extractive Question Answering.
|
||||
- `reader`: Reader instance
|
||||
- `retriever`: Retriever instance
|
||||
|
||||
<a name="pipeline.ExtractiveQAPipeline.run"></a>
|
||||
#### run
|
||||
|
||||
```python
|
||||
| run(query: str, params: Optional[dict] = None)
|
||||
```
|
||||
|
||||
**Arguments**:
|
||||
|
||||
- `query`: the query string.
|
||||
- `params`: params for the `retriever` and `reader`. For instance,
|
||||
params={"retriever": {"top_k": 10}, "reader": {"top_k": 5}}
|
||||
|
||||
<a name="pipeline.DocumentSearchPipeline"></a>
|
||||
## DocumentSearchPipeline Objects
|
||||
|
||||
@ -251,6 +353,18 @@ Initialize a Pipeline for semantic document search.
|
||||
|
||||
- `retriever`: Retriever instance
|
||||
|
||||
<a name="pipeline.DocumentSearchPipeline.run"></a>
|
||||
#### run
|
||||
|
||||
```python
|
||||
| run(query: str, params: Optional[dict] = None)
|
||||
```
|
||||
|
||||
**Arguments**:
|
||||
|
||||
- `query`: the query string.
|
||||
- `params`: params for the `retriever`. For instance, params={"retriever": {"top_k": 10}}
|
||||
|
||||
<a name="pipeline.GenerativeQAPipeline"></a>
|
||||
## GenerativeQAPipeline Objects
|
||||
|
||||
@ -272,6 +386,19 @@ Initialize a Pipeline for Generative Question Answering.
|
||||
- `generator`: Generator instance
|
||||
- `retriever`: Retriever instance
|
||||
|
||||
<a name="pipeline.GenerativeQAPipeline.run"></a>
|
||||
#### run
|
||||
|
||||
```python
|
||||
| run(query: str, params: Optional[dict] = None)
|
||||
```
|
||||
|
||||
**Arguments**:
|
||||
|
||||
- `query`: the query string.
|
||||
- `params`: params for the `retriever` and `generator`. For instance,
|
||||
params={"retriever": {"top_k": 10}, "generator": {"top_k": 5}}
|
||||
|
||||
<a name="pipeline.SearchSummarizationPipeline"></a>
|
||||
## SearchSummarizationPipeline Objects
|
||||
|
||||
@ -283,7 +410,7 @@ class SearchSummarizationPipeline(BaseStandardPipeline)
|
||||
#### \_\_init\_\_
|
||||
|
||||
```python
|
||||
| __init__(summarizer: BaseSummarizer, retriever: BaseRetriever)
|
||||
| __init__(summarizer: BaseSummarizer, retriever: BaseRetriever, return_in_answer_format: bool = False)
|
||||
```
|
||||
|
||||
Initialize a Pipeline that retrieves documents for a query and then summarizes those documents.
|
||||
@ -292,23 +419,22 @@ Initialize a Pipeline that retrieves documents for a query and then summarizes t
|
||||
|
||||
- `summarizer`: Summarizer instance
|
||||
- `retriever`: Retriever instance
|
||||
- `return_in_answer_format`: Whether the results should be returned as documents (False) or in the answer
|
||||
format used in other QA pipelines (True). With the latter, you can use this
|
||||
pipeline as a "drop-in replacement" for other QA pipelines.
|
||||
|
||||
<a name="pipeline.SearchSummarizationPipeline.run"></a>
|
||||
#### run
|
||||
|
||||
```python
|
||||
| run(query: str, filters: Optional[Dict] = None, top_k_retriever: Optional[int] = None, generate_single_summary: Optional[bool] = None, return_in_answer_format: bool = False)
|
||||
| run(query: str, params: Optional[dict] = None)
|
||||
```
|
||||
|
||||
**Arguments**:
|
||||
|
||||
- `query`: Your search query
|
||||
- `filters`:
|
||||
- `top_k_retriever`: Number of top docs the retriever should pass to the summarizer.
|
||||
The higher this value, the slower your pipeline.
|
||||
- `generate_single_summary`: Whether to generate single summary from all retrieved docs (True) or one per doc (False).
|
||||
- `return_in_answer_format`: Whether the results should be returned as documents (False) or in the answer format used in other QA pipelines (True).
|
||||
With the latter, you can use this pipeline as a "drop-in replacement" for other QA pipelines.
|
||||
- `query`: the query string.
|
||||
- `params`: params for the `retriever` and `summarizer`. For instance,
|
||||
params={"retriever": {"top_k": 10}, "summarizer": {"generate_single_summary": True}}
|
||||
|
||||
<a name="pipeline.FAQPipeline"></a>
|
||||
## FAQPipeline Objects
|
||||
@ -330,6 +456,18 @@ Initialize a Pipeline for finding similar FAQs using semantic document search.
|
||||
|
||||
- `retriever`: Retriever instance
|
||||
|
||||
<a name="pipeline.FAQPipeline.run"></a>
|
||||
#### run
|
||||
|
||||
```python
|
||||
| run(query: str, params: Optional[dict] = None)
|
||||
```
|
||||
|
||||
**Arguments**:
|
||||
|
||||
- `query`: the query string.
|
||||
- `params`: params for the `retriever`. For instance, params={"retriever": {"top_k": 10}}
|
||||
|
||||
<a name="pipeline.TranslationWrapperPipeline"></a>
|
||||
## TranslationWrapperPipeline Objects
|
||||
|
||||
@ -356,6 +494,45 @@ Wrap a given `pipeline` with the `input_translator` and `output_translator`.
|
||||
- `pipeline`: The pipeline object (e.g. ExtractiveQAPipeline) you want to "wrap".
|
||||
Note that pipelines with split or merge nodes are currently not supported.
|
||||
|
||||
<a name="pipeline.QuestionGenerationPipeline"></a>
|
||||
## QuestionGenerationPipeline Objects
|
||||
|
||||
```python
|
||||
class QuestionGenerationPipeline(BaseStandardPipeline)
|
||||
```
|
||||
|
||||
A simple pipeline that takes documents as input and generates
|
||||
questions that it thinks can be answered by the documents.
|
||||
|
||||
<a name="pipeline.RetrieverQuestionGenerationPipeline"></a>
|
||||
## RetrieverQuestionGenerationPipeline Objects
|
||||
|
||||
```python
|
||||
class RetrieverQuestionGenerationPipeline(BaseStandardPipeline)
|
||||
```
|
||||
|
||||
A simple pipeline that takes a query as input, performs retrieval, and then generates
|
||||
questions that it thinks can be answered by the retrieved documents.
|
||||
|
||||
<a name="pipeline.QuestionAnswerGenerationPipeline"></a>
|
||||
## QuestionAnswerGenerationPipeline Objects
|
||||
|
||||
```python
|
||||
class QuestionAnswerGenerationPipeline(BaseStandardPipeline)
|
||||
```
|
||||
|
||||
This is a pipeline which takes a document as input, generates questions that the model thinks can be answered by
|
||||
this document, and then performs question answering on these questions using that single document.
|
||||
|
||||
<a name="pipeline.RootNode"></a>
|
||||
## RootNode Objects
|
||||
|
||||
```python
|
||||
class RootNode(BaseComponent)
|
||||
```
|
||||
|
||||
RootNode feeds inputs together with corresponding params to a Pipeline.
|
||||
|
||||
<a name="pipeline.SklearnQueryClassifier"></a>
|
||||
## SklearnQueryClassifier Objects
|
||||
|
||||
@ -387,21 +564,21 @@ and the further processing can be customized. You can define this by connecting
|
||||
|
||||
Pass your own `Sklearn` binary classification model or use one of the following pretrained ones:
|
||||
1) Keywords vs. Questions/Statements (Default)
|
||||
query_classifier can be found [here](https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost\_query\_classifier/model.pickle)
|
||||
query_vectorizer can be found [here](https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost\_query\_classifier/vectorizer.pickle)
|
||||
query_classifier can be found [here](https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier/model.pickle)
|
||||
query_vectorizer can be found [here](https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier/vectorizer.pickle)
|
||||
output_1 => question/statement
|
||||
output_2 => keyword query
|
||||
[Readme](https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost\_query\_classifier/readme.txt)
|
||||
[Readme](https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier/readme.txt)
|
||||
|
||||
|
||||
2) Questions vs. Statements
|
||||
query_classifier can be found [here](https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost\_query\_classifier\_statements/model.pickle)
|
||||
query_vectorizer can be found [here](https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost\_query\_classifier\_statements/vectorizer.pickle)
|
||||
query_classifier can be found [here](https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier_statements/model.pickle)
|
||||
query_vectorizer can be found [here](https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier_statements/vectorizer.pickle)
|
||||
output_1 => question
|
||||
output_2 => statement
|
||||
[Readme](https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost\_query\_classifier\_statements/readme.txt)
|
||||
[Readme](https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier_statements/readme.txt)
|
||||
|
||||
See also the [tutorial](https://haystack.deepset.ai/docs/latest/tutorial11md) on pipelines.
|
||||
See also the [tutorial](https://haystack.deepset.ai/tutorials/pipelines) on pipelines.
|
||||
|
||||
<a name="pipeline.SklearnQueryClassifier.__init__"></a>
|
||||
#### \_\_init\_\_
|
||||
@ -409,9 +586,9 @@ and the further processing can be customized. You can define this by connecting
|
||||
```python
|
||||
| __init__(model_name_or_path: Union[
|
||||
| str, Any
|
||||
| ] = "https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost\_query\_classifier/model.pickle", vectorizer_name_or_path: Union[
|
||||
| ] = "https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier/model.pickle", vectorizer_name_or_path: Union[
|
||||
| str, Any
|
||||
| ] = "https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost\_query\_classifier/vectorizer.pickle")
|
||||
| ] = "https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier/vectorizer.pickle")
|
||||
```
|
||||
|
||||
**Arguments**:
|
||||
@ -454,16 +631,16 @@ and the further processing can be customized. You can define this by connecting
|
||||
model_name_or_path="shahrukhx01/bert-mini-finetune-question-detection"
|
||||
output_1 => question/statement
|
||||
output_2 => keyword query
|
||||
[Readme](https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost\_query\_classifier/readme.txt)
|
||||
[Readme](https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier/readme.txt)
|
||||
|
||||
|
||||
2) Questions vs. Statements
|
||||
`model_name_or_path`="shahrukhx01/question-vs-statement-classifier"
|
||||
output_1 => question
|
||||
output_2 => statement
|
||||
[Readme](https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost\_query\_classifier\_statements/readme.txt)
|
||||
[Readme](https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier_statements/readme.txt)
|
||||
|
||||
See also the [tutorial](https://haystack.deepset.ai/docs/latest/tutorial11md) on pipelines.
|
||||
See also the [tutorial](https://haystack.deepset.ai/tutorials/pipelines) on pipelines.
|
||||
|
||||
<a name="pipeline.TransformersQueryClassifier.__init__"></a>
|
||||
#### \_\_init\_\_
|
||||
@ -508,3 +685,175 @@ The node allows multiple join modes:
|
||||
to each retriever score. This param is not compatible with the `concatenate` join_mode.
|
||||
- `top_k_join`: Limit documents to top_k based on the resulting scores of the join.
|
||||
|
||||
<a name="pipeline.RayPipeline"></a>
|
||||
## RayPipeline Objects
|
||||
|
||||
```python
|
||||
class RayPipeline(Pipeline)
|
||||
```
|
||||
|
||||
Ray (https://ray.io) is a framework for distributed computing.
|
||||
|
||||
Ray allows distributing a Pipeline's components across a cluster of machines. The individual components of a
|
||||
Pipeline can be independently scaled. For instance, an extractive QA Pipeline deployment can have three replicas
|
||||
of the Reader and a single replica for the Retriever. It enables efficient resource utilization by horizontally
|
||||
scaling Components.
|
||||
|
||||
To set the number of replicas, add `replicas` in the YAML config for the node in a pipeline:
|
||||
|
||||
```yaml
|
||||
| components:
|
||||
| ...
|
||||
|
|
||||
| pipelines:
|
||||
| - name: ray_query_pipeline
|
||||
| type: RayPipeline
|
||||
| nodes:
|
||||
| - name: ESRetriever
|
||||
| replicas: 2 # number of replicas to create on the Ray cluster
|
||||
| inputs: [ Query ]
|
||||
```
|
||||
|
||||
A RayPipeline can only be created with a YAML Pipeline config.
|
||||
>>> from haystack.pipeline import RayPipeline
|
||||
>>> pipeline = RayPipeline.load_from_yaml(path="my_pipelines.yaml", pipeline_name="my_query_pipeline")
|
||||
>>> pipeline.run(query="What is the capital of Germany?")
|
||||
|
||||
By default, a RayPipeline creates an instance of Ray Serve locally. To connect to an existing Ray instance,
|
||||
set the `address` parameter when creating the RayPipeline instance.
|
||||
|
||||
<a name="pipeline.RayPipeline.__init__"></a>
|
||||
#### \_\_init\_\_
|
||||
|
||||
```python
|
||||
| __init__(address: str = None, **kwargs)
|
||||
```
|
||||
|
||||
**Arguments**:
|
||||
|
||||
- `address`: The IP address for the Ray cluster. If set to None, a local Ray instance is started.
|
||||
- `kwargs`: Optional parameters for initializing Ray.
|
||||
|
||||
<a name="pipeline.RayPipeline.load_from_yaml"></a>
|
||||
#### load\_from\_yaml
|
||||
|
||||
```python
|
||||
| @classmethod
|
||||
| load_from_yaml(cls, path: Path, pipeline_name: Optional[str] = None, overwrite_with_env_variables: bool = True, address: Optional[str] = None, **kwargs)
|
||||
```
|
||||
|
||||
Load Pipeline from a YAML file defining the individual components and how they're tied together to form
|
||||
a Pipeline. A single YAML can declare multiple Pipelines, in which case an explicit `pipeline_name` must
|
||||
be passed.
|
||||
|
||||
Here's a sample configuration:
|
||||
|
||||
```yaml
|
||||
| version: '0.8'
|
||||
|
|
||||
| components: # define all the building-blocks for Pipeline
|
||||
| - name: MyReader # custom-name for the component; helpful for visualization & debugging
|
||||
| type: FARMReader # Haystack Class name for the component
|
||||
| params:
|
||||
| no_ans_boost: -10
|
||||
| model_name_or_path: deepset/roberta-base-squad2
|
||||
| - name: MyESRetriever
|
||||
| type: ElasticsearchRetriever
|
||||
| params:
|
||||
| document_store: MyDocumentStore # params can reference other components defined in the YAML
|
||||
| custom_query: null
|
||||
| - name: MyDocumentStore
|
||||
| type: ElasticsearchDocumentStore
|
||||
| params:
|
||||
| index: haystack_test
|
||||
|
|
||||
| pipelines: # multiple Pipelines can be defined using the components from above
|
||||
| - name: my_query_pipeline # a simple extractive-qa Pipeline
|
||||
| nodes:
|
||||
| - name: MyESRetriever
|
||||
| inputs: [Query]
|
||||
| - name: MyReader
|
||||
| inputs: [MyESRetriever]
|
||||
```
|
||||
|
||||
**Arguments**:
|
||||
|
||||
- `path`: path of the YAML file.
|
||||
- `pipeline_name`: if the YAML contains multiple pipelines, the pipeline_name to load must be set.
|
||||
- `overwrite_with_env_variables`: Overwrite the YAML configuration with environment variables. For example,
|
||||
to change index name param for an ElasticsearchDocumentStore, an env
|
||||
variable 'MYDOCSTORE_PARAMS_INDEX=documents-2021' can be set. Note that an
|
||||
`_` sign must be used to specify nested hierarchical properties.
|
||||
- `address`: The IP address for the Ray cluster. If set to None, a local Ray instance is started.
|
||||
|
||||
<a name="pipeline._RayDeploymentWrapper"></a>
|
||||
## \_RayDeploymentWrapper Objects
|
||||
|
||||
```python
|
||||
class _RayDeploymentWrapper()
|
||||
```
|
||||
|
||||
Ray Serve supports calling the `__init__` methods of classes to create "deployment" instances.
|
||||
|
||||
In the case of Haystack, some Components like Retrievers have complex init methods that need objects
|
||||
like Document Stores.
|
||||
|
||||
This wrapper class encapsulates the initialization of Components. Given a Component Class
|
||||
name, it creates an instance using the YAML Pipeline config.
|
||||
|
||||
<a name="pipeline._RayDeploymentWrapper.__init__"></a>
|
||||
#### \_\_init\_\_
|
||||
|
||||
```python
|
||||
| __init__(pipeline_config: dict, component_name: str)
|
||||
```
|
||||
|
||||
Create an instance of Component.
|
||||
|
||||
**Arguments**:
|
||||
|
||||
- `pipeline_config`: Pipeline YAML parsed as a dict.
|
||||
- `component_name`: Component Class name.
|
||||
|
||||
<a name="pipeline._RayDeploymentWrapper.__call__"></a>
|
||||
#### \_\_call\_\_
|
||||
|
||||
```python
|
||||
| __call__(*args, **kwargs)
|
||||
```
|
||||
|
||||
Ray calls this method which is then re-directed to the corresponding component's run().
|
||||
|
||||
<a name="pipeline.MostSimilarDocumentsPipeline"></a>
|
||||
## MostSimilarDocumentsPipeline Objects
|
||||
|
||||
```python
|
||||
class MostSimilarDocumentsPipeline(BaseStandardPipeline)
|
||||
```
|
||||
|
||||
<a name="pipeline.MostSimilarDocumentsPipeline.__init__"></a>
|
||||
#### \_\_init\_\_
|
||||
|
||||
```python
|
||||
| __init__(document_store: BaseDocumentStore)
|
||||
```
|
||||
|
||||
Initialize a Pipeline for finding the most similar documents to a given document.
|
||||
This pipeline can be helpful if you already show a relevant document to your end users and they want to search for just similar ones.
|
||||
|
||||
**Arguments**:
|
||||
|
||||
- `document_store`: Document Store instance with already stored embeddings.
|
||||
|
||||
<a name="pipeline.MostSimilarDocumentsPipeline.run"></a>
|
||||
#### run
|
||||
|
||||
```python
|
||||
| run(document_ids: List[str], top_k: int = 5)
|
||||
```
|
||||
|
||||
**Arguments**:
|
||||
|
||||
- `document_ids`: document ids
|
||||
- `top_k`: How many similar documents to return for each given document id
|
||||
|
||||
|
||||
@ -12,7 +12,7 @@ class BasePreProcessor(BaseComponent)
|
||||
#### process
|
||||
|
||||
```python
|
||||
| process(document: dict, clean_whitespace: Optional[bool] = True, clean_header_footer: Optional[bool] = False, clean_empty_lines: Optional[bool] = True, split_by: Optional[str] = "word", split_length: Optional[int] = 1000, split_overlap: Optional[int] = None, split_respect_sentence_boundary: Optional[bool] = True) -> List[dict]
|
||||
| process(documents: Union[dict, List[dict]], clean_whitespace: Optional[bool] = True, clean_header_footer: Optional[bool] = False, clean_empty_lines: Optional[bool] = True, split_by: Optional[str] = "word", split_length: Optional[int] = 1000, split_overlap: Optional[int] = None, split_respect_sentence_boundary: Optional[bool] = True) -> List[dict]
|
||||
```
|
||||
|
||||
Perform document cleaning and splitting. Can take a single document or a list of documents as input and returns a list of documents.
|
||||
@ -31,7 +31,7 @@ class PreProcessor(BasePreProcessor)
|
||||
#### \_\_init\_\_
|
||||
|
||||
```python
|
||||
| __init__(clean_whitespace: bool = True, clean_header_footer: bool = False, clean_empty_lines: bool = True, split_by: str = "word", split_length: int = 1000, split_overlap: int = 0, split_respect_sentence_boundary: bool = True)
|
||||
| __init__(clean_whitespace: bool = True, clean_header_footer: bool = False, clean_empty_lines: bool = True, split_by: str = "word", split_length: int = 1000, split_overlap: int = 0, split_respect_sentence_boundary: bool = True, language: str = "en")
|
||||
```
|
||||
|
||||
**Arguments**:
|
||||
@ -54,15 +54,16 @@ class PreProcessor(BasePreProcessor)
|
||||
- `split_respect_sentence_boundary`: Whether to split in partial sentences if split_by -> `word`. If set
|
||||
to True, the individual split will always have complete sentences &
|
||||
the number of words will be <= split_length.
|
||||
- `language`: The language used by "nltk.tokenize.sent_tokenize" in iso639 format. Available options: "en", "es", "de", "fr" & many more.
|
||||
|
||||
<a name="preprocessor.PreProcessor.process"></a>
|
||||
#### process
|
||||
|
||||
```python
|
||||
| process(document: dict, clean_whitespace: Optional[bool] = None, clean_header_footer: Optional[bool] = None, clean_empty_lines: Optional[bool] = None, split_by: Optional[str] = None, split_length: Optional[int] = None, split_overlap: Optional[int] = None, split_respect_sentence_boundary: Optional[bool] = None) -> List[dict]
|
||||
| process(documents: Union[dict, List[dict]], clean_whitespace: Optional[bool] = None, clean_header_footer: Optional[bool] = None, clean_empty_lines: Optional[bool] = None, split_by: Optional[str] = None, split_length: Optional[int] = None, split_overlap: Optional[int] = None, split_respect_sentence_boundary: Optional[bool] = None) -> List[dict]
|
||||
```
|
||||
|
||||
Perform document cleaning and splitting. Takes a single document as input and returns a list of documents.
|
||||
Perform document cleaning and splitting. Can take a single document or a list of documents as input and returns a list of documents.
|
||||
|
||||
<a name="preprocessor.PreProcessor.clean"></a>
|
||||
#### clean
|
||||
|
||||
@ -62,13 +62,25 @@ class FARMRanker(BaseRanker)
|
||||
```
|
||||
|
||||
Transformer based model for Document Re-ranking using the TextPairClassifier of FARM framework (https://github.com/deepset-ai/FARM).
|
||||
Re-Ranking can be used on top of a retriever to boost the performance for document search. This is particularly useful if the retriever has a high recall but is bad at sorting the documents by relevance.
|
||||
While the underlying model can vary (BERT, Roberta, DistilBERT, ...), the interface remains the same.
|
||||
FARMRanker handles Cross-Encoder models that internally use two logits and output the classifier's probability of label "1" as similarity score.
|
||||
This includes TextPairClassification models trained within FARM.
|
||||
In contrast, SentenceTransformersRanker handles Cross-Encoder models that use a single logit as similarity score.
|
||||
https://www.sbert.net/docs/pretrained-models/ce-msmarco.html#usage-with-transformers
|
||||
|
||||
With a FARMRanker, you can:
|
||||
|
||||
- directly get predictions via predict()
|
||||
- fine-tune the model on TextPair data via train()
|
||||
|
||||
Usage example:
|
||||
...
|
||||
retriever = ElasticsearchRetriever(document_store=document_store)
|
||||
ranker = FARMRanker(model_name_or_path="deepset/gbert-base-germandpr-reranking")
|
||||
p = Pipeline()
|
||||
p.add_node(component=retriever, name="ESRetriever", inputs=["Query"])
|
||||
p.add_node(component=ranker, name="Ranker", inputs=["ESRetriever"])
|
||||
|
||||
<a name="farm.FARMRanker.__init__"></a>
|
||||
#### \_\_init\_\_
|
||||
|
||||
@ -108,7 +120,7 @@ Fine-tune a model on a TextPairClassification dataset. Options:
|
||||
|
||||
**Arguments**:
|
||||
|
||||
- `data_dir`: Path to directory containing your training data in SQuAD style
|
||||
- `data_dir`: Path to directory containing your training data
|
||||
- `train_filename`: Filename of training data
|
||||
- `dev_filename`: Filename of dev / eval data
|
||||
- `test_filename`: Filename of test data
|
||||
@ -187,7 +199,7 @@ List of dictionaries containing query and ranked list of Document
|
||||
#### predict
|
||||
|
||||
```python
|
||||
| predict(query: str, documents: List[Document], top_k: Optional[int] = None)
|
||||
| predict(query: str, documents: List[Document], top_k: Optional[int] = None) -> List[Document]
|
||||
```
|
||||
|
||||
Use loaded ranker model to re-rank the supplied list of Document.
|
||||
|
||||
@ -8,6 +8,15 @@
|
||||
class BaseReader(BaseComponent)
|
||||
```
|
||||
|
||||
<a name="base.BaseReader.run_batch"></a>
|
||||
#### run\_batch
|
||||
|
||||
```python
|
||||
| run_batch(query_doc_list: List[Dict], top_k: Optional[int] = None)
|
||||
```
|
||||
|
||||
An unoptimized implementation of running Reader queries in batch
|
||||
|
||||
<a name="base.BaseReader.timing"></a>
|
||||
#### timing
|
||||
|
||||
@ -39,7 +48,7 @@ While the underlying model can vary (BERT, Roberta, DistilBERT, ...), the interf
|
||||
#### \_\_init\_\_
|
||||
|
||||
```python
|
||||
| __init__(model_name_or_path: Union[str, Path], model_version: Optional[str] = None, context_window_size: int = 150, batch_size: int = 50, use_gpu: bool = True, no_ans_boost: float = 0.0, return_no_answer: bool = False, top_k: int = 10, top_k_per_candidate: int = 3, top_k_per_sample: int = 1, num_processes: Optional[int] = None, max_seq_len: int = 256, doc_stride: int = 128, progress_bar: bool = True, duplicate_filtering: int = 0)
|
||||
| __init__(model_name_or_path: Union[str, Path], model_version: Optional[str] = None, context_window_size: int = 150, batch_size: int = 50, use_gpu: bool = True, no_ans_boost: float = 0.0, return_no_answer: bool = False, top_k: int = 10, top_k_per_candidate: int = 3, top_k_per_sample: int = 1, num_processes: Optional[int] = None, max_seq_len: int = 256, doc_stride: int = 128, progress_bar: bool = True, duplicate_filtering: int = 0, use_confidence_scores: bool = True)
|
||||
```
|
||||
|
||||
**Arguments**:
|
||||
@ -158,7 +167,7 @@ Saves the Reader model so that it can be reused at a later point in time.
|
||||
|
||||
Use loaded QA model to find answers for a list of queries in each query's supplied list of Document.
|
||||
|
||||
Returns list of dictionaries containing answers sorted by (desc.) probability
|
||||
Returns list of dictionaries containing answers sorted by (desc.) score
|
||||
|
||||
**Arguments**:
|
||||
|
||||
@ -179,7 +188,7 @@ List of dictionaries containing query and answers
|
||||
|
||||
Use loaded QA model to find answers for a query in the supplied list of Document.
|
||||
|
||||
Returns dictionaries containing answers sorted by (desc.) probability.
|
||||
Returns dictionaries containing answers sorted by (desc.) score.
|
||||
Example:
|
||||
```python
|
||||
|{
|
||||
@ -189,8 +198,7 @@ Example:
|
||||
| 'context': " She travels with her father, Eddard, to King's Landing when he is ",
|
||||
| 'offset_answer_start': 147,
|
||||
| 'offset_answer_end': 154,
|
||||
| 'probability': 0.9787139466668613,
|
||||
| 'score': None,
|
||||
| 'score': 0.9787139466668613,
|
||||
| 'document_id': '1337'
|
||||
| },...
|
||||
| ]
|
||||
@ -211,7 +219,7 @@ Dict containing query and answers
|
||||
#### eval\_on\_file
|
||||
|
||||
```python
|
||||
| eval_on_file(data_dir: str, test_filename: str, device: str)
|
||||
| eval_on_file(data_dir: str, test_filename: str, device: Optional[str] = None)
|
||||
```
|
||||
|
||||
Performs evaluation on a SQuAD-formatted file.
|
||||
@ -226,14 +234,14 @@ Returns a dict containing the following metrics:
|
||||
:type data_dir: Path or str
|
||||
- `test_filename`: The name of the file containing the test data in SQuAD format.
|
||||
:type test_filename: str
|
||||
- `device`: The device on which the tensors should be processed. Choose from "cpu" and "cuda".
|
||||
- `device`: The device on which the tensors should be processed. Choose from "cpu" and "cuda" or use the Reader's device by default.
|
||||
:type device: str
|
||||
|
||||
<a name="farm.FARMReader.eval"></a>
|
||||
#### eval
|
||||
|
||||
```python
|
||||
| eval(document_store: BaseDocumentStore, device: str, label_index: str = "label", doc_index: str = "eval_document", label_origin: str = "gold_label", calibrate_conf_scores: bool = False)
|
||||
| eval(document_store: BaseDocumentStore, device: Optional[str] = None, label_index: str = "label", doc_index: str = "eval_document", label_origin: str = "gold_label", calibrate_conf_scores: bool = False)
|
||||
```
|
||||
|
||||
Performs evaluation on evaluation documents in the DocumentStore.
|
||||
@ -245,7 +253,7 @@ Returns a dict containing the following metrics:
|
||||
**Arguments**:
|
||||
|
||||
- `document_store`: DocumentStore containing the evaluation documents
|
||||
- `device`: The device on which the tensors should be processed. Choose from "cpu" and "cuda".
|
||||
- `device`: The device on which the tensors should be processed. Choose from "cpu" and "cuda" or use the Reader's device by default.
|
||||
- `label_index`: Index/Table name where labeled questions are stored
|
||||
- `doc_index`: Index/Table name where documents that are used for evaluation are stored
|
||||
- `label_origin`: Field name where the gold labels are stored
|
||||
@ -255,7 +263,7 @@ Returns a dict containing the following metrics:
|
||||
#### calibrate\_confidence\_scores
|
||||
|
||||
```python
|
||||
| calibrate_confidence_scores(document_store: BaseDocumentStore, device: str, label_index: str = "label", doc_index: str = "eval_document", label_origin: str = "gold_label")
|
||||
| calibrate_confidence_scores(document_store: BaseDocumentStore, device: Optional[str] = None, label_index: str = "label", doc_index: str = "eval_document", label_origin: str = "gold_label")
|
||||
```
|
||||
|
||||
Calibrates confidence scores on evaluation documents in the DocumentStore.
|
||||
@ -263,7 +271,7 @@ Calibrates confidence scores on evaluation documents in the DocumentStore.
|
||||
**Arguments**:
|
||||
|
||||
- `document_store`: DocumentStore containing the evaluation documents
|
||||
- `device`: The device on which the tensors should be processed. Choose from "cpu" and "cuda".
|
||||
- `device`: The device on which the tensors should be processed. Choose from "cpu" and "cuda" or use the Reader's device by default.
|
||||
- `label_index`: Index/Table name where labeled questions are stored
|
||||
- `doc_index`: Index/Table name where documents that are used for evaluation are stored
|
||||
- `label_origin`: Field name where the gold labels are stored
|
||||
@ -276,7 +284,7 @@ Calibrates confidence scores on evaluation documents in the DocumentStore.
|
||||
```
|
||||
|
||||
Use loaded QA model to find answers for a question in the supplied list of Document.
|
||||
Returns dictionaries containing answers sorted by (desc.) probability.
|
||||
Returns dictionaries containing answers sorted by (desc.) score.
|
||||
Example:
|
||||
```python
|
||||
|{
|
||||
@ -286,8 +294,7 @@ Example:
|
||||
| 'context': " She travels with her father, Eddard, to King's Landing when he is ",
|
||||
| 'offset_answer_start': 147,
|
||||
| 'offset_answer_end': 154,
|
||||
| 'probability': 0.9787139466668613,
|
||||
| 'score': None,
|
||||
| 'score': 0.9787139466668613,
|
||||
| 'document_id': '1337'
|
||||
| },...
|
||||
| ]
|
||||
@ -395,7 +402,7 @@ If you would like to set no_answer_boost, use a `FARMReader`.
|
||||
|
||||
Use loaded QA model to find answers for a query in the supplied list of Document.
|
||||
|
||||
Returns dictionaries containing answers sorted by (desc.) probability.
|
||||
Returns dictionaries containing answers sorted by (desc.) score.
|
||||
Example:
|
||||
|
||||
```python
|
||||
@ -406,8 +413,7 @@ Example:
|
||||
| 'context': " She travels with her father, Eddard, to King's Landing when he is ",
|
||||
| 'offset_answer_start': 147,
|
||||
| 'offset_answer_end': 154,
|
||||
| 'probability': 0.9787139466668613,
|
||||
| 'score': None,
|
||||
| 'score': 0.9787139466668613,
|
||||
| 'document_id': '1337'
|
||||
| },...
|
||||
| ]
|
||||
|
||||
@ -238,7 +238,7 @@ Karpukhin, Vladimir, et al. (2020): "Dense Passage Retrieval for Open-Domain Que
|
||||
#### \_\_init\_\_
|
||||
|
||||
```python
|
||||
| __init__(document_store: BaseDocumentStore, query_embedding_model: Union[Path, str] = "facebook/dpr-question_encoder-single-nq-base", passage_embedding_model: Union[Path, str] = "facebook/dpr-ctx_encoder-single-nq-base", model_version: Optional[str] = None, max_seq_len_query: int = 64, max_seq_len_passage: int = 256, top_k: int = 10, use_gpu: bool = True, batch_size: int = 16, embed_title: bool = True, use_fast_tokenizers: bool = True, infer_tokenizer_classes: bool = False, similarity_function: str = "dot_product", progress_bar: bool = True)
|
||||
| __init__(document_store: BaseDocumentStore, query_embedding_model: Union[Path, str] = "facebook/dpr-question_encoder-single-nq-base", passage_embedding_model: Union[Path, str] = "facebook/dpr-ctx_encoder-single-nq-base", model_version: Optional[str] = None, max_seq_len_query: int = 64, max_seq_len_passage: int = 256, top_k: int = 10, use_gpu: bool = True, batch_size: int = 16, embed_title: bool = True, use_fast_tokenizers: bool = True, infer_tokenizer_classes: bool = False, similarity_function: str = "dot_product", global_loss_buffer_size: int = 150000, progress_bar: bool = True, devices: Optional[List[Union[int, str, torch.device]]] = None)
|
||||
```
|
||||
|
||||
Init the Retriever incl. the two encoder models from a local or remote model checkpoint.
|
||||
@ -270,8 +270,8 @@ The checkpoint format matches huggingface transformers' model format
|
||||
- `max_seq_len_query`: Longest length of each query sequence. Maximum number of tokens for the query text. Longer ones will be cut down.
|
||||
- `max_seq_len_passage`: Longest length of each passage/context sequence. Maximum number of tokens for the passage text. Longer ones will be cut down.
|
||||
- `top_k`: How many documents to return per query.
|
||||
- `use_gpu`: Whether to use gpu or not
|
||||
- `batch_size`: Number of questions or passages to encode at once
|
||||
- `use_gpu`: Whether to use all available GPUs or the CPU. Falls back on CPU if no GPU is available.
|
||||
- `batch_size`: Number of questions or passages to encode at once. In case of multiple gpus, this will be the total batch size.
|
||||
- `embed_title`: Whether to concatenate title and passage to a text pair that is then used to create the embedding.
|
||||
This is the approach used in the original paper and is likely to improve performance if your
|
||||
titles contain meaningful information for retrieval (topic, entities, etc.).
|
||||
@ -283,8 +283,12 @@ The checkpoint format matches huggingface transformers' model format
|
||||
If `False`, the class always loads `DPRQuestionEncoderTokenizer` and `DPRContextEncoderTokenizer`.
|
||||
- `similarity_function`: Which function to apply for calculating the similarity of query and passage embeddings during training.
|
||||
Options: `dot_product` (Default) or `cosine`
|
||||
- `global_loss_buffer_size`: Buffer size for all_gather() in DDP.
|
||||
Increase if errors like "encoded data exceeds max_size ..." come up
|
||||
- `progress_bar`: Whether to show a tqdm progress bar or not.
|
||||
Can be helpful to disable in production deployments to keep the logs clean.
|
||||
- `devices`: List of GPU devices to limit inference to certain GPUs and not use all available ones (e.g. ["cuda:0"]).
|
||||
As multi-GPU training is currently not implemented for DPR, training will only use the first device provided in this list.
|
||||
|
||||
<a name="dense.DensePassageRetriever.retrieve"></a>
|
||||
#### retrieve
|
||||
|
||||
@ -110,7 +110,7 @@ https://huggingface.co/models?filter=summarization
|
||||
#### predict
|
||||
|
||||
```python
|
||||
| predict(documents: List[Document], generate_single_summary: Optional[bool] = None) -> List[Document]
|
||||
| predict(documents: List[Document], generate_single_summary: Optional[bool] = None, truncation: bool = True) -> List[Document]
|
||||
```
|
||||
|
||||
Produce the summarization from the supplied documents.
|
||||
@ -123,6 +123,7 @@ These document can for example be retrieved via the Retriever.
|
||||
If set to "True", all docs will be joined to a single string that will then
|
||||
be summarized.
|
||||
Important: The summary will depend on the order of the supplied documents!
|
||||
- `truncation`: Truncate to a maximum length accepted by the model
|
||||
|
||||
**Returns**:
|
||||
|
||||
|
||||
@ -15,7 +15,7 @@ Abstract class for a Translator component that translates either a query or a do
|
||||
|
||||
```python
|
||||
| @abstractmethod
|
||||
| translate(query: Optional[str] = None, documents: Optional[Union[List[Document], List[str], List[Dict[str, Any]]]] = None, dict_key: Optional[str] = None, **kwargs) -> Union[str, List[Document], List[str], List[Dict[str, Any]]]
|
||||
| translate(query: Optional[str] = None, documents: Optional[Union[List[Document], List[str], List[Dict[str, Any]]]] = None, dict_key: Optional[str] = None) -> Union[str, List[Document], List[str], List[Dict[str, Any]]]
|
||||
```
|
||||
|
||||
Translate the passed query or a list of documents from language A to B.
|
||||
@ -24,7 +24,7 @@ Translate the passed query or a list of documents from language A to B.
|
||||
#### run
|
||||
|
||||
```python
|
||||
| run(query: Optional[str] = None, documents: Optional[Union[List[Document], List[str], List[Dict[str, Any]]]] = None, answers: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None, dict_key: Optional[str] = None, **kwargs)
|
||||
| run(query: Optional[str] = None, documents: Optional[Union[List[Document], List[str], List[Dict[str, Any]]]] = None, answers: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None, dict_key: Optional[str] = None)
|
||||
```
|
||||
|
||||
Method that gets executed when this class is used as a Node in a Haystack Pipeline
|
||||
@ -89,7 +89,7 @@ They also have a few multilingual models that support multiple languages at once
|
||||
#### translate
|
||||
|
||||
```python
|
||||
| translate(query: Optional[str] = None, documents: Optional[Union[List[Document], List[str], List[Dict[str, Any]]]] = None, dict_key: Optional[str] = None, **kwargs) -> Union[str, List[Document], List[str], List[Dict[str, Any]]]
|
||||
| translate(query: Optional[str] = None, documents: Optional[Union[List[Document], List[str], List[Dict[str, Any]]]] = None, dict_key: Optional[str] = None) -> Union[str, List[Document], List[str], List[Dict[str, Any]]]
|
||||
```
|
||||
|
||||
Run the actual translation. You can supply a query or a list of documents. Whatever is supplied will be translated.
|
||||
|
||||
@ -224,14 +224,16 @@ pipe = ExtractiveQAPipeline(reader, retriever)
|
||||
|
||||
```python
|
||||
# You can configure how many candidates the reader and retriever shall return
|
||||
# The higher top_k_retriever, the better (but also the slower) your answers.
|
||||
prediction = pipe.run(query="Who is the father of Arya Stark?", top_k_retriever=10, top_k_reader=5)
|
||||
# The higher the top_k, the better (but also the slower) your answers.
|
||||
prediction = pipe.run(
|
||||
query="Who is the father of Arya Stark?", params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}}
|
||||
)
|
||||
```
|
||||
|
||||
|
||||
```python
|
||||
# prediction = pipe.run(query="Who created the Dothraki vocabulary?", top_k_reader=5)
|
||||
# prediction = pipe.run(query="Who is the sister of Sansa?", top_k_reader=5)
|
||||
# prediction = pipe.run(query="Who created the Dothraki vocabulary?", params={"Reader": {"top_k": 5}})
|
||||
# prediction = pipe.run(query="Who is the sister of Sansa?", params={"Reader": {"top_k": 5}})
|
||||
```
|
||||
|
||||
|
||||
|
||||
@ -112,7 +112,7 @@ from haystack.reader import FARMReader
|
||||
# Initialize DocumentStore and index documents
|
||||
launch_es()
|
||||
document_store = ElasticsearchDocumentStore()
|
||||
document_store.delete_all_documents()
|
||||
document_store.delete_documents()
|
||||
document_store.write_documents(got_dicts)
|
||||
|
||||
# Initialize Sparse retriever
|
||||
@ -138,8 +138,7 @@ from haystack.pipeline import ExtractiveQAPipeline
|
||||
p_extractive_premade = ExtractiveQAPipeline(reader=reader, retriever=es_retriever)
|
||||
res = p_extractive_premade.run(
|
||||
query="Who is the father of Arya Stark?",
|
||||
top_k_retriever=10,
|
||||
top_k_reader=5
|
||||
params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}},
|
||||
)
|
||||
print_answers(res, details="minimal")
|
||||
```
|
||||
@ -153,7 +152,7 @@ from haystack.pipeline import DocumentSearchPipeline
|
||||
p_retrieval = DocumentSearchPipeline(es_retriever)
|
||||
res = p_retrieval.run(
|
||||
query="Who is the father of Arya Stark?",
|
||||
top_k_retriever=10
|
||||
params={"Retriever": {"top_k": 10}},
|
||||
)
|
||||
print_documents(res, max_text_len=200)
|
||||
```
|
||||
@ -177,7 +176,7 @@ rag_generator = RAGenerator()
|
||||
p_generator = GenerativeQAPipeline(generator=rag_generator, retriever=dpr_retriever)
|
||||
res = p_generator.run(
|
||||
query="Who is the father of Arya Stark?",
|
||||
top_k_retriever=10
|
||||
params={"Retriever": {"top_k": 10}}
|
||||
)
|
||||
print_answers(res, details="minimal")
|
||||
|
||||
@ -223,8 +222,7 @@ p_extractive.add_node(component=reader, name="Reader", inputs=["Retriever"])
|
||||
# Now we can run it
|
||||
res = p_extractive.run(
|
||||
query="Who is the father of Arya Stark?",
|
||||
top_k_retriever=10,
|
||||
top_k_reader=5
|
||||
params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}}
|
||||
)
|
||||
print_answers(res, details="minimal")
|
||||
p_extractive.draw("pipeline_extractive.png")
|
||||
@ -255,7 +253,7 @@ p_ensemble.draw("pipeline_ensemble.png")
|
||||
# Run pipeline
|
||||
res = p_ensemble.run(
|
||||
query="Who is the father of Arya Stark?",
|
||||
top_k_retriever=5 #This is top_k per retriever
|
||||
params={"DPRRetriever": {"top_k": 5}, "ESRetriever": {"top_k": 5}}
|
||||
)
|
||||
print_answers(res, details="minimal")
|
||||
```
|
||||
@ -266,8 +264,9 @@ Nodes are relatively simple objects
|
||||
and we encourage our users to design their own if they don't see one that fits their use case
|
||||
|
||||
The only requirements are:
|
||||
- Add a method run(self, **kwargs) to your class. **kwargs will contain the output from the previous node in your graph.
|
||||
- Do whatever you want within run() (e.g. reformatting the query)
|
||||
- Create a class that inherits `BaseComponent`.
|
||||
- Add a method run() to your class. Add the mandatory and optional arguments it needs to process. These arguments must be passed as input to the pipeline, inside `params`, or output by preceding nodes.
|
||||
- Add processing logic inside the run() (e.g. reformatting the query).
|
||||
- Return a tuple that contains your output data (for the next node)
|
||||
and the name of the outgoing edge (by default "output_1" for nodes that have one output)
|
||||
- Add a class attribute outgoing_edges = 1 that defines the number of output options from your node. You only need a higher number here if you have a decision node (see below).
|
||||
@ -276,12 +275,16 @@ Here we have a template for a Node:
|
||||
|
||||
|
||||
```python
|
||||
class NodeTemplate():
|
||||
from haystack import BaseComponent
|
||||
from typing import Optional
|
||||
|
||||
class CustomNode(BaseComponent):
|
||||
outgoing_edges = 1
|
||||
|
||||
def run(self, **kwargs):
|
||||
# Insert code here to manipulate the variables in kwarg
|
||||
return (kwargs, "output_1")
|
||||
def run(self, query: str, my_optional_param: Optional[int]):
|
||||
# process the inputs
|
||||
output = {"my_output": ...}
|
||||
return output, "output_1"
|
||||
```
|
||||
|
||||
## Decision Nodes
|
||||
@ -300,14 +303,14 @@ Below, we define a very naive `QueryClassifier` and show how to use it:
|
||||
|
||||
|
||||
```python
|
||||
class QueryClassifier():
|
||||
class QueryClassifier(BaseComponent):
|
||||
outgoing_edges = 2
|
||||
|
||||
def run(self, **kwargs):
|
||||
if "?" in kwargs["query"]:
|
||||
return (kwargs, "output_2")
|
||||
def run(self, query: str):
|
||||
if "?" in query:
|
||||
return {}, "output_2"
|
||||
else:
|
||||
return (kwargs, "output_1")
|
||||
return {}, "output_1"
|
||||
|
||||
# Here we build the pipeline
|
||||
p_classifier = Pipeline()
|
||||
@ -318,18 +321,12 @@ p_classifier.add_node(component=reader, name="QAReader", inputs=["ESRetriever",
|
||||
p_classifier.draw("pipeline_classifier.png")
|
||||
|
||||
# Run only the dense retriever on the full sentence query
|
||||
res_1 = p_classifier.run(
|
||||
query="Who is the father of Arya Stark?",
|
||||
top_k_retriever=10
|
||||
)
|
||||
res_1 = p_classifier.run(query="Who is the father of Arya Stark?")
|
||||
print("DPR Results" + "\n" + "="*15)
|
||||
print_answers(res_1)
|
||||
|
||||
# Run only the sparse retriever on a keyword based query
|
||||
res_2 = p_classifier.run(
|
||||
query="Arya Stark father",
|
||||
top_k_retriever=10
|
||||
)
|
||||
res_2 = p_classifier.run(query="Arya Stark father")
|
||||
print("ES Results" + "\n" + "="*15)
|
||||
print_answers(res_2)
|
||||
```
|
||||
|
||||
@ -102,7 +102,7 @@ from haystack.pipeline import DocumentSearchPipeline
|
||||
p_retrieval = DocumentSearchPipeline(retriever)
|
||||
res = p_retrieval.run(
|
||||
query="Tell me something about Arya Stark?",
|
||||
top_k_retriever=5
|
||||
params={"top_k": 5}
|
||||
)
|
||||
print_documents(res, max_text_len=512)
|
||||
|
||||
@ -138,12 +138,15 @@ pipe = GenerativeQAPipeline(generator, retriever)
|
||||
|
||||
|
||||
```python
|
||||
pipe.run(query="Why did Arya Stark's character get portrayed in a television adaptation?", top_k_retriever=1)
|
||||
pipe.run(
|
||||
query="Why did Arya Stark's character get portrayed in a television adaptation?",
|
||||
params={"Retriever": {"top_k": 1}}
|
||||
)
|
||||
```
|
||||
|
||||
|
||||
```python
|
||||
pipe.run(query="What kind of character does Arya Stark play?", top_k_retriever=1)
|
||||
pipe.run(query="What kind of character does Arya Stark play?", params={"Retriever": {"top_k": 1}})
|
||||
```
|
||||
|
||||
## About us
|
||||
|
||||
@ -118,7 +118,7 @@ got_dicts = convert_files_to_dicts(
|
||||
# Initialize DocumentStore and index documents
|
||||
launch_es()
|
||||
document_store = ElasticsearchDocumentStore()
|
||||
document_store.delete_all_documents()
|
||||
document_store.delete_documents()
|
||||
document_store.write_documents(got_dicts)
|
||||
|
||||
# Initialize Sparse retriever
|
||||
@ -162,16 +162,14 @@ sklearn_keyword_classifier.draw("pipeline_classifier.png")
|
||||
|
||||
# Run only the dense retriever on the full sentence query
|
||||
res_1 = sklearn_keyword_classifier.run(
|
||||
query="Who is the father of Arya Stark?",
|
||||
top_k_retriever=10
|
||||
query="Who is the father of Arya Stark?"
|
||||
)
|
||||
print("DPR Results" + "\n" + "="*15)
|
||||
print_answers(res_1)
|
||||
|
||||
# Run only the sparse retriever on a keyword based query
|
||||
res_2 = sklearn_keyword_classifier.run(
|
||||
query="arya stark father",
|
||||
top_k_retriever=10
|
||||
query="arya stark father"
|
||||
)
|
||||
print("ES Results" + "\n" + "="*15)
|
||||
print_answers(res_2)
|
||||
@ -183,16 +181,14 @@ print_answers(res_2)
|
||||
|
||||
# Run only the dense retriever on the full sentence query
|
||||
res_3 = sklearn_keyword_classifier.run(
|
||||
query="which country was jon snow filmed ?",
|
||||
top_k_retriever=10
|
||||
query="which country was jon snow filmed ?"
|
||||
)
|
||||
print("DPR Results" + "\n" + "="*15)
|
||||
print_answers(res_3)
|
||||
|
||||
# Run only the sparse retriever on a keyword based query
|
||||
res_4 = sklearn_keyword_classifier.run(
|
||||
query="jon snow country",
|
||||
top_k_retriever=10
|
||||
query="jon snow country"
|
||||
)
|
||||
print("ES Results" + "\n" + "="*15)
|
||||
print_answers(res_4)
|
||||
@ -202,16 +198,14 @@ print_answers(res_4)
|
||||
```python
|
||||
# Run only the dense retriever on the full sentence query
|
||||
res_5 = sklearn_keyword_classifier.run(
|
||||
query="who are the younger brothers of arya stark ?",
|
||||
top_k_retriever=10
|
||||
query="who are the younger brothers of arya stark ?"
|
||||
)
|
||||
print("DPR Results" + "\n" + "="*15)
|
||||
print_answers(res_5)
|
||||
|
||||
# Run only the sparse retriever on a keyword based query
|
||||
res_6 = sklearn_keyword_classifier.run(
|
||||
query="arya stark younger brothers",
|
||||
top_k_retriever=10
|
||||
query="arya stark younger brothers"
|
||||
)
|
||||
print("ES Results" + "\n" + "="*15)
|
||||
print_answers(res_6)
|
||||
@ -241,16 +235,14 @@ transformer_keyword_classifier.draw("pipeline_classifier.png")
|
||||
|
||||
# Run only the dense retriever on the full sentence query
|
||||
res_1 = transformer_keyword_classifier.run(
|
||||
query="Who is the father of Arya Stark?",
|
||||
top_k_retriever=10
|
||||
query="Who is the father of Arya Stark?"
|
||||
)
|
||||
print("DPR Results" + "\n" + "="*15)
|
||||
print_answers(res_1)
|
||||
|
||||
# Run only the sparse retriever on a keyword based query
|
||||
res_2 = transformer_keyword_classifier.run(
|
||||
query="arya stark father",
|
||||
top_k_retriever=10
|
||||
query="arya stark father"
|
||||
)
|
||||
print("ES Results" + "\n" + "="*15)
|
||||
print_answers(res_2)
|
||||
@ -262,16 +254,14 @@ print_answers(res_2)
|
||||
|
||||
# Run only the dense retriever on the full sentence query
|
||||
res_3 = transformer_keyword_classifier.run(
|
||||
query="which country was jon snow filmed ?",
|
||||
top_k_retriever=10
|
||||
query="which country was jon snow filmed ?"
|
||||
)
|
||||
print("DPR Results" + "\n" + "="*15)
|
||||
print_answers(res_3)
|
||||
|
||||
# Run only the sparse retriever on a keyword based query
|
||||
res_4 = transformer_keyword_classifier.run(
|
||||
query="jon snow country",
|
||||
top_k_retriever=10
|
||||
query="jon snow country"
|
||||
)
|
||||
print("ES Results" + "\n" + "="*15)
|
||||
print_answers(res_4)
|
||||
@ -281,16 +271,14 @@ print_answers(res_4)
|
||||
```python
|
||||
# Run only the dense retriever on the full sentence query
|
||||
res_5 = transformer_keyword_classifier.run(
|
||||
query="who are the younger brothers of arya stark ?",
|
||||
top_k_retriever=10
|
||||
query="who are the younger brothers of arya stark ?"
|
||||
)
|
||||
print("DPR Results" + "\n" + "="*15)
|
||||
print_answers(res_5)
|
||||
|
||||
# Run only the sparse retriever on a keyword based query
|
||||
res_6 = transformer_keyword_classifier.run(
|
||||
query="arya stark younger brothers",
|
||||
top_k_retriever=10
|
||||
query="arya stark younger brothers"
|
||||
)
|
||||
print("ES Results" + "\n" + "="*15)
|
||||
print_answers(res_6)
|
||||
@ -318,16 +306,14 @@ transformer_question_classifier.draw("question_classifier.png")
|
||||
|
||||
# Run only the QA reader on the question query
|
||||
res_1 = transformer_question_classifier.run(
|
||||
query="Who is the father of Arya Stark?",
|
||||
top_k_retriever=10
|
||||
query="Who is the father of Arya Stark?"
|
||||
)
|
||||
print("DPR Results" + "\n" + "="*15)
|
||||
print_answers(res_1)
|
||||
|
||||
# Show only DPR results
|
||||
res_2 = transformer_question_classifier.run(
|
||||
query="Arya Stark was the daughter of a Lord.",
|
||||
top_k_retriever=10
|
||||
query="Arya Stark was the daughter of a Lord."
|
||||
)
|
||||
print("ES Results" + "\n" + "="*15)
|
||||
res_2
|
||||
|
||||
@ -169,14 +169,16 @@ pipe = ExtractiveQAPipeline(reader, retriever)
|
||||
|
||||
```python
|
||||
# You can configure how many candidates the reader and retriever shall return
|
||||
# The higher top_k_retriever, the better (but also the slower) your answers.
|
||||
prediction = pipe.run(query="Who is the father of Arya Stark?", top_k_retriever=10, top_k_reader=5)
|
||||
# The higher top_k for retriever, the better (but also the slower) your answers.
|
||||
prediction = pipe.run(
|
||||
query="Who is the father of Arya Stark?", params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}}
|
||||
)
|
||||
```
|
||||
|
||||
|
||||
```python
|
||||
# prediction = pipe.run(query="Who created the Dothraki vocabulary?", top_k_reader=5)
|
||||
# prediction = pipe.run(query="Who is the sister of Sansa?", top_k_reader=5)
|
||||
# prediction = pipe.run(query="Who created the Dothraki vocabulary?", params={"Reader": {"top_k": 5}})
|
||||
# prediction = pipe.run(query="Who is the sister of Sansa?", params={"Reader": {"top_k": 5}})
|
||||
```
|
||||
|
||||
|
||||
|
||||
@ -103,7 +103,7 @@ from haystack.document_store.elasticsearch import ElasticsearchDocumentStore
|
||||
document_store = ElasticsearchDocumentStore(host="localhost", username="", password="",
|
||||
index="document",
|
||||
embedding_field="question_emb",
|
||||
embedding_dim=768,
|
||||
embedding_dim=384,
|
||||
excluded_meta_data=["question_emb"])
|
||||
```
|
||||
|
||||
@ -113,7 +113,7 @@ We can use the `EmbeddingRetriever` for this purpose and specify a model that we
|
||||
|
||||
|
||||
```python
|
||||
retriever = EmbeddingRetriever(document_store=document_store, embedding_model="deepset/sentence_bert", use_gpu=True)
|
||||
retriever = EmbeddingRetriever(document_store=document_store, embedding_model="sentence-transformers/all-MiniLM-L6-v2", use_gpu=True)
|
||||
```
|
||||
|
||||
### Prepare & Index FAQ data
|
||||
@ -154,7 +154,7 @@ pipe = FAQPipeline(retriever=retriever)
|
||||
|
||||
|
||||
```python
|
||||
prediction = pipe.run(query="How is the virus spreading?", top_k_retriever=10)
|
||||
prediction = pipe.run(query="How is the virus spreading?", params={"Retriever": {"top_k": 10}})
|
||||
print_answers(prediction, details="all")
|
||||
```
|
||||
|
||||
|
||||
@ -219,10 +219,8 @@ results = []
|
||||
for l in labels:
|
||||
res = p.run(
|
||||
query=l.question,
|
||||
top_k_retriever=10,
|
||||
labels=l,
|
||||
top_k_reader=10,
|
||||
index=doc_index,
|
||||
params={"index": doc_index, "Retriever": {"top_k": 10}, "Reader": {"top_k": 5}},
|
||||
)
|
||||
results.append(res)
|
||||
```
|
||||
|
||||
@ -213,8 +213,10 @@ pipe = ExtractiveQAPipeline(reader, retriever)
|
||||
|
||||
```python
|
||||
# You can configure how many candidates the reader and retriever shall return
|
||||
# The higher top_k_retriever, the better (but also the slower) your answers.
|
||||
prediction = pipe.run(query="Who created the Dothraki vocabulary?", top_k_retriever=10, top_k_reader=5)
|
||||
# The higher top_k for retriever, the better (but also the slower) your answers.
|
||||
prediction = pipe.run(
|
||||
query="Who created the Dothraki vocabulary?", params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}}
|
||||
)
|
||||
```
|
||||
|
||||
|
||||
|
||||
@ -128,7 +128,7 @@ The `update_embeddings()` method uses the retriever to create an embedding for e
|
||||
|
||||
```python
|
||||
# Delete existing documents in the document store
|
||||
document_store.delete_all_documents()
|
||||
document_store.delete_documents()
|
||||
|
||||
# Write documents to document store
|
||||
document_store.write_documents(documents)
|
||||
@ -195,7 +195,7 @@ from haystack.pipeline import GenerativeQAPipeline
|
||||
|
||||
pipe = GenerativeQAPipeline(generator=generator, retriever=retriever)
|
||||
for question in QUESTIONS:
|
||||
res = pipe.run(query=question, top_k_generator=1, top_k_retriever=5)
|
||||
res = pipe.run(query=question, params={"Generator": {"top_k": 1}, "Retriever": {"top_k": 5}})
|
||||
print(res)
|
||||
```
|
||||
|
||||
|
||||
@ -33,11 +33,11 @@ class Crawler(BaseComponent):
|
||||
|
||||
:param output_dir: Path for the directory to store files
|
||||
:param urls: List of http(s) address(es) (can also be supplied later when calling crawl())
|
||||
:param crawler_depth: How many sublinks to follow from the initial list of URLs. Current options:
|
||||
0: Only initial list of urls
|
||||
1: Follow links found on the initial URLs (but no further)
|
||||
:param filter_urls: Optional list of regular expressions that the crawled URLs must comply with.
|
||||
All URLs not matching at least one of the regular expressions will be dropped.
|
||||
:param crawler_depth: How many sublinks to follow from the initial list of URLs. Current options:
|
||||
0: Only initial list of urls
|
||||
1: Follow links found on the initial URLs (but no further)
|
||||
:param filter_urls: Optional list of regular expressions that the crawled URLs must comply with.
|
||||
All URLs not matching at least one of the regular expressions will be dropped.
|
||||
:param overwrite_existing_files: Whether to overwrite existing files in output_dir with new content
|
||||
"""
|
||||
IN_COLAB = "google.colab" in sys.modules
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user