diff --git a/.github/workflows/deploy_website.yml b/.github/workflows/deploy_website.yml deleted file mode 100644 index b7593a3ba..000000000 --- a/.github/workflows/deploy_website.yml +++ /dev/null @@ -1,24 +0,0 @@ -name: Deploy website - -# Controls when the action will run. Triggers the workflow on push -# events but only for the master branch -on: - push: - branches: [ master, benchmarks ] - -jobs: - # This workflow contains a single job called "build" - build: - # The type of runner that the job will run on - runs-on: ubuntu-latest - - steps: - - # Creates dispatch event for haystack-website repo - - name: Repository Dispatch - uses: peter-evans/repository-dispatch@v1 - with: - token: ${{ secrets.PUBLIC_REPO_ACCESS_TOKEN }} - repository: deepset-ai/haystack-website - event-type: deploy-website - client-payload: '{}' diff --git a/.github/workflows/deploy_website_staging.yml b/.github/workflows/deploy_website_staging.yml deleted file mode 100644 index 8b939669b..000000000 --- a/.github/workflows/deploy_website_staging.yml +++ /dev/null @@ -1,26 +0,0 @@ -name: Deploy website - -# Controls when the action will run. Triggers the workflow on push -# events but only for the master branch -on: - push: - branches-ignore: - - master - - benchmarks - -jobs: - # This workflow contains a single job called "build" - build: - # The type of runner that the job will run on - runs-on: ubuntu-latest - - steps: - - # Creates dispatch event for haystack-website repo - - name: Repository Dispatch - uses: peter-evans/repository-dispatch@v1 - with: - token: ${{ secrets.PUBLIC_REPO_ACCESS_TOKEN }} - repository: deepset-ai/haystack-website - event-type: deploy-website-staging - client-payload: '{"ref": "${{ github.ref }}"}' diff --git a/.github/workflows/update_docs.yml b/.github/workflows/update_docs.yml index 2fc455efe..1d54e354d 100644 --- a/.github/workflows/update_docs.yml +++ b/.github/workflows/update_docs.yml @@ -20,17 +20,18 @@ jobs: persist-credentials: false # otherwise, the token used is the GITHUB_TOKEN, instead of your personal token fetch-depth: 0 # otherwise, you will failed to push refs to dest repo - - name: Set up Python 3.7 + - name: Set up Python 3.8.10 uses: actions/setup-python@v2 with: - python-version: 3.7 + python-version: 3.8.10 - name: Install dependencies run: | python -m pip install --upgrade pip - pip install 'pydoc-markdown>=3.0.0,<4.0.0' + pip install pydoc-markdown==3.11.0 pip install mkdocs pip install jupytercontrib + pip install watchdog==1.0.2 # Generates the docstrings and tutorials so that we have the latest for the deployment - name: Generate Docstrings and Tutorials diff --git a/docs/_src/api/api/crawler.md b/docs/_src/api/api/crawler.md index fcc03f79d..c95af3b07 100644 --- a/docs/_src/api/api/crawler.md +++ b/docs/_src/api/api/crawler.md @@ -14,10 +14,9 @@ Crawl texts from a website so that we can use them later in Haystack as a corpus ```python | from haystack.connector import Crawler | -| crawler = Crawler() -| # crawl Haystack docs, i.e. all pages that include haystack.deepset.ai/docs/ -| docs = crawler.crawl(urls=["https://haystack.deepset.ai/docs/latest/get_startedmd"], -| output_dir="crawled_files", +| crawler = Crawler(output_dir="crawled_files") +| # crawl Haystack docs, i.e. all pages that include haystack.deepset.ai/overview/ +| docs = crawler.crawl(urls=["https://haystack.deepset.ai/overview/get-started"], | filter_urls= ["haystack\.deepset\.ai\/docs\/"]) ``` @@ -35,10 +34,10 @@ Init object with basic params for crawling (can be overwritten later). 
- `output_dir`: Path for the directory to store files - `urls`: List of http(s) address(es) (can also be supplied later when calling crawl()) - `crawler_depth`: How many sublinks to follow from the initial list of URLs. Current options: - 0: Only initial list of urls - 1: Follow links found on the initial URLs (but no further) + 0: Only initial list of urls + 1: Follow links found on the initial URLs (but no further) - `filter_urls`: Optional list of regular expressions that the crawled URLs must comply with. - All URLs not matching at least one of the regular expressions will be dropped. + All URLs not matching at least one of the regular expressions will be dropped. - `overwrite_existing_files`: Whether to overwrite existing files in output_dir with new content @@ -73,7 +72,7 @@ List of paths where the crawled webpages got stored #### run ```python - | run(output_dir: Union[str, Path, None] = None, urls: Optional[List[str]] = None, crawler_depth: Optional[int] = None, filter_urls: Optional[List] = None, overwrite_existing_files: Optional[bool] = None, return_documents: Optional[bool] = False, **kwargs) -> Tuple[Dict, str] + | run(output_dir: Union[str, Path, None] = None, urls: Optional[List[str]] = None, crawler_depth: Optional[int] = None, filter_urls: Optional[List] = None, overwrite_existing_files: Optional[bool] = None, return_documents: Optional[bool] = False) -> Tuple[Dict, str] ``` Method to be executed when the Crawler is used as a Node within a Haystack pipeline. @@ -88,7 +87,7 @@ Method to be executed when the Crawler is used as a Node within a Haystack pipel - `filter_urls`: Optional list of regular expressions that the crawled URLs must comply with. All URLs not matching at least one of the regular expressions will be dropped. - `overwrite_existing_files`: Whether to overwrite existing files in output_dir with new content -- `return_documents`: Return json files content +- `return_documents`: Return json files content **Returns**: diff --git a/docs/_src/api/api/document_store.md b/docs/_src/api/api/document_store.md index 9fde0d43c..c14e82e69 100644 --- a/docs/_src/api/api/document_store.md +++ b/docs/_src/api/api/document_store.md @@ -131,7 +131,7 @@ class ElasticsearchDocumentStore(BaseDocumentStore) #### \_\_init\_\_ ```python - | __init__(host: Union[str, List[str]] = "localhost", port: Union[int, List[int]] = 9200, username: str = "", password: str = "", api_key_id: Optional[str] = None, api_key: Optional[str] = None, aws4auth=None, index: str = "document", label_index: str = "label", search_fields: Union[str, list] = "text", text_field: str = "text", name_field: str = "name", embedding_field: str = "embedding", embedding_dim: int = 768, custom_mapping: Optional[dict] = None, excluded_meta_data: Optional[list] = None, faq_question_field: Optional[str] = None, analyzer: str = "standard", scheme: str = "http", ca_certs: Optional[str] = None, verify_certs: bool = True, create_index: bool = True, refresh_type: str = "wait_for", similarity="dot_product", timeout=30, return_embedding: bool = False, duplicate_documents: str = 'overwrite') + | __init__(host: Union[str, List[str]] = "localhost", port: Union[int, List[int]] = 9200, username: str = "", password: str = "", api_key_id: Optional[str] = None, api_key: Optional[str] = None, aws4auth=None, index: str = "document", label_index: str = "label", search_fields: Union[str, list] = "text", text_field: str = "text", name_field: str = "name", embedding_field: str = "embedding", embedding_dim: int = 768, custom_mapping: 
Optional[dict] = None, excluded_meta_data: Optional[list] = None, faq_question_field: Optional[str] = None, analyzer: str = "standard", scheme: str = "http", ca_certs: Optional[str] = None, verify_certs: bool = True, create_index: bool = True, refresh_type: str = "wait_for", similarity="dot_product", timeout=30, return_embedding: bool = False, duplicate_documents: str = 'overwrite', index_type: str = "flat") ``` A DocumentStore using Elasticsearch to store and query the documents for our search. @@ -181,6 +181,8 @@ A DocumentStore using Elasticsearch to store and query the documents for our sea overwrite: Update any existing documents with the same ID when adding documents. fail: an error is raised if the document ID of the document being added already exists. +- `index_type`: The type of index to be created. Choose from 'flat' and 'hnsw'. Currently the + ElasticsearchDocumentStore does not support HNSW but OpenDistroElasticsearchDocumentStore does. #### get\_document\_by\_id @@ -467,17 +469,48 @@ Delete documents in an index. All documents are deleted if no filters are passed None + +## OpenSearchDocumentStore Objects + +```python +class OpenSearchDocumentStore(ElasticsearchDocumentStore) +``` + +Document Store using OpenSearch (https://opensearch.org/). It is compatible with the AWS Elasticsearch Service. + +In addition to native Elasticsearch query & filtering, it provides efficient vector similarity search using +the KNN plugin that can scale to a large number of documents. + + +#### query\_by\_embedding + +```python + | query_by_embedding(query_emb: np.ndarray, filters: Optional[Dict[str, List[str]]] = None, top_k: int = 10, index: Optional[str] = None, return_embedding: Optional[bool] = None) -> List[Document] +``` + +Find the documents that are most similar to the provided `query_emb` by using a vector similarity metric. + +**Arguments**: + +- `query_emb`: Embedding of the query (e.g. gathered from DPR) +- `filters`: Optional filters to narrow down the search space. + Example: {"name": ["some", "more"], "category": ["only_one"]} +- `top_k`: How many documents to return +- `index`: Index name for storing the docs and metadata +- `return_embedding`: Whether to return the document embedding with each result + +**Returns**: + +List of Document that are most similar to `query_emb` + + ## OpenDistroElasticsearchDocumentStore Objects ```python -class OpenDistroElasticsearchDocumentStore(ElasticsearchDocumentStore) +class OpenDistroElasticsearchDocumentStore(OpenSearchDocumentStore) ``` -Document Store using the Open Distro for Elasticsearch. It is compatible with the AWS Elasticsearch Service. - -In addition to native Elasticsearch query & filtering, it provides efficient vector similarity search using -the KNN plugin that can scale to a large number of documents. +A DocumentStore which has an Open Distro for Elasticsearch service behind it. # Module memory @@ -1717,3 +1750,21 @@ Delete documents in an index. All documents are deleted if no filters are passed None + +#### delete\_documents + +```python + | delete_documents(index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None) +``` + +Delete documents in an index. All documents are deleted if no filters are passed. + +**Arguments**: + +- `index`: Index name to delete the document from. +- `filters`: Optional filters to narrow down the documents to be deleted. 
+ +**Returns**: + +None + diff --git a/docs/_src/api/api/evaluation.md b/docs/_src/api/api/evaluation.md index 91f3da7ea..1c4a77930 100644 --- a/docs/_src/api/api/evaluation.md +++ b/docs/_src/api/api/evaluation.md @@ -5,7 +5,7 @@ ## EvalDocuments Objects ```python -class EvalDocuments() +class EvalDocuments(BaseComponent) ``` This is a pipeline node that should be placed after a node that returns a List of Document, e.g., Retriever or @@ -13,13 +13,13 @@ Ranker, in order to assess its performance. Performance metrics are stored in th sample passes through it. To view the results of the evaluation, call EvalDocuments.print(). Note that results from this Node may differ from that when calling Retriever.eval() since that is a closed domain evaluation. Have a look at our evaluation tutorial for more info about open vs closed domain eval ( -https://haystack.deepset.ai/docs/latest/tutorial5md). +https://haystack.deepset.ai/tutorials/evaluation). #### \_\_init\_\_ ```python - | __init__(debug: bool = False, open_domain: bool = True, top_k_eval_documents: int = 10, name="EvalDocuments") + | __init__(debug: bool = False, open_domain: bool = True, top_k: int = 10) ``` **Arguments**: @@ -33,7 +33,7 @@ https://haystack.deepset.ai/docs/latest/tutorial5md). #### run ```python - | run(documents, labels: dict, top_k_eval_documents: Optional[int] = None, **kwargs) + | run(documents: List[Document], labels: List[Label], top_k: Optional[int] = None) ``` Run this node on one sample and its labels @@ -51,7 +51,7 @@ Print the evaluation results ## EvalAnswers Objects ```python -class EvalAnswers() +class EvalAnswers(BaseComponent) ``` This is a pipeline node that should be placed after a Reader in order to assess the performance of the Reader @@ -59,26 +59,37 @@ individually or to assess the extractive QA performance of the whole pipeline. P this class and updated as each sample passes through it. To view the results of the evaluation, call EvalAnswers.print(). Note that results from this Node may differ from that when calling Reader.eval() since that is a closed domain evaluation. Have a look at our evaluation tutorial for more info about -open vs closed domain eval (https://haystack.deepset.ai/docs/latest/tutorial5md). +open vs closed domain eval (https://haystack.deepset.ai/tutorials/evaluation). #### \_\_init\_\_ ```python - | __init__(skip_incorrect_retrieval: bool = True, open_domain: bool = True, debug: bool = False) + | __init__(skip_incorrect_retrieval: bool = True, open_domain: bool = True, sas_model: str = None, debug: bool = False) ``` **Arguments**: - `skip_incorrect_retrieval`: When set to True, this eval will ignore the cases where the retriever returned no correct documents - `open_domain`: When True, extracted answers are evaluated purely on string similarity rather than the position of the extracted answer +- `sas_model`: Name or path of "Semantic Answer Similarity (SAS) model". When set, the model will be used to calculate similarity between predictions and labels and generate the SAS metric. + The SAS metric correlates better with human judgement of correct answers as it does not rely on string overlaps. + Example: Prediction = "30%", Label = "thirty percent", EM and F1 would be overly pessimistic with both being 0, while SAS paints a more realistic picture. + More info in the paper: https://arxiv.org/abs/2108.06130 + Models: + - You can use Bi Encoders (sentence transformers) or cross encoders trained on Semantic Textual Similarity (STS) data. 
+ Not all cross encoders can be used because of different return types. + If you use custom cross encoders, please make sure they work with the sentence_transformers.CrossEncoder class + - Good default for multiple languages: "sentence-transformers/paraphrase-multilingual-mpnet-base-v2" + - Large, powerful, but slow model for English only: "cross-encoder/stsb-roberta-large" + - Large model for German only: "deepset/gbert-large-sts" - `debug`: When True, a record of each sample and its evaluation will be stored in EvalAnswers.log #### run ```python - | run(labels, answers, **kwargs) + | run(labels: List[Label], answers: List[dict], correct_retrieval: bool) ``` Run this node on one sample and its labels @@ -92,3 +103,24 @@ Run this node on one sample and its labels Print the evaluation results + +#### semantic\_answer\_similarity + +```python +semantic_answer_similarity(predictions: List[List[str]], gold_labels: List[List[str]], sas_model_name_or_path: str = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2") -> Tuple[List[float],List[float]] +``` + +Computes Transformer-based similarity of predicted answers to gold labels to derive a more meaningful metric than EM or F1. +Returns per QA pair a) the similarity of the most likely prediction (top 1) to all available gold labels + b) the highest similarity of all predictions to gold labels + +**Arguments**: + +- `predictions`: Predicted answers as list of multiple preds per question +- `gold_labels`: Labels as list of multiple possible answers per question +- `sas_model_name_or_path`: SentenceTransformers semantic textual similarity model, should be path or string + pointing to downloadable models. + + +**Returns**: top_1_sas, top_k_sas + diff --git a/docs/_src/api/api/file_converter.md b/docs/_src/api/api/file_converter.md index 472afc9c8..e23bddbce 100644 --- a/docs/_src/api/api/file_converter.md +++ b/docs/_src/api/api/file_converter.md @@ -77,6 +77,15 @@ class FileTypeClassifier(BaseComponent) Route files in an Indexing Pipeline to corresponding file converters. + +#### run + +```python + | run(file_paths: Union[Path, List[Path]]) +``` + +Return the output based on the file extension + # Module txt @@ -272,3 +281,60 @@ Extract text from a .pdf file using the pdftotext library (https://www.xpdfreade "xef\xac\x81c" (see test cases). That's why we keep "Latin 1" as default here. (See list of available encodings by running `pdftotext -listencodings` in the terminal) + +## PDFToTextOCRConverter Objects + +```python +class PDFToTextOCRConverter(BaseConverter) +``` + + +#### \_\_init\_\_ + +```python + | __init__(remove_numeric_tables: bool = False, valid_languages: Optional[List[str]] = ["eng"]) +``` + +Extract text from image files using the pytesseract library (https://github.com/madmaze/pytesseract) + +**Arguments**: + +- `remove_numeric_tables`: This option uses heuristics to remove numeric rows from the tables. + The tabular structures in documents might be noise for the reader model if it + does not have table parsing capability for finding answers. However, tables + may also have long strings that could be possible candidates for searching answers. + The rows containing strings are thus retained in this option. +- `valid_languages`: validate languages from a list of languages supported by tesseract + (https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html). + This option can be used to add a test for encoding errors. If the extracted text is + not one of the valid languages, then it is likely an encoding error resulting + in garbled text. + + +#### convert + +```python + | convert(file_path: Path, meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = "utf-8") -> Dict[str, Any] +``` + +Convert a file to a dictionary containing the text and any associated meta data. + +File converters may extract file meta like name or size. In addition, user-supplied meta data like author, url, and external IDs can be added as a dictionary. + +**Arguments**: + +- `file_path`: path of the file to convert +- `meta`: dictionary of meta data key-value pairs to append in the returned document. +- `remove_numeric_tables`: This option uses heuristics to remove numeric rows from the tables. + The tabular structures in documents might be noise for the reader model if it + does not have table parsing capability for finding answers. However, tables + may also have long strings that could be possible candidates for searching answers. + The rows containing strings are thus retained in this option. +- `valid_languages`: validate languages from a list of languages specified in the ISO 639-1 + (https://en.wikipedia.org/wiki/ISO_639-1) format. + This option can be used to add a test for encoding errors. If the extracted text is + not one of the valid languages, then it is likely an encoding error resulting + in garbled text. +- `encoding`: Select the file encoding (default is `utf-8`) + diff --git a/docs/_src/api/api/generator.md b/docs/_src/api/api/generator.md index 91e62e045..6cb130956 100644 --- a/docs/_src/api/api/generator.md +++ b/docs/_src/api/api/generator.md @@ -96,7 +96,7 @@ See https://huggingface.co/transformers/model_doc/rag.html for more details 'facebook/rag-token-nq', 'facebook/rag-sequence-nq'. See https://huggingface.co/models for full list of available models. - `model_version`: The version of model to use from the HuggingFace model hub. Can be tag name, branch name, or commit hash. -- `retriever`: `DensePassageRetriever` used to embedded passage +- `retriever`: `DensePassageRetriever` used to embed passages for the docs passed to `predict()`. This is optional and is only needed if the docs you pass don't already contain embeddings in `Document.embedding`. - `generator_type`: Which RAG generator implementation to use? RAG-TOKEN or RAG-SEQUENCE - `top_k`: Number of independently generated texts to return - `max_length`: Maximum length of generated text diff --git a/docs/_src/api/api/pipelines.md b/docs/_src/api/api/pipelines.md index b75422c5d..0fefdbdc8 100644 --- a/docs/_src/api/api/pipelines.md +++ b/docs/_src/api/api/pipelines.md @@ -1,11 +1,69 @@ # Module pipeline + +## BasePipeline Objects + +```python +class BasePipeline() +``` + + +#### load\_from\_yaml + +```python + | @classmethod + | load_from_yaml(cls, path: Path, pipeline_name: Optional[str] = None, overwrite_with_env_variables: bool = True) +``` + +Load Pipeline from a YAML file defining the individual components and how they're tied together to form +a Pipeline. A single YAML can declare multiple Pipelines, in which case an explicit `pipeline_name` must +be passed. 
+ +Here's a sample configuration: + + ```yaml + | version: '0.8' + | + | components: # define all the building-blocks for Pipeline + | - name: MyReader # custom-name for the component; helpful for visualization & debugging + | type: FARMReader # Haystack Class name for the component + | params: + | no_ans_boost: -10 + | model_name_or_path: deepset/roberta-base-squad2 + | - name: MyESRetriever + | type: ElasticsearchRetriever + | params: + | document_store: MyDocumentStore # params can reference other components defined in the YAML + | custom_query: null + | - name: MyDocumentStore + | type: ElasticsearchDocumentStore + | params: + | index: haystack_test + | + | pipelines: # multiple Pipelines can be defined using the components from above + | - name: my_query_pipeline # a simple extractive-qa Pipeline + | nodes: + | - name: MyESRetriever + | inputs: [Query] + | - name: MyReader + | inputs: [MyESRetriever] + ``` + +**Arguments**: + +- `path`: path of the YAML file. +- `pipeline_name`: if the YAML contains multiple pipelines, the pipeline_name to load must be set. +- `overwrite_with_env_variables`: Overwrite the YAML configuration with environment variables. For example, + to change index name param for an ElasticsearchDocumentStore, an env + variable 'MYDOCSTORE_PARAMS_INDEX=documents-2021' can be set. Note that an + `_` sign must be used to specify nested hierarchical properties. + ## Pipeline Objects ```python -class Pipeline() +class Pipeline(BasePipeline) ``` Pipeline brings together building blocks to build a complex search pipeline with Haystack & user-defined components. @@ -63,6 +121,37 @@ Set the component for a node in the Pipeline. - `name`: The name of the node. - `component`: The component object to be set at the node. + +#### get\_nodes\_by\_class + +```python + | get_nodes_by_class(class_type) -> List[Any] +``` + +Gets all nodes in the pipeline that are an instance of a certain class (incl. subclasses). +This is for example helpful if you loaded a pipeline and then want to interact directly with the document store. +Example: +| from haystack.document_store.base import BaseDocumentStore +| INDEXING_PIPELINE = Pipeline.load_from_yaml(Path(PIPELINE_YAML_PATH), pipeline_name=INDEXING_PIPELINE_NAME) +| res = INDEXING_PIPELINE.get_nodes_by_class(class_type=BaseDocumentStore) + +**Returns**: + +List of components that are an instance of the requested class + + +#### get\_document\_store + +```python + | get_document_store() -> Optional[BaseDocumentStore] +``` + +Return the document store object used in the current pipeline. + +**Returns**: + +Instance of DocumentStore or None + #### draw @@ -231,6 +320,19 @@ Initialize a Pipeline for Extractive Question Answering. - `reader`: Reader instance - `retriever`: Retriever instance + +#### run + +```python + | run(query: str, params: Optional[dict] = None) +``` + +**Arguments**: + +- `query`: the query string. +- `params`: params for the `retriever` and `reader`. For instance, + params={"retriever": {"top_k": 10}, "reader": {"top_k": 5}} + ## DocumentSearchPipeline Objects ```python class DocumentSearchPipeline(BaseStandardPipeline) ``` #### \_\_init\_\_ ```python | __init__(retriever: BaseRetriever) ``` Initialize a Pipeline for semantic document search. **Arguments**: - `retriever`: Retriever instance + +#### run + +```python + | run(query: str, params: Optional[dict] = None) +``` + +**Arguments**: + +- `query`: the query string. +- `params`: params for the `retriever`. For instance, params={"retriever": {"top_k": 10}} + ## GenerativeQAPipeline Objects ```python class GenerativeQAPipeline(BaseStandardPipeline) ``` #### \_\_init\_\_ ```python | __init__(generator: BaseGenerator, retriever: BaseRetriever) ``` Initialize a Pipeline for Generative Question Answering. 
- `generator`: Generator instance - `retriever`: Retriever instance + +#### run + +```python + | run(query: str, params: Optional[dict] = None) +``` + +**Arguments**: + +- `query`: the query string. +- `params`: params for the `retriever` and `generator`. For instance, + params={"retriever": {"top_k": 10}, "generator": {"top_k": 5}} + ## SearchSummarizationPipeline Objects @@ -283,7 +410,7 @@ class SearchSummarizationPipeline(BaseStandardPipeline) #### \_\_init\_\_ ```python - | __init__(summarizer: BaseSummarizer, retriever: BaseRetriever) + | __init__(summarizer: BaseSummarizer, retriever: BaseRetriever, return_in_answer_format: bool = False) ``` Initialize a Pipeline that retrieves documents for a query and then summarizes those documents. @@ -292,23 +419,22 @@ Initialize a Pipeline that retrieves documents for a query and then summarizes t - `summarizer`: Summarizer instance - `retriever`: Retriever instance +- `return_in_answer_format`: Whether the results should be returned as documents (False) or in the answer + format used in other QA pipelines (True). With the latter, you can use this + pipeline as a "drop-in replacement" for other QA pipelines. #### run ```python - | run(query: str, filters: Optional[Dict] = None, top_k_retriever: Optional[int] = None, generate_single_summary: Optional[bool] = None, return_in_answer_format: bool = False) + | run(query: str, params: Optional[dict] = None) ``` **Arguments**: -- `query`: Your search query -- `filters`: -- `top_k_retriever`: Number of top docs the retriever should pass to the summarizer. - The higher this value, the slower your pipeline. -- `generate_single_summary`: Whether to generate single summary from all retrieved docs (True) or one per doc (False). -- `return_in_answer_format`: Whether the results should be returned as documents (False) or in the answer format used in other QA pipelines (True). - With the latter, you can use this pipeline as a "drop-in replacement" for other QA pipelines. +- `query`: the query string. +- `params`: params for the `retriever` and `summarizer`. For instance, + params={"retriever": {"top_k": 10}, "summarizer": {"generate_single_summary": True}} ## FAQPipeline Objects @@ -330,6 +456,18 @@ Initialize a Pipeline for finding similar FAQs using semantic document search. - `retriever`: Retriever instance + +#### run + +```python + | run(query: str, params: Optional[dict] = None) +``` + +**Arguments**: + +- `query`: the query string. +- `params`: params for the `retriever`. For instance, params={"retriever": {"top_k": 10}} + ## TranslationWrapperPipeline Objects @@ -356,6 +494,45 @@ Wrap a given `pipeline` with the `input_translator` and `output_translator`. - `pipeline`: The pipeline object (e.g. ExtractiveQAPipeline) you want to "wrap". Note that pipelines with split or merge nodes are currently not supported. + +## QuestionGenerationPipeline Objects + +```python +class QuestionGenerationPipeline(BaseStandardPipeline) +``` + +A simple pipeline that takes documents as input and generates +questions that it thinks can be answered by the documents. + + +## RetrieverQuestionGenerationPipeline Objects + +```python +class RetrieverQuestionGenerationPipeline(BaseStandardPipeline) +``` + +A simple pipeline that takes a query as input, performs retrieval, and then generates +questions that it thinks can be answered by the retrieved documents. 
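For orientation, here is a minimal sketch of how these two question generation pipelines could be wired up. It assumes a `QuestionGenerator` component and an already initialized `retriever`; the import paths and call signatures shown are assumptions that may differ between Haystack versions:

```python
from haystack.question_generator import QuestionGenerator
from haystack.pipeline import QuestionGenerationPipeline, RetrieverQuestionGenerationPipeline

question_generator = QuestionGenerator()  # assumption: default question generation model

# Generate questions directly from a list of Document objects
qg_pipeline = QuestionGenerationPipeline(question_generator)
result = qg_pipeline.run(documents=documents)  # `documents` is a placeholder for your docs

# Retrieve documents for a query first, then generate questions from them
rqg_pipeline = RetrieverQuestionGenerationPipeline(retriever, question_generator)
result = rqg_pipeline.run(query="Arya Stark")
```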
+ + +## QuestionAnswerGenerationPipeline Objects + +```python +class QuestionAnswerGenerationPipeline(BaseStandardPipeline) +``` + +This is a pipeline which takes a document as input, generates questions that the model thinks can be answered by +this document, and then performs question answering on these questions using that single document. + + +## RootNode Objects + +```python +class RootNode(BaseComponent) +``` + +RootNode feeds inputs together with corresponding params to a Pipeline. + ## SklearnQueryClassifier Objects ```python class SklearnQueryClassifier(BaseQueryClassifier) ``` A node to classify an incoming query into one of two categories using a lightweight sklearn model. Depending on the result, the query flows to a different branch in your pipeline and the further processing can be customized. You can define this by connecting the further pipeline to either output_1 or output_2 from this node. Pass your own `Sklearn` binary classification model or use one of the following pretrained ones: 1) Keywords vs. Questions/Statements (Default) - query_classifier can be found [here](https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost\_query\_classifier/model.pickle) - query_vectorizer can be found [here](https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost\_query\_classifier/vectorizer.pickle) + query_classifier can be found [here](https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier/model.pickle) + query_vectorizer can be found [here](https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier/vectorizer.pickle) output_1 => question/statement output_2 => keyword query - [Readme](https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost\_query\_classifier/readme.txt) + [Readme](https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier/readme.txt) 2) Questions vs. Statements - query_classifier can be found [here](https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost\_query\_classifier\_statements/model.pickle) - query_vectorizer can be found [here](https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost\_query\_classifier\_statements/vectorizer.pickle) + query_classifier can be found [here](https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier_statements/model.pickle) + query_vectorizer can be found [here](https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier_statements/vectorizer.pickle) output_1 => question output_2 => statement - [Readme](https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost\_query\_classifier\_statements/readme.txt) + [Readme](https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier_statements/readme.txt) - See also the [tutorial](https://haystack.deepset.ai/docs/latest/tutorial11md) on pipelines. + See also the [tutorial](https://haystack.deepset.ai/tutorials/pipelines) on pipelines. #### \_\_init\_\_ ```python | __init__(model_name_or_path: Union[ | str, Any - | ] = "https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost\_query\_classifier/model.pickle", vectorizer_name_or_path: Union[ + | ] = "https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier/model.pickle", vectorizer_name_or_path: Union[ | str, Any - | ] = "https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost\_query\_classifier/vectorizer.pickle") + | ] = "https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier/vectorizer.pickle") ``` **Arguments**: @@ -454,16 +631,16 @@ and the further processing can be customized. 
You can define this by connecting model_name_or_path="shahrukhx01/bert-mini-finetune-question-detection" output_1 => question/statement output_2 => keyword query - [Readme](https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost\_query\_classifier/readme.txt) + [Readme](https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier/readme.txt) 2) Questions vs. Statements `model_name_or_path`="shahrukhx01/question-vs-statement-classifier" output_1 => question output_2 => statement - [Readme](https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost\_query\_classifier\_statements/readme.txt) + [Readme](https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier_statements/readme.txt) - See also the [tutorial](https://haystack.deepset.ai/docs/latest/tutorial11md) on pipelines. + See also the [tutorial](https://haystack.deepset.ai/tutorials/pipelines) on pipelines. #### \_\_init\_\_ @@ -508,3 +685,175 @@ The node allows multiple join modes: to each retriever score. This param is not compatible with the `concatenate` join_mode. - `top_k_join`: Limit documents to top_k based on the resulting scores of the join. + +## RayPipeline Objects + +```python +class RayPipeline(Pipeline) +``` + +Ray (https://ray.io) is a framework for distributed computing. + +Ray allows distributing a Pipeline's components across a cluster of machines. The individual components of a +Pipeline can be independently scaled. For instance, an extractive QA Pipeline deployment can have three replicas +of the Reader and a single replica for the Retriever. It enables efficient resource utilization by horizontally +scaling Components. + +To set the number of replicas, add `replicas` in the YAML config for the node in a pipeline: + + ```yaml + | components: + | ... + | + | pipelines: + | - name: ray_query_pipeline + | type: RayPipeline + | nodes: + | - name: ESRetriever + | replicas: 2 # number of replicas to create on the Ray cluster + | inputs: [ Query ] + ``` + +A RayPipeline can only be created with a YAML Pipeline config. +>>> from haystack.pipeline import RayPipeline +>>> pipeline = RayPipeline.load_from_yaml(path="my_pipelines.yaml", pipeline_name="my_query_pipeline") +>>> pipeline.run(query="What is the capital of Germany?") + +By default, RayPipeline creates an instance of Ray Serve locally. To connect to an existing Ray instance, +set the `address` parameter when creating the RayPipeline instance. + + +#### \_\_init\_\_ + +```python + | __init__(address: str = None, **kwargs) +``` + +**Arguments**: + +- `address`: The IP address for the Ray cluster. If set to None, a local Ray instance is started. +- `kwargs`: Optional parameters for initializing Ray. + + +#### load\_from\_yaml + +```python + | @classmethod + | load_from_yaml(cls, path: Path, pipeline_name: Optional[str] = None, overwrite_with_env_variables: bool = True, address: Optional[str] = None, **kwargs) +``` + +Load Pipeline from a YAML file defining the individual components and how they're tied together to form +a Pipeline. A single YAML can declare multiple Pipelines, in which case an explicit `pipeline_name` must +be passed. 
+ +Here's a sample configuration: + + ```yaml + | version: '0.8' + | + | components: # define all the building-blocks for Pipeline + | - name: MyReader # custom-name for the component; helpful for visualization & debugging + | type: FARMReader # Haystack Class name for the component + | params: + | no_ans_boost: -10 + | model_name_or_path: deepset/roberta-base-squad2 + | - name: MyESRetriever + | type: ElasticsearchRetriever + | params: + | document_store: MyDocumentStore # params can reference other components defined in the YAML + | custom_query: null + | - name: MyDocumentStore + | type: ElasticsearchDocumentStore + | params: + | index: haystack_test + | + | pipelines: # multiple Pipelines can be defined using the components from above + | - name: my_query_pipeline # a simple extractive-qa Pipeline + | nodes: + | - name: MyESRetriever + | inputs: [Query] + | - name: MyReader + | inputs: [MyESRetriever] + ``` + +**Arguments**: + +- `path`: path of the YAML file. +- `pipeline_name`: if the YAML contains multiple pipelines, the pipeline_name to load must be set. +- `overwrite_with_env_variables`: Overwrite the YAML configuration with environment variables. For example, + to change index name param for an ElasticsearchDocumentStore, an env + variable 'MYDOCSTORE_PARAMS_INDEX=documents-2021' can be set. Note that an + `_` sign must be used to specify nested hierarchical properties. +- `address`: The IP address for the Ray cluster. If set to None, a local Ray instance is started. + + +## \_RayDeploymentWrapper Objects + +```python +class _RayDeploymentWrapper() +``` + +Ray Serve supports calling the __init__ methods of classes to create "deployment" instances. + +In the case of Haystack, some Components like Retrievers have complex init methods that need objects +like Document Stores. + +This wrapper class encapsulates the initialization of Components. Given a Component Class +name, it creates an instance using the YAML Pipeline config. + + +#### \_\_init\_\_ + +```python + | __init__(pipeline_config: dict, component_name: str) +``` + +Create an instance of Component. + +**Arguments**: + +- `pipeline_config`: Pipeline YAML parsed as a dict. +- `component_name`: Component Class name. + + +#### \_\_call\_\_ + +```python + | __call__(*args, **kwargs) +``` + +Ray calls this method, which is then redirected to the corresponding component's run(). + + +## MostSimilarDocumentsPipeline Objects + +```python +class MostSimilarDocumentsPipeline(BaseStandardPipeline) +``` + + +#### \_\_init\_\_ + +```python + | __init__(document_store: BaseDocumentStore) +``` + +Initialize a Pipeline for finding the most similar documents to a given document. +This pipeline can be helpful if you already show a relevant document to your end users and they want to search for similar ones. + +**Arguments**: + +- `document_store`: Document Store instance with already stored embeddings. 
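A minimal usage sketch, using the `run()` method documented below (the document IDs are placeholders, and `document_store` is assumed to already hold documents with embeddings):

```python
from haystack.pipeline import MostSimilarDocumentsPipeline

pipeline = MostSimilarDocumentsPipeline(document_store=document_store)
# For each given document ID, return the five most similar documents
similar_docs = pipeline.run(document_ids=["doc_1", "doc_2"], top_k=5)
```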
+ + +#### run + +```python + | run(document_ids: List[str], top_k: int = 5) +``` + +**Arguments**: + +- `document_ids`: IDs of the documents for which to find the most similar documents +- `top_k`: How many similar documents to return for each input document + diff --git a/docs/_src/api/api/preprocessor.md b/docs/_src/api/api/preprocessor.md index 29f880e60..08dfb5191 100644 --- a/docs/_src/api/api/preprocessor.md +++ b/docs/_src/api/api/preprocessor.md @@ -12,7 +12,7 @@ class BasePreProcessor(BaseComponent) #### process ```python - | process(document: dict, clean_whitespace: Optional[bool] = True, clean_header_footer: Optional[bool] = False, clean_empty_lines: Optional[bool] = True, split_by: Optional[str] = "word", split_length: Optional[int] = 1000, split_overlap: Optional[int] = None, split_respect_sentence_boundary: Optional[bool] = True) -> List[dict] + | process(documents: Union[dict, List[dict]], clean_whitespace: Optional[bool] = True, clean_header_footer: Optional[bool] = False, clean_empty_lines: Optional[bool] = True, split_by: Optional[str] = "word", split_length: Optional[int] = 1000, split_overlap: Optional[int] = None, split_respect_sentence_boundary: Optional[bool] = True) -> List[dict] ``` Perform document cleaning and splitting. Takes a single document or a list of documents as input and returns a list of documents. @@ -31,7 +31,7 @@ class PreProcessor(BasePreProcessor) #### \_\_init\_\_ ```python - | __init__(clean_whitespace: bool = True, clean_header_footer: bool = False, clean_empty_lines: bool = True, split_by: str = "word", split_length: int = 1000, split_overlap: int = 0, split_respect_sentence_boundary: bool = True) + | __init__(clean_whitespace: bool = True, clean_header_footer: bool = False, clean_empty_lines: bool = True, split_by: str = "word", split_length: int = 1000, split_overlap: int = 0, split_respect_sentence_boundary: bool = True, language: str = "en") ``` **Arguments**: @@ -54,15 +54,16 @@ class PreProcessor(BasePreProcessor) - `split_respect_sentence_boundary`: Whether to split in partial sentences if split_by -> `word`. If set to True, the individual split will always have complete sentences & the number of words will be <= split_length. +- `language`: The language used by "nltk.tokenize.sent_tokenize" in iso639 format. Available options: "en", "es", "de", "fr" & many more. #### process ```python - | process(document: dict, clean_whitespace: Optional[bool] = None, clean_header_footer: Optional[bool] = None, clean_empty_lines: Optional[bool] = None, split_by: Optional[str] = None, split_length: Optional[int] = None, split_overlap: Optional[int] = None, split_respect_sentence_boundary: Optional[bool] = None) -> List[dict] + | process(documents: Union[dict, List[dict]], clean_whitespace: Optional[bool] = None, clean_header_footer: Optional[bool] = None, clean_empty_lines: Optional[bool] = None, split_by: Optional[str] = None, split_length: Optional[int] = None, split_overlap: Optional[int] = None, split_respect_sentence_boundary: Optional[bool] = None) -> List[dict] ``` -Perform document cleaning and splitting. Takes a single document as input and returns a list of documents. +Perform document cleaning and splitting. Takes a single document or a list of documents as input and returns a list of documents. 
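To make the parameters above concrete, here is a minimal sketch of cleaning and splitting a document; the input dict follows the usual {"text": ..., "meta": ...} format, and the sample values are assumptions rather than recommendations:

```python
from haystack.preprocessor import PreProcessor

processor = PreProcessor(
    clean_whitespace=True,
    clean_empty_lines=True,
    split_by="word",
    split_length=200,
    split_overlap=20,
    split_respect_sentence_boundary=True,
    language="en",
)
doc = {"text": "Arya Stark is the third child of Lord Eddard Stark. ...", "meta": {"name": "got.txt"}}
# Accepts a single dict or a list of dicts; returns a list of smaller, cleaned documents
split_docs = processor.process(doc)
```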
#### clean diff --git a/docs/_src/api/api/ranker.md b/docs/_src/api/api/ranker.md index 321c7d729..d3019a761 100644 --- a/docs/_src/api/api/ranker.md +++ b/docs/_src/api/api/ranker.md @@ -62,13 +62,25 @@ class FARMRanker(BaseRanker) ``` Transformer-based model for Document Re-ranking using the TextPairClassifier of FARM framework (https://github.com/deepset-ai/FARM). +Re-Ranking can be used on top of a retriever to boost the performance for document search. This is particularly useful if the retriever has a high recall but is bad at sorting the documents by relevance. While the underlying model can vary (BERT, Roberta, DistilBERT, ...), the interface remains the same. +FARMRanker handles Cross-Encoder models that internally use two logits and output the classifier's probability of label "1" as similarity score. +This includes TextPairClassification models trained within FARM. +In contrast, SentenceTransformersRanker handles Cross-Encoder models that use a single logit as similarity score. +https://www.sbert.net/docs/pretrained-models/ce-msmarco.html#usage-with-transformers | With a FARMRanker, you can: - - directly get predictions via predict() - fine-tune the model on TextPair data via train() +Usage example: +... +retriever = ElasticsearchRetriever(document_store=document_store) +ranker = FARMRanker(model_name_or_path="deepset/gbert-base-germandpr-reranking") +p = Pipeline() +p.add_node(component=retriever, name="ESRetriever", inputs=["Query"]) +p.add_node(component=ranker, name="Ranker", inputs=["ESRetriever"]) + #### \_\_init\_\_ @@ -108,7 +120,7 @@ Fine-tune a model on a TextPairClassification dataset. Options: **Arguments**: -- `data_dir`: Path to directory containing your training data in SQuAD style +- `data_dir`: Path to directory containing your training data - `train_filename`: Filename of training data - `dev_filename`: Filename of dev / eval data - `test_filename`: Filename of test data @@ -187,7 +199,7 @@ List of dictionaries containing query and ranked list of Document #### predict ```python - | predict(query: str, documents: List[Document], top_k: Optional[int] = None) + | predict(query: str, documents: List[Document], top_k: Optional[int] = None) -> List[Document] ``` Use loaded ranker model to re-rank the supplied list of Document. 
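For standalone use outside a pipeline, a call could look like the following sketch, where `candidate_docs` stands in for a list of Document objects retrieved elsewhere:

```python
from haystack.ranker import FARMRanker

ranker = FARMRanker(model_name_or_path="deepset/gbert-base-germandpr-reranking")
# Returns the documents re-ordered by relevance to the query, most relevant first
reranked_docs = ranker.predict(query="Wer ist der Vater von Arya Stark?", documents=candidate_docs, top_k=5)
```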
diff --git a/docs/_src/api/api/reader.md b/docs/_src/api/api/reader.md index a2bf1fc67..6361744f5 100644 --- a/docs/_src/api/api/reader.md +++ b/docs/_src/api/api/reader.md @@ -8,6 +8,15 @@ class BaseReader(BaseComponent) ``` + +#### run\_batch + +```python + | run_batch(query_doc_list: List[Dict], top_k: Optional[int] = None) +``` + +An unoptimized implementation of running Reader queries in batch + #### timing @@ -39,7 +48,7 @@ While the underlying model can vary (BERT, Roberta, DistilBERT, ...), the interf #### \_\_init\_\_ ```python - | __init__(model_name_or_path: Union[str, Path], model_version: Optional[str] = None, context_window_size: int = 150, batch_size: int = 50, use_gpu: bool = True, no_ans_boost: float = 0.0, return_no_answer: bool = False, top_k: int = 10, top_k_per_candidate: int = 3, top_k_per_sample: int = 1, num_processes: Optional[int] = None, max_seq_len: int = 256, doc_stride: int = 128, progress_bar: bool = True, duplicate_filtering: int = 0) + | __init__(model_name_or_path: Union[str, Path], model_version: Optional[str] = None, context_window_size: int = 150, batch_size: int = 50, use_gpu: bool = True, no_ans_boost: float = 0.0, return_no_answer: bool = False, top_k: int = 10, top_k_per_candidate: int = 3, top_k_per_sample: int = 1, num_processes: Optional[int] = None, max_seq_len: int = 256, doc_stride: int = 128, progress_bar: bool = True, duplicate_filtering: int = 0, use_confidence_scores: bool = True) ``` **Arguments**: @@ -158,7 +167,7 @@ Saves the Reader model so that it can be reused at a later point in time. Use loaded QA model to find answers for a list of queries in each query's supplied list of Document. -Returns list of dictionaries containing answers sorted by (desc.) probability +Returns list of dictionaries containing answers sorted by (desc.) score **Arguments**: @@ -179,7 +188,7 @@ List of dictionaries containing query and answers Use loaded QA model to find answers for a query in the supplied list of Document. -Returns dictionaries containing answers sorted by (desc.) probability. +Returns dictionaries containing answers sorted by (desc.) score. Example: ```python |{ | 'query': 'Who is the father of Arya Stark?', | 'answers':[ | {'answer': 'Eddard,', | 'context': " She travels with her father, Eddard, to King's Landing when he is ", | 'offset_answer_start': 147, | 'offset_answer_end': 154, - | 'probability': 0.9787139466668613, - | 'score': None, + | 'score': 0.9787139466668613, | 'document_id': '1337' | },... | ] |} ``` **Returns**: Dict containing query and answers #### eval\_on\_file ```python - | eval_on_file(data_dir: str, test_filename: str, device: str) + | eval_on_file(data_dir: str, test_filename: str, device: Optional[str] = None) ``` Performs evaluation on a SQuAD-formatted file. Returns a dict containing the following metrics: :type data_dir: Path or str - `test_filename`: The name of the file containing the test data in SQuAD format. :type test_filename: str -- `device`: The device on which the tensors should be processed. Choose from "cpu" and "cuda". +- `device`: The device on which the tensors should be processed. Choose from "cpu" and "cuda" or use the Reader's device by default. 
:type device: str #### eval ```python - | eval(document_store: BaseDocumentStore, device: str, label_index: str = "label", doc_index: str = "eval_document", label_origin: str = "gold_label", calibrate_conf_scores: bool = False) + | eval(document_store: BaseDocumentStore, device: Optional[str] = None, label_index: str = "label", doc_index: str = "eval_document", label_origin: str = "gold_label", calibrate_conf_scores: bool = False) ``` Performs evaluation on evaluation documents in the DocumentStore. @@ -245,7 +253,7 @@ Returns a dict containing the following metrics: **Arguments**: - `document_store`: DocumentStore containing the evaluation documents -- `device`: The device on which the tensors should be processed. Choose from "cpu" and "cuda". +- `device`: The device on which the tensors should be processed. Choose from "cpu" and "cuda" or use the Reader's device by default. - `label_index`: Index/Table name where labeled questions are stored - `doc_index`: Index/Table name where documents that are used for evaluation are stored - `label_origin`: Field name where the gold labels are stored @@ -255,7 +263,7 @@ Returns a dict containing the following metrics: #### calibrate\_confidence\_scores ```python - | calibrate_confidence_scores(document_store: BaseDocumentStore, device: str, label_index: str = "label", doc_index: str = "eval_document", label_origin: str = "gold_label") + | calibrate_confidence_scores(document_store: BaseDocumentStore, device: Optional[str] = None, label_index: str = "label", doc_index: str = "eval_document", label_origin: str = "gold_label") ``` Calibrates confidence scores on evaluation documents in the DocumentStore. @@ -263,7 +271,7 @@ Calibrates confidence scores on evaluation documents in the DocumentStore. **Arguments**: - `document_store`: DocumentStore containing the evaluation documents -- `device`: The device on which the tensors should be processed. Choose from "cpu" and "cuda". +- `device`: The device on which the tensors should be processed. Choose from "cpu" and "cuda" or use the Reader's device by default. - `label_index`: Index/Table name where labeled questions are stored - `doc_index`: Index/Table name where documents that are used for evaluation are stored - `label_origin`: Field name where the gold labels are stored @@ -276,7 +284,7 @@ Calibrates confidence scores on evaluation documents in the DocumentStore. ``` Use loaded QA model to find answers for a question in the supplied list of Document. -Returns dictionaries containing answers sorted by (desc.) probability. +Returns dictionaries containing answers sorted by (desc.) score. Example: ```python |{ @@ -286,8 +294,7 @@ Example: | 'context': " She travels with her father, Eddard, to King's Landing when he is ", | 'offset_answer_start': 147, | 'offset_answer_end': 154, - | 'probability': 0.9787139466668613, - | 'score': None, + | 'score': 0.9787139466668613, | 'document_id': '1337' | },... | ] @@ -395,7 +402,7 @@ If you would like to set no_answer_boost, use a `FARMReader`. Use loaded QA model to find answers for a query in the supplied list of Document. -Returns dictionaries containing answers sorted by (desc.) probability. +Returns dictionaries containing answers sorted by (desc.) score. 
Example: ```python @@ -406,8 +413,7 @@ Example: | 'context': " She travels with her father, Eddard, to King's Landing when he is ", | 'offset_answer_start': 147, | 'offset_answer_end': 154, - | 'probability': 0.9787139466668613, - | 'score': None, + | 'score': 0.9787139466668613, | 'document_id': '1337' | },... | ] diff --git a/docs/_src/api/api/retriever.md b/docs/_src/api/api/retriever.md index 3b770ec82..a7bc7c517 100644 --- a/docs/_src/api/api/retriever.md +++ b/docs/_src/api/api/retriever.md @@ -238,7 +238,7 @@ Karpukhin, Vladimir, et al. (2020): "Dense Passage Retrieval for Open-Domain Que #### \_\_init\_\_ ```python - | __init__(document_store: BaseDocumentStore, query_embedding_model: Union[Path, str] = "facebook/dpr-question_encoder-single-nq-base", passage_embedding_model: Union[Path, str] = "facebook/dpr-ctx_encoder-single-nq-base", model_version: Optional[str] = None, max_seq_len_query: int = 64, max_seq_len_passage: int = 256, top_k: int = 10, use_gpu: bool = True, batch_size: int = 16, embed_title: bool = True, use_fast_tokenizers: bool = True, infer_tokenizer_classes: bool = False, similarity_function: str = "dot_product", progress_bar: bool = True) + | __init__(document_store: BaseDocumentStore, query_embedding_model: Union[Path, str] = "facebook/dpr-question_encoder-single-nq-base", passage_embedding_model: Union[Path, str] = "facebook/dpr-ctx_encoder-single-nq-base", model_version: Optional[str] = None, max_seq_len_query: int = 64, max_seq_len_passage: int = 256, top_k: int = 10, use_gpu: bool = True, batch_size: int = 16, embed_title: bool = True, use_fast_tokenizers: bool = True, infer_tokenizer_classes: bool = False, similarity_function: str = "dot_product", global_loss_buffer_size: int = 150000, progress_bar: bool = True, devices: Optional[List[Union[int, str, torch.device]]] = None) ``` Init the Retriever incl. the two encoder models from a local or remote model checkpoint. @@ -270,8 +270,8 @@ The checkpoint format matches huggingface transformers' model format - `max_seq_len_query`: Longest length of each query sequence. Maximum number of tokens for the query text. Longer ones will be cut down." - `max_seq_len_passage`: Longest length of each passage/context sequence. Maximum number of tokens for the passage text. Longer ones will be cut down." - `top_k`: How many documents to return per query. -- `use_gpu`: Whether to use gpu or not -- `batch_size`: Number of questions or passages to encode at once +- `use_gpu`: Whether to use all available GPUs or the CPU. Falls back on CPU if no GPU is available. +- `batch_size`: Number of questions or passages to encode at once. In case of multiple gpus, this will be the total batch size. - `embed_title`: Whether to concatenate title and passage to a text pair that is then used to create the embedding. This is the approach used in the original paper and is likely to improve performance if your titles contain meaningful information for retrieval (topic, entities etc.) . @@ -283,8 +283,12 @@ The checkpoint format matches huggingface transformers' model format If `False`, the class always loads `DPRQuestionEncoderTokenizer` and `DPRContextEncoderTokenizer`. - `similarity_function`: Which function to apply for calculating the similarity of query and passage embeddings during training. Options: `dot_product` (Default) or `cosine` +- `global_loss_buffer_size`: Buffer size for all_gather() in DDP. + Increase if errors like "encoded data exceeds max_size ..." come up - `progress_bar`: Whether to show a tqdm progress bar or not. 
Can be helpful to disable in production deployments to keep the logs clean. +- `devices`: List of GPU devices to limit inference to certain GPUs and not use all available ones (e.g. ["cuda:0"]). + As multi-GPU training is currently not implemented for DPR, training will only use the first device provided in this list. #### retrieve diff --git a/docs/_src/api/api/summarizer.md b/docs/_src/api/api/summarizer.md index 6cbdcde3a..df77bbb09 100644 --- a/docs/_src/api/api/summarizer.md +++ b/docs/_src/api/api/summarizer.md @@ -110,7 +110,7 @@ https://huggingface.co/models?filter=summarization #### predict ```python - | predict(documents: List[Document], generate_single_summary: Optional[bool] = None) -> List[Document] + | predict(documents: List[Document], generate_single_summary: Optional[bool] = None, truncation: bool = True) -> List[Document] ``` Produce the summarization from the supplied documents. @@ -123,6 +123,7 @@ These document can for example be retrieved via the Retriever. If set to "True", all docs will be joined to a single string that will then be summarized. Important: The summary will depend on the order of the supplied documents! +- `truncation`: Truncate to a maximum length accepted by the model **Returns**: diff --git a/docs/_src/api/api/translator.md b/docs/_src/api/api/translator.md index 4f67f176e..a2ff47cc8 100644 --- a/docs/_src/api/api/translator.md +++ b/docs/_src/api/api/translator.md @@ -15,7 +15,7 @@ Abstract class for a Translator component that translates either a query or a do ```python | @abstractmethod - | translate(query: Optional[str] = None, documents: Optional[Union[List[Document], List[str], List[Dict[str, Any]]]] = None, dict_key: Optional[str] = None, **kwargs) -> Union[str, List[Document], List[str], List[Dict[str, Any]]] + | translate(query: Optional[str] = None, documents: Optional[Union[List[Document], List[str], List[Dict[str, Any]]]] = None, dict_key: Optional[str] = None) -> Union[str, List[Document], List[str], List[Dict[str, Any]]] ``` Translate the passed query or a list of documents from language A to B. @@ -24,7 +24,7 @@ Translate the passed query or a list of documents from language A to B. #### run ```python - | run(query: Optional[str] = None, documents: Optional[Union[List[Document], List[str], List[Dict[str, Any]]]] = None, answers: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None, dict_key: Optional[str] = None, **kwargs) + | run(query: Optional[str] = None, documents: Optional[Union[List[Document], List[str], List[Dict[str, Any]]]] = None, answers: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None, dict_key: Optional[str] = None) ``` Method that gets executed when this class is used as a Node in a Haystack Pipeline @@ -89,7 +89,7 @@ They also have a few multilingual models that support multiple languages at once #### translate ```python - | translate(query: Optional[str] = None, documents: Optional[Union[List[Document], List[str], List[Dict[str, Any]]]] = None, dict_key: Optional[str] = None, **kwargs) -> Union[str, List[Document], List[str], List[Dict[str, Any]]] + | translate(query: Optional[str] = None, documents: Optional[Union[List[Document], List[str], List[Dict[str, Any]]]] = None, dict_key: Optional[str] = None) -> Union[str, List[Document], List[str], List[Dict[str, Any]]] ``` Run the actual translation. You can supply a query or a list of documents. Whatever is supplied will be translated. 
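For instance, a minimal sketch of standalone usage (the OPUS-MT model name is only an example; any translation model supported by the class should work):

```python
from haystack.translator import TransformersTranslator

translator = TransformersTranslator(model_name_or_path="Helsinki-NLP/opus-mt-de-en")
# Translate a plain query string from German to English
translated_query = translator.translate(query="Wer ist der Vater von Arya Stark?")
```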
diff --git a/docs/_src/tutorials/tutorials/1.md b/docs/_src/tutorials/tutorials/1.md index 339ee9a7a..37379adea 100644 --- a/docs/_src/tutorials/tutorials/1.md +++ b/docs/_src/tutorials/tutorials/1.md @@ -224,14 +224,16 @@ pipe = ExtractiveQAPipeline(reader, retriever) ```python # You can configure how many candidates the reader and retriever shall return -# The higher top_k_retriever, the better (but also the slower) your answers. -prediction = pipe.run(query="Who is the father of Arya Stark?", top_k_retriever=10, top_k_reader=5) +# The higher the top_k, the better (but also the slower) your answers. +prediction = pipe.run( + query="Who is the father of Arya Stark?", params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}} +) ``` ```python -# prediction = pipe.run(query="Who created the Dothraki vocabulary?", top_k_reader=5) -# prediction = pipe.run(query="Who is the sister of Sansa?", top_k_reader=5) +# prediction = pipe.run(query="Who created the Dothraki vocabulary?", params={"Reader": {"top_k": 5}}) +# prediction = pipe.run(query="Who is the sister of Sansa?", params={"Reader": {"top_k": 5}}) ``` diff --git a/docs/_src/tutorials/tutorials/11.md b/docs/_src/tutorials/tutorials/11.md index ea979dd9c..b51fb46d6 100644 --- a/docs/_src/tutorials/tutorials/11.md +++ b/docs/_src/tutorials/tutorials/11.md @@ -112,7 +112,7 @@ from haystack.reader import FARMReader # Initialize DocumentStore and index documents launch_es() document_store = ElasticsearchDocumentStore() -document_store.delete_all_documents() +document_store.delete_documents() document_store.write_documents(got_dicts) # Initialize Sparse retriever @@ -138,8 +138,7 @@ from haystack.pipeline import ExtractiveQAPipeline p_extractive_premade = ExtractiveQAPipeline(reader=reader, retriever=es_retriever) res = p_extractive_premade.run( query="Who is the father of Arya Stark?", - top_k_retriever=10, - top_k_reader=5 + params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}}, ) print_answers(res, details="minimal") ``` @@ -153,7 +152,7 @@ from haystack.pipeline import DocumentSearchPipeline p_retrieval = DocumentSearchPipeline(es_retriever) res = p_retrieval.run( query="Who is the father of Arya Stark?", - top_k_retriever=10 + params={"Retriever": {"top_k": 10}}, ) print_documents(res, max_text_len=200) ``` @@ -177,7 +176,7 @@ rag_generator = RAGenerator() p_generator = GenerativeQAPipeline(generator=rag_generator, retriever=dpr_retriever) res = p_generator.run( query="Who is the father of Arya Stark?", - top_k_retriever=10 + params={"Retriever": {"top_k": 10}} ) print_answers(res, details="minimal") @@ -223,8 +222,7 @@ p_extractive.add_node(component=reader, name="Reader", inputs=["Retriever"]) # Now we can run it res = p_extractive.run( query="Who is the father of Arya Stark?", - top_k_retriever=10, - top_k_reader=5 + params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}} ) print_answers(res, details="minimal") p_extractive.draw("pipeline_extractive.png") @@ -255,7 +253,7 @@ p_ensemble.draw("pipeline_ensemble.png") # Run pipeline res = p_ensemble.run( query="Who is the father of Arya Stark?", - top_k_retriever=5 #This is top_k per retriever + params={"DPRRetriever": {"top_k": 5}, "ESRetriever": {"top_k": 5}} ) print_answers(res, details="minimal") ``` @@ -266,8 +264,9 @@ Nodes are relatively simple objects and we encourage our users to design their own if they don't see on that fits their use case The only requirements are: -- Add a method run(self, **kwargs) to your class. 
diff --git a/docs/_src/tutorials/tutorials/11.md b/docs/_src/tutorials/tutorials/11.md
index ea979dd9c..b51fb46d6 100644
--- a/docs/_src/tutorials/tutorials/11.md
+++ b/docs/_src/tutorials/tutorials/11.md
@@ -112,7 +112,7 @@ from haystack.reader import FARMReader
 # Initialize DocumentStore and index documents
 launch_es()
 document_store = ElasticsearchDocumentStore()
-document_store.delete_all_documents()
+document_store.delete_documents()
 document_store.write_documents(got_dicts)
 
 # Initialize Sparse retriever
@@ -138,8 +138,7 @@ from haystack.pipeline import ExtractiveQAPipeline
 p_extractive_premade = ExtractiveQAPipeline(reader=reader, retriever=es_retriever)
 res = p_extractive_premade.run(
     query="Who is the father of Arya Stark?",
-    top_k_retriever=10,
-    top_k_reader=5
+    params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}},
 )
 print_answers(res, details="minimal")
 ```
@@ -153,7 +152,7 @@ from haystack.pipeline import DocumentSearchPipeline
 p_retrieval = DocumentSearchPipeline(es_retriever)
 res = p_retrieval.run(
     query="Who is the father of Arya Stark?",
-    top_k_retriever=10
+    params={"Retriever": {"top_k": 10}},
 )
 print_documents(res, max_text_len=200)
 ```
@@ -177,7 +176,7 @@ rag_generator = RAGenerator()
 p_generator = GenerativeQAPipeline(generator=rag_generator, retriever=dpr_retriever)
 res = p_generator.run(
     query="Who is the father of Arya Stark?",
-    top_k_retriever=10
+    params={"Retriever": {"top_k": 10}}
 )
 print_answers(res, details="minimal")
 
@@ -223,8 +222,7 @@ p_extractive.add_node(component=reader, name="Reader", inputs=["Retriever"])
 # Now we can run it
 res = p_extractive.run(
     query="Who is the father of Arya Stark?",
-    top_k_retriever=10,
-    top_k_reader=5
+    params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}}
 )
 print_answers(res, details="minimal")
 p_extractive.draw("pipeline_extractive.png")
@@ -255,7 +253,7 @@ p_ensemble.draw("pipeline_ensemble.png")
 # Run pipeline
 res = p_ensemble.run(
     query="Who is the father of Arya Stark?",
-    top_k_retriever=5 #This is top_k per retriever
+    params={"DPRRetriever": {"top_k": 5}, "ESRetriever": {"top_k": 5}}
 )
 print_answers(res, details="minimal")
 ```
@@ -266,8 +264,9 @@ Nodes are relatively simple objects and we encourage our users to design their
 own if they don't see one that fits their use case.
 The only requirements are:
-- Add a method run(self, **kwargs) to your class. **kwargs will contain the output from the previous node in your graph.
-- Do whatever you want within run() (e.g. reformatting the query)
+- Create a class that inherits `BaseComponent`.
+- Add a method run() to your class. Add the mandatory and optional arguments it needs to process. These arguments must be passed as input to the pipeline, inside `params`, or output by preceding nodes.
+- Add processing logic inside run() (e.g. reformatting the query).
 - Return a tuple that contains your output data (for the next node) and the name of the outgoing edge (by default "output_1" for nodes that have one output)
 - Add a class attribute outgoing_edges = 1 that defines the number of output options from your node. You only need a higher number here if you have a decision node (see below).
 
@@ -276,12 +275,16 @@ Here we have a template for a Node:
 
 ```python
-class NodeTemplate():
+from haystack import BaseComponent
+from typing import Optional
+
+class CustomNode(BaseComponent):
     outgoing_edges = 1
 
-    def run(self, **kwargs):
-        # Insert code here to manipulate the variables in kwarg
-        return (kwargs, "output_1")
+    def run(self, query: str, my_optional_param: Optional[int] = None):
+        # process the inputs
+        output = {"my_output": ...}
+        return output, "output_1"
 ```
 
 ## Decision Nodes
@@ -300,14 +303,14 @@ Below, we define a very naive `QueryClassifier` and show how to use it:
 
 ```python
-class QueryClassifier():
+class QueryClassifier(BaseComponent):
     outgoing_edges = 2
 
-    def run(self, **kwargs):
-        if "?" in kwargs["query"]:
-            return (kwargs, "output_2")
+    def run(self, query: str):
+        if "?" in query:
+            return {}, "output_2"
         else:
-            return (kwargs, "output_1")
+            return {}, "output_1"
 
 # Here we build the pipeline
 p_classifier = Pipeline()
@@ -318,18 +321,12 @@ p_classifier.add_node(component=reader, name="QAReader", inputs=["ESRetriever",
 p_classifier.draw("pipeline_classifier.png")
 
 # Run only the dense retriever on the full sentence query
-res_1 = p_classifier.run(
-    query="Who is the father of Arya Stark?",
-    top_k_retriever=10
-)
+res_1 = p_classifier.run(query="Who is the father of Arya Stark?")
 print("DPR Results" + "\n" + "="*15)
 print_answers(res_1)
 
 # Run only the sparse retriever on a keyword based query
-res_2 = p_classifier.run(
-    query="Arya Stark father",
-    top_k_retriever=10
-)
+res_2 = p_classifier.run(query="Arya Stark father")
 print("ES Results" + "\n" + "="*15)
 print_answers(res_2)
 ```
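To make the new custom-node contract above concrete, here is a sketch of wiring such a node into a pipeline and feeding its optional argument through `params`. It reuses the illustrative names from the template above; the pipeline around it is an assumption, not part of this diff:

```python
from typing import Optional

from haystack import BaseComponent
from haystack.pipeline import Pipeline

class CustomNode(BaseComponent):
    outgoing_edges = 1

    def run(self, query: str, my_optional_param: Optional[int] = None):
        # Mandatory arguments (query) come from the pipeline input or preceding nodes;
        # optional ones arrive via params, keyed by this node's name.
        output = {"query": query.strip(), "my_output": my_optional_param}
        return output, "output_1"

p = Pipeline()
p.add_node(component=CustomNode(), name="CustomNode", inputs=["Query"])
res = p.run(query="Who is the father of Arya Stark?", params={"CustomNode": {"my_optional_param": 10}})
```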
diff --git a/docs/_src/tutorials/tutorials/12.md b/docs/_src/tutorials/tutorials/12.md
index 8cd0366ad..a85cc2e12 100644
--- a/docs/_src/tutorials/tutorials/12.md
+++ b/docs/_src/tutorials/tutorials/12.md
@@ -102,7 +102,7 @@ from haystack.pipeline import DocumentSearchPipeline
 p_retrieval = DocumentSearchPipeline(retriever)
 res = p_retrieval.run(
     query="Tell me something about Arya Stark?",
-    top_k_retriever=5
+    params={"top_k": 5}
 )
 print_documents(res, max_text_len=512)
 
@@ -138,12 +138,15 @@ pipe = GenerativeQAPipeline(generator, retriever)
 
 ```python
-pipe.run(query="Why did Arya Stark's character get portrayed in a television adaptation?", top_k_retriever=1)
+pipe.run(
+    query="Why did Arya Stark's character get portrayed in a television adaptation?",
+    params={"Retriever": {"top_k": 1}}
+)
 ```
 
 
 ```python
-pipe.run(query="What kind of character does Arya Stark play?", top_k_retriever=1)
+pipe.run(query="What kind of character does Arya Stark play?", params={"Retriever": {"top_k": 1}})
 ```
 
 ## About us
diff --git a/docs/_src/tutorials/tutorials/14.md b/docs/_src/tutorials/tutorials/14.md
index 86907c26c..b7dfe82c4 100644
--- a/docs/_src/tutorials/tutorials/14.md
+++ b/docs/_src/tutorials/tutorials/14.md
@@ -118,7 +118,7 @@ got_dicts = convert_files_to_dicts(
 # Initialize DocumentStore and index documents
 launch_es()
 document_store = ElasticsearchDocumentStore()
-document_store.delete_all_documents()
+document_store.delete_documents()
 document_store.write_documents(got_dicts)
 
 # Initialize Sparse retriever
@@ -162,16 +162,14 @@ sklearn_keyword_classifier.draw("pipeline_classifier.png")
 
 # Run only the dense retriever on the full sentence query
 res_1 = sklearn_keyword_classifier.run(
-    query="Who is the father of Arya Stark?",
-    top_k_retriever=10
+    query="Who is the father of Arya Stark?"
 )
 print("DPR Results" + "\n" + "="*15)
 print_answers(res_1)
 
 # Run only the sparse retriever on a keyword based query
 res_2 = sklearn_keyword_classifier.run(
-    query="arya stark father",
-    top_k_retriever=10
+    query="arya stark father"
 )
 print("ES Results" + "\n" + "="*15)
 print_answers(res_2)
@@ -183,16 +181,14 @@ print_answers(res_2)
 
 # Run only the dense retriever on the full sentence query
 res_3 = sklearn_keyword_classifier.run(
-    query="which country was jon snow filmed ?",
-    top_k_retriever=10
+    query="which country was jon snow filmed ?"
 )
 print("DPR Results" + "\n" + "="*15)
 print_answers(res_3)
 
 # Run only the sparse retriever on a keyword based query
 res_4 = sklearn_keyword_classifier.run(
-    query="jon snow country",
-    top_k_retriever=10
+    query="jon snow country"
 )
 print("ES Results" + "\n" + "="*15)
 print_answers(res_4)
@@ -202,16 +198,14 @@ print_answers(res_4)
 
 ```python
 # Run only the dense retriever on the full sentence query
 res_5 = sklearn_keyword_classifier.run(
-    query="who are the younger brothers of arya stark ?",
-    top_k_retriever=10
+    query="who are the younger brothers of arya stark ?"
 )
 print("DPR Results" + "\n" + "="*15)
 print_answers(res_5)
 
 # Run only the sparse retriever on a keyword based query
 res_6 = sklearn_keyword_classifier.run(
-    query="arya stark younger brothers",
-    top_k_retriever=10
+    query="arya stark younger brothers"
 )
 print("ES Results" + "\n" + "="*15)
 print_answers(res_6)
@@ -241,16 +235,14 @@ transformer_keyword_classifier.draw("pipeline_classifier.png")
 
 # Run only the dense retriever on the full sentence query
 res_1 = transformer_keyword_classifier.run(
-    query="Who is the father of Arya Stark?",
-    top_k_retriever=10
+    query="Who is the father of Arya Stark?"
 )
 print("DPR Results" + "\n" + "="*15)
 print_answers(res_1)
 
 # Run only the sparse retriever on a keyword based query
 res_2 = transformer_keyword_classifier.run(
-    query="arya stark father",
-    top_k_retriever=10
+    query="arya stark father"
 )
 print("ES Results" + "\n" + "="*15)
 print_answers(res_2)
@@ -262,16 +254,14 @@ print_answers(res_2)
 
 # Run only the dense retriever on the full sentence query
 res_3 = transformer_keyword_classifier.run(
-    query="which country was jon snow filmed ?",
-    top_k_retriever=10
+    query="which country was jon snow filmed ?"
 )
 print("DPR Results" + "\n" + "="*15)
 print_answers(res_3)
 
 # Run only the sparse retriever on a keyword based query
 res_4 = transformer_keyword_classifier.run(
-    query="jon snow country",
-    top_k_retriever=10
+    query="jon snow country"
 )
 print("ES Results" + "\n" + "="*15)
 print_answers(res_4)
@@ -281,16 +271,14 @@ print_answers(res_4)
 
 ```python
 # Run only the dense retriever on the full sentence query
 res_5 = transformer_keyword_classifier.run(
-    query="who are the younger brothers of arya stark ?",
-    top_k_retriever=10
+    query="who are the younger brothers of arya stark ?"
 )
 print("DPR Results" + "\n" + "="*15)
 print_answers(res_5)
 
 # Run only the sparse retriever on a keyword based query
 res_6 = transformer_keyword_classifier.run(
-    query="arya stark younger brothers",
-    top_k_retriever=10
+    query="arya stark younger brothers"
 )
 print("ES Results" + "\n" + "="*15)
 print_answers(res_6)
@@ -318,16 +306,14 @@ transformer_question_classifier.draw("question_classifier.png")
 
 # Run only the QA reader on the question query
 res_1 = transformer_question_classifier.run(
-    query="Who is the father of Arya Stark?",
-    top_k_retriever=10
+    query="Who is the father of Arya Stark?"
 )
 print("DPR Results" + "\n" + "="*15)
 print_answers(res_1)
 
 # Show only DPR results
 res_2 = transformer_question_classifier.run(
-    query="Arya Stark was the daughter of a Lord.",
-    top_k_retriever=10
+    query="Arya Stark was the daughter of a Lord."
 )
 print("ES Results" + "\n" + "="*15)
 res_2
diff --git a/docs/_src/tutorials/tutorials/3.md b/docs/_src/tutorials/tutorials/3.md
index 56513096b..eaaff5d60 100644
--- a/docs/_src/tutorials/tutorials/3.md
+++ b/docs/_src/tutorials/tutorials/3.md
@@ -169,14 +169,16 @@ pipe = ExtractiveQAPipeline(reader, retriever)
 
 ```python
 # You can configure how many candidates the reader and retriever shall return
-# The higher top_k_retriever, the better (but also the slower) your answers.
-prediction = pipe.run(query="Who is the father of Arya Stark?", top_k_retriever=10, top_k_reader=5)
+# The higher the top_k for the retriever, the better (but also the slower) your answers.
+prediction = pipe.run(
+    query="Who is the father of Arya Stark?", params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}}
+)
 ```
 
 
 ```python
-# prediction = pipe.run(query="Who created the Dothraki vocabulary?", top_k_reader=5)
-# prediction = pipe.run(query="Who is the sister of Sansa?", top_k_reader=5)
+# prediction = pipe.run(query="Who created the Dothraki vocabulary?", params={"Reader": {"top_k": 5}})
+# prediction = pipe.run(query="Who is the sister of Sansa?", params={"Reader": {"top_k": 5}})
 ```
diff --git a/docs/_src/tutorials/tutorials/4.md b/docs/_src/tutorials/tutorials/4.md
index cf806ba92..9d182e91b 100644
--- a/docs/_src/tutorials/tutorials/4.md
+++ b/docs/_src/tutorials/tutorials/4.md
@@ -103,7 +103,7 @@ from haystack.document_store.elasticsearch import ElasticsearchDocumentStore
 document_store = ElasticsearchDocumentStore(host="localhost", username="", password="",
                                             index="document",
                                             embedding_field="question_emb",
-                                            embedding_dim=768,
+                                            embedding_dim=384,
                                             excluded_meta_data=["question_emb"])
 ```
 
@@ -113,7 +113,7 @@ We can use the `EmbeddingRetriever` for this purpose and specify a model that we
 
 ```python
-retriever = EmbeddingRetriever(document_store=document_store, embedding_model="deepset/sentence_bert", use_gpu=True)
+retriever = EmbeddingRetriever(document_store=document_store, embedding_model="sentence-transformers/all-MiniLM-L6-v2", use_gpu=True)
 ```
 
 ### Prepare & Index FAQ data
 
@@ -154,7 +154,7 @@ pipe = FAQPipeline(retriever=retriever)
 
 ```python
-prediction = pipe.run(query="How is the virus spreading?", top_k_retriever=10)
+prediction = pipe.run(query="How is the virus spreading?", params={"Retriever": {"top_k": 10}})
 print_answers(prediction, details="all")
 ```
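On the tutorial 4 hunk above: `embedding_dim` drops from 768 to 384 because the document store's vector size must match the retriever model's output dimension, and `all-MiniLM-L6-v2` produces 384-dimensional embeddings (the older `deepset/sentence_bert` produced 768-dimensional ones). When in doubt, the dimension can be checked directly; this sketch assumes the `sentence-transformers` package is installed:

```python
from sentence_transformers import SentenceTransformer

# Report the embedding model's output dimension before configuring
# ElasticsearchDocumentStore(embedding_dim=...) to match it.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
print(model.get_sentence_embedding_dimension())  # 384
```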
diff --git a/docs/_src/tutorials/tutorials/5.md b/docs/_src/tutorials/tutorials/5.md
index 284c623ef..9fd1e9900 100644
--- a/docs/_src/tutorials/tutorials/5.md
+++ b/docs/_src/tutorials/tutorials/5.md
@@ -219,10 +219,8 @@ results = []
 for l in labels:
     res = p.run(
         query=l.question,
-        top_k_retriever=10,
         labels=l,
-        top_k_reader=10,
-        index=doc_index,
+        params={"index": doc_index, "Retriever": {"top_k": 10}, "Reader": {"top_k": 5}},
     )
     results.append(res)
 ```
diff --git a/docs/_src/tutorials/tutorials/6.md b/docs/_src/tutorials/tutorials/6.md
index 6645255ae..0b3a7738a 100644
--- a/docs/_src/tutorials/tutorials/6.md
+++ b/docs/_src/tutorials/tutorials/6.md
@@ -213,8 +213,10 @@ pipe = ExtractiveQAPipeline(reader, retriever)
 
 ```python
 # You can configure how many candidates the reader and retriever shall return
-# The higher top_k_retriever, the better (but also the slower) your answers.
-prediction = pipe.run(query="Who created the Dothraki vocabulary?", top_k_retriever=10, top_k_reader=5)
+# The higher the top_k for the retriever, the better (but also the slower) your answers.
+prediction = pipe.run(
+    query="Who created the Dothraki vocabulary?", params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}}
+)
 ```
diff --git a/docs/_src/tutorials/tutorials/7.md b/docs/_src/tutorials/tutorials/7.md
index fd7fb4376..936a2b99d 100644
--- a/docs/_src/tutorials/tutorials/7.md
+++ b/docs/_src/tutorials/tutorials/7.md
@@ -128,7 +128,7 @@ The `update_embeddings()` method uses the retriever to create an embedding for e
 
 ```python
 # Delete existing documents in document store
-document_store.delete_all_documents()
+document_store.delete_documents()
 
 # Write documents to document store
 document_store.write_documents(documents)
@@ -195,7 +195,7 @@ from haystack.pipeline import GenerativeQAPipeline
 
 pipe = GenerativeQAPipeline(generator=generator, retriever=retriever)
 for question in QUESTIONS:
-    res = pipe.run(query=question, top_k_generator=1, top_k_retriever=5)
+    res = pipe.run(query=question, params={"Generator": {"top_k": 1}, "Retriever": {"top_k": 5}})
     print(res)
 ```
diff --git a/haystack/connector/crawler.py b/haystack/connector/crawler.py
index 342bbca56..adaf3533c 100644
--- a/haystack/connector/crawler.py
+++ b/haystack/connector/crawler.py
@@ -33,11 +33,11 @@ class Crawler(BaseComponent):
 
     :param output_dir: Path for the directory to store files
     :param urls: List of http(s) address(es) (can also be supplied later when calling crawl())
-    :param crawler_depth: How many sublinks to follow from the initial list of URLs. Current options:
-        0: Only initial list of urls
-        1: Follow links found on the initial URLs (but no further)
-    :param filter_urls: Optional list of regular expressions that the crawled URLs must comply with.
-        All URLs not matching at least one of the regular expressions will be dropped.
+    :param crawler_depth: How many sublinks to follow from the initial list of URLs. Current options:
+                          0: Only initial list of urls
+                          1: Follow links found on the initial URLs (but no further)
+    :param filter_urls: Optional list of regular expressions that the crawled URLs must comply with.
+                        All URLs not matching at least one of the regular expressions will be dropped.
     :param overwrite_existing_files: Whether to overwrite existing files in output_dir with new content
     """
     IN_COLAB = "google.colab" in sys.modules