From 81f82b1b95e9808b7f48c4384e43d35829664d79 Mon Sep 17 00:00:00 2001
From: Branden Chan <33759007+brandenchan@users.noreply.github.com>
Date: Thu, 11 Nov 2021 12:44:29 +0100
Subject: [PATCH] Update API Reference Pages for v1.0 (#1729)
* Create new API pages and update existing ones
* Create query classifier page
* Remove Objects suffix
---
docs/_src/api/api/crawler.md | 116 ++++++---
docs/_src/api/api/document_store.md | 22 +-
docs/_src/api/api/extractor.md | 53 ++++
docs/_src/api/api/file_converter.md | 2 +-
docs/_src/api/api/generate_docstrings.sh | 2 +
docs/_src/api/api/generator.md | 6 +-
docs/_src/api/api/other.md | 47 ++++
docs/_src/api/api/preprocessor.md | 4 +-
docs/_src/api/api/primitives.md | 232 ++++++++++++++++++
.../api/pydoc-markdown-answer-generator.yml | 2 +-
docs/_src/api/api/pydoc-markdown-crawler.yml | 2 +-
.../pydoc-markdown-document-classifier.yml | 2 +-
.../api/api/pydoc-markdown-document-store.yml | 2 +-
.../api/api/pydoc-markdown-evaluation.yml | 2 +-
.../_src/api/api/pydoc-markdown-extractor.yml | 4 +-
.../api/pydoc-markdown-file-classifier.yml | 2 +-
.../api/pydoc-markdown-file-converters.yml | 2 +-
docs/_src/api/api/pydoc-markdown-other.yml | 4 +-
.../_src/api/api/pydoc-markdown-pipelines.yml | 2 +-
.../api/api/pydoc-markdown-preprocessor.yml | 2 +-
.../api/api/pydoc-markdown-primitives.yml | 18 ++
.../api/pydoc-markdown-query-classifier.yml | 18 ++
.../api/pydoc-markdown-question-generator.yml | 2 +-
docs/_src/api/api/pydoc-markdown-ranker.yml | 2 +-
docs/_src/api/api/pydoc-markdown-reader.yml | 2 +-
.../_src/api/api/pydoc-markdown-retriever.yml | 2 +-
.../api/api/pydoc-markdown-summarizer.yml | 2 +-
.../api/api/pydoc-markdown-translator.yml | 2 +-
docs/_src/api/api/query_classifier.md | 143 +++++++++++
docs/_src/api/api/reader.md | 8 +-
docs/_src/api/api/retriever.md | 18 +-
docs/_src/api/api/summarizer.md | 4 +-
docs/_src/api/api/translator.md | 4 +-
33 files changed, 645 insertions(+), 90 deletions(-)
create mode 100644 docs/_src/api/api/extractor.md
create mode 100644 docs/_src/api/api/other.md
create mode 100644 docs/_src/api/api/primitives.md
create mode 100644 docs/_src/api/api/pydoc-markdown-primitives.yml
create mode 100644 docs/_src/api/api/pydoc-markdown-query-classifier.yml
create mode 100644 docs/_src/api/api/query_classifier.md
diff --git a/docs/_src/api/api/crawler.md b/docs/_src/api/api/crawler.md
index 66f3452d3..c3d0db4a9 100644
--- a/docs/_src/api/api/crawler.md
+++ b/docs/_src/api/api/crawler.md
@@ -1,53 +1,95 @@
-
-# Module entity
+
+# Module crawler
-
-## EntityExtractor Objects
+
+## Crawler
```python
-class EntityExtractor(BaseComponent)
+class Crawler(BaseComponent)
```
-This node is used to extract entities out of documents.
-The most common use case for this would be as a named entity extractor.
-The default model used is dslim/bert-base-NER.
-This node can be placed in a querying pipeline to perform entity extraction on retrieved documents only,
-or it can be placed in an indexing pipeline so that all documents in the document store have extracted entities.
-The entities extracted by this Node will populate Document.entities
+Crawl texts from a website so that we can use them later in Haystack as a corpus for search / question answering etc.
-
+**Example:**
+```python
+| from haystack.nodes.connector import Crawler
+|
+| crawler = Crawler(output_dir="crawled_files")
+| # crawl Haystack docs, i.e. all pages that include haystack.deepset.ai/overview/
+| docs = crawler.crawl(urls=["https://haystack.deepset.ai/overview/get-started"],
+| filter_urls= ["haystack\.deepset\.ai\/overview\/"])
+```
+
+
+#### \_\_init\_\_
+
+```python
+ | __init__(output_dir: str, urls: Optional[List[str]] = None, crawler_depth: int = 1, filter_urls: Optional[List] = None, overwrite_existing_files=True)
+```
+
+Init object with basic params for crawling (can be overwritten later).
+
+**Arguments**:
+
+- `output_dir`: Path for the directory to store files
+- `urls`: List of http(s) address(es) (can also be supplied later when calling crawl())
+- `crawler_depth`: How many sublinks to follow from the initial list of URLs. Current options:
+ 0: Only initial list of urls
+ 1: Follow links found on the initial URLs (but no further)
+- `filter_urls`: Optional list of regular expressions that the crawled URLs must comply with.
+ All URLs not matching at least one of the regular expressions will be dropped.
+- `overwrite_existing_files`: Whether to overwrite existing files in output_dir with new content
+
+
+#### crawl
+
+```python
+ | crawl(output_dir: Union[str, Path, None] = None, urls: Optional[List[str]] = None, crawler_depth: Optional[int] = None, filter_urls: Optional[List] = None, overwrite_existing_files: Optional[bool] = None) -> List[Path]
+```
+
+Crawl URL(s), extract the text from the HTML, create a Haystack Document object out of it and save it (one JSON
+file per URL, including text and basic meta data).
+You can optionally specify via `filter_urls` to only crawl URLs that match a certain pattern.
+All parameters are optional here and only meant to overwrite instance attributes at runtime.
+If no parameters are provided to this method, the instance attributes that were passed during __init__ will be used.
+
+**Arguments**:
+
+- `output_dir`: Path for the directory to store files
+- `urls`: List of http addresses or single http address
+- `crawler_depth`: How many sublinks to follow from the initial list of URLs. Current options:
+ 0: Only initial list of urls
+ 1: Follow links found on the initial URLs (but no further)
+- `filter_urls`: Optional list of regular expressions that the crawled URLs must comply with.
+ All URLs not matching at least one of the regular expressions will be dropped.
+- `overwrite_existing_files`: Whether to overwrite existing files in output_dir with new content
+
+**Returns**:
+
+List of paths where the crawled webpages got stored
+
+
#### run
```python
- | run(documents: Optional[Union[List[Document], List[dict]]] = None) -> Tuple[Dict, str]
+ | run(output_dir: Union[str, Path, None] = None, urls: Optional[List[str]] = None, crawler_depth: Optional[int] = None, filter_urls: Optional[List] = None, overwrite_existing_files: Optional[bool] = None, return_documents: Optional[bool] = False) -> Tuple[Dict, str]
```
-This is the method called when this node is used in a pipeline
+Method to be executed when the Crawler is used as a Node within a Haystack pipeline.
-
-#### extract
+**Arguments**:
-```python
- | extract(text)
-```
+- `output_dir`: Path for the directory to store files
+- `urls`: List of http addresses or single http address
+- `crawler_depth`: How many sublinks to follow from the initial list of URLs. Current options:
+ 0: Only initial list of urls
+ 1: Follow links found on the initial URLs (but no further)
+- `filter_urls`: Optional list of regular expressions that the crawled URLs must comply with.
+ All URLs not matching at least one of the regular expressions will be dropped.
+- `overwrite_existing_files`: Whether to overwrite existing files in output_dir with new content
+- `return_documents`: Return json files content
-This function can be called to perform entity extraction when using the node in isolation.
+**Returns**:
-
-#### simplify\_ner\_for\_qa
-
-```python
-simplify_ner_for_qa(output)
-```
-
-Returns a simplified version of the output dictionary
-with the following structure:
-[
- {
- answer: { ... }
- entities: [ { ... }, {} ]
- }
-]
-The entities included are only the ones that overlap with
-the answer itself.
+Tuple({"paths": List of filepaths, ...}, Name of output edge)
diff --git a/docs/_src/api/api/document_store.md b/docs/_src/api/api/document_store.md
index 53fb9b3fc..3db306c5b 100644
--- a/docs/_src/api/api/document_store.md
+++ b/docs/_src/api/api/document_store.md
@@ -2,7 +2,7 @@
# Module base
-## BaseKnowledgeGraph Objects
+## BaseKnowledgeGraph
```python
class BaseKnowledgeGraph(BaseComponent)
@@ -11,7 +11,7 @@ class BaseKnowledgeGraph(BaseComponent)
Base class for implementing Knowledge Graphs.
-## BaseDocumentStore Objects
+## BaseDocumentStore
```python
class BaseDocumentStore(BaseComponent)
@@ -150,7 +150,7 @@ Batch elements of an iterable into fixed-length chunks or blocks.
# Module elasticsearch
-## ElasticsearchDocumentStore Objects
+## ElasticsearchDocumentStore
```python
class ElasticsearchDocumentStore(BaseDocumentStore)
@@ -530,7 +530,7 @@ Delete labels in an index. All labels are deleted if no filters are passed.
None
-## OpenSearchDocumentStore Objects
+## OpenSearchDocumentStore
```python
class OpenSearchDocumentStore(ElasticsearchDocumentStore)
@@ -564,7 +564,7 @@ Find the document that is most similar to the provided `query_emb` by using a ve
-## OpenDistroElasticsearchDocumentStore Objects
+## OpenDistroElasticsearchDocumentStore
```python
class OpenDistroElasticsearchDocumentStore(OpenSearchDocumentStore)
@@ -576,7 +576,7 @@ A DocumentStore which has an Open Distro for Elasticsearch service behind it.
# Module memory
-## InMemoryDocumentStore Objects
+## InMemoryDocumentStore
```python
class InMemoryDocumentStore(BaseDocumentStore)
@@ -857,7 +857,7 @@ None
# Module sql
-## SQLDocumentStore Objects
+## SQLDocumentStore
```python
class SQLDocumentStore(BaseDocumentStore)
@@ -1099,7 +1099,7 @@ None
# Module faiss
-## FAISSDocumentStore Objects
+## FAISSDocumentStore
```python
class FAISSDocumentStore(SQLDocumentStore)
@@ -1368,7 +1368,7 @@ Note: In order to have a correct mapping from FAISS to SQL,
# Module milvus
-## MilvusDocumentStore Objects
+## MilvusDocumentStore
```python
class MilvusDocumentStore(SQLDocumentStore)
@@ -1660,7 +1660,7 @@ Return the count of embeddings in the document store.
# Module weaviate
-## WeaviateDocumentStore Objects
+## WeaviateDocumentStore
```python
class WeaviateDocumentStore(BaseDocumentStore)
@@ -1947,7 +1947,7 @@ None
# Module graphdb
-## GraphDBKnowledgeGraph Objects
+## GraphDBKnowledgeGraph
```python
class GraphDBKnowledgeGraph(BaseKnowledgeGraph)
diff --git a/docs/_src/api/api/extractor.md b/docs/_src/api/api/extractor.md
new file mode 100644
index 000000000..5e443c6d6
--- /dev/null
+++ b/docs/_src/api/api/extractor.md
@@ -0,0 +1,53 @@
+
+# Module entity
+
+
+## EntityExtractor
+
+```python
+class EntityExtractor(BaseComponent)
+```
+
+This node is used to extract entities out of documents.
+The most common use case for this would be as a named entity extractor.
+The default model used is dslim/bert-base-NER.
+This node can be placed in a querying pipeline to perform entity extraction on retrieved documents only,
+or it can be placed in an indexing pipeline so that all documents in the document store have extracted entities.
+The entities extracted by this Node will populate Document.entities
+
+
+#### run
+
+```python
+ | run(documents: Optional[Union[List[Document], List[dict]]] = None) -> Tuple[Dict, str]
+```
+
+This is the method called when this node is used in a pipeline
+
+
+#### extract
+
+```python
+ | extract(text)
+```
+
+This function can be called to perform entity extraction when using the node in isolation.
+
+
+#### simplify\_ner\_for\_qa
+
+```python
+simplify_ner_for_qa(output)
+```
+
+Returns a simplified version of the output dictionary
+with the following structure:
+[
+ {
+ answer: { ... }
+ entities: [ { ... }, {} ]
+ }
+]
+The entities included are only the ones that overlap with
+the answer itself.
+
diff --git a/docs/_src/api/api/file_converter.md b/docs/_src/api/api/file_converter.md
index a70d31c3b..5df5b8ed7 100644
--- a/docs/_src/api/api/file_converter.md
+++ b/docs/_src/api/api/file_converter.md
@@ -2,7 +2,7 @@
# Module file\_type
-## FileTypeClassifier Objects
+## FileTypeClassifier
```python
class FileTypeClassifier(BaseComponent)
diff --git a/docs/_src/api/api/generate_docstrings.sh b/docs/_src/api/api/generate_docstrings.sh
index 54b2ec0c2..de013a7ea 100755
--- a/docs/_src/api/api/generate_docstrings.sh
+++ b/docs/_src/api/api/generate_docstrings.sh
@@ -2,6 +2,7 @@
# Purpose : Automate the generation of docstrings
+pydoc-markdown pydoc-markdown-primitives.yml
pydoc-markdown pydoc-markdown-document-store.yml
pydoc-markdown pydoc-markdown-file-converters.yml
pydoc-markdown pydoc-markdown-file-classifier.yml
@@ -18,5 +19,6 @@ pydoc-markdown pydoc-markdown-pipelines.yml
pydoc-markdown pydoc-markdown-evaluation.yml
pydoc-markdown pydoc-markdown-ranker.yml
pydoc-markdown pydoc-markdown-question-generator.yml
+pydoc-markdown pydoc-markdown-query-classifier.yml
pydoc-markdown pydoc-markdown-document-classifier.yml
diff --git a/docs/_src/api/api/generator.md b/docs/_src/api/api/generator.md
index 64b1924b5..f19eec743 100644
--- a/docs/_src/api/api/generator.md
+++ b/docs/_src/api/api/generator.md
@@ -2,7 +2,7 @@
# Module base
-## BaseGenerator Objects
+## BaseGenerator
```python
class BaseGenerator(BaseComponent)
@@ -34,7 +34,7 @@ Generated answers plus additional infos in a dict
# Module transformers
-## RAGenerator Objects
+## RAGenerator
```python
class RAGenerator(BaseGenerator)
@@ -140,7 +140,7 @@ Generated answers plus additional infos in a dict like this:
```
-## Seq2SeqGenerator Objects
+## Seq2SeqGenerator
```python
class Seq2SeqGenerator(BaseGenerator)
diff --git a/docs/_src/api/api/other.md b/docs/_src/api/api/other.md
new file mode 100644
index 000000000..a55694762
--- /dev/null
+++ b/docs/_src/api/api/other.md
@@ -0,0 +1,47 @@
+
+# Module docs2answers
+
+
+## Docs2Answers
+
+```python
+class Docs2Answers(BaseComponent)
+```
+
+This Node is used to convert retrieved documents into predicted answers format.
+It is useful for situations where you are calling a Retriever only pipeline via REST API.
+This ensures that your output is in a compatible format.
+
+
+# Module join\_docs
+
+
+## JoinDocuments
+
+```python
+class JoinDocuments(BaseComponent)
+```
+
+A node to join documents outputted by multiple retriever nodes.
+
+The node allows multiple join modes:
+* concatenate: combine the documents from multiple nodes. Any duplicate documents are discarded.
+* merge: merge scores of documents from multiple nodes. Optionally, each input score can be given a different
+ `weight` & a `top_k` limit can be set. This mode can also be used for "reranking" retrieved documents.
+
+
+#### \_\_init\_\_
+
+```python
+ | __init__(join_mode: str = "concatenate", weights: Optional[List[float]] = None, top_k_join: Optional[int] = None)
+```
+
+**Arguments**:
+
+- `join_mode`: `concatenate` to combine documents from multiple retrievers or `merge` to aggregate scores of
+ individual documents.
+- `weights`: A node-wise list(length of list must be equal to the number of input nodes) of weights for
+ adjusting document scores when using the `merge` join_mode. By default, equal weight is given
+ to each retriever score. This param is not compatible with the `concatenate` join_mode.
+- `top_k_join`: Limit documents to top_k based on the resulting scores of the join.
+
diff --git a/docs/_src/api/api/preprocessor.md b/docs/_src/api/api/preprocessor.md
index 2ac35ef7f..30c17182b 100644
--- a/docs/_src/api/api/preprocessor.md
+++ b/docs/_src/api/api/preprocessor.md
@@ -2,7 +2,7 @@
# Module base
-## BasePreProcessor Objects
+## BasePreProcessor
```python
class BasePreProcessor(BaseComponent)
@@ -21,7 +21,7 @@ Perform document cleaning and splitting. Takes a single document as input and re
# Module preprocessor
-## PreProcessor Objects
+## PreProcessor
```python
class PreProcessor(BasePreProcessor)
diff --git a/docs/_src/api/api/primitives.md b/docs/_src/api/api/primitives.md
new file mode 100644
index 000000000..294d75afa
--- /dev/null
+++ b/docs/_src/api/api/primitives.md
@@ -0,0 +1,232 @@
+
+# Module schema
+
+
+## Document
+
+```python
+@dataclass
+class Document()
+```
+
+
+#### \_\_init\_\_
+
+```python
+ | __init__(content: Union[str, pd.DataFrame], content_type: Literal["text", "table", "image"] = "text", id: Optional[str] = None, score: Optional[float] = None, meta: Dict[str, Any] = None, embedding: Optional[np.ndarray] = None, id_hash_keys: Optional[List[str]] = None)
+```
+
+One of the core data classes in Haystack. It's used to represent documents / passages in a standardized way within Haystack.
+Documents are stored in DocumentStores, are returned by Retrievers, are the input for Readers and are used in
+many other places that manipulate or interact with document-level data.
+
+Note: There can be multiple Documents originating from one file (e.g. PDF), if you split the text
+into smaller passages. We'll have one Document per passage in this case.
+
+Each document has a unique ID. This can be supplied by the user or generated automatically.
+It's particularly helpful for handling of duplicates and referencing documents in other objects (e.g. Labels)
+
+There's an easy option to convert from/to dicts via `from_dict()` and `to_dict`.
+
+**Arguments**:
+
+- `content`: Content of the document. For most cases, this will be text, but it can be a table or image.
+- `content_type`: One of "text", "table" or "image". Haystack components can use this to adjust their
+ handling of Documents and check compatibility.
+- `id`: Unique ID for the document. If not supplied by the user, we'll generate one automatically by
+ creating a hash from the supplied text. This behaviour can be further adjusted by `id_hash_keys`.
+- `score`: The relevance score of the Document determined by a model (e.g. Retriever or Re-Ranker).
+ In the range of [0,1], where 1 means extremely relevant.
+- `meta`: Meta fields for a document like name, url, or author in the form of a custom dict (any keys and values allowed).
+- `embedding`: Vector encoding of the text
+- `id_hash_keys`: Generate the document id from a custom list of strings.
+    If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
+    not unique, you can provide custom strings here that will be used (e.g. ["filename_xy", "text_of_doc"]).
+
+
+#### to\_dict
+
+```python
+ | to_dict(field_map={}) -> Dict
+```
+
+Convert Document to dict. An optional field_map can be supplied to change the names of the keys in the
+resulting dict. This way you can work with standardized Document objects in Haystack, but adjust the format that
+they are serialized / stored in other places (e.g. elasticsearch)
+Example:
+| doc = Document(content="some text", content_type="text")
+| doc.to_dict(field_map={"custom_content_field": "content"})
+| >>> {"custom_content_field": "some text", "content_type": "text"}
+
+**Arguments**:
+
+- `field_map`: Dict with keys being the custom target keys and values being the standard Document attributes
+
+**Returns**:
+
+dict with content of the Document
+
+
+#### from\_dict
+
+```python
+ | @classmethod
+ | from_dict(cls, dict, field_map={})
+```
+
+Create Document from dict. An optional field_map can be supplied to adjust for custom names of the keys in the
+input dict. This way you can work with standardized Document objects in Haystack, but adjust the format that
+they are serialized / stored in other places (e.g. elasticsearch)
+Example:
+| my_dict = {"custom_content_field": "some text", "content_type": "text"}
+| Document.from_dict(my_dict, field_map={"custom_content_field": "content"})
+
+**Arguments**:
+
+- `field_map`: Dict with keys being the custom target keys and values being the standard Document attributes
+
+**Returns**:
+
+dict with content of the Document
+
+
+#### \_\_lt\_\_
+
+```python
+ | __lt__(other)
+```
+
+Enable sorting of Documents by score
+
+
+## Span
+
+```python
+@dataclass
+class Span()
+```
+
+
+#### end
+
+Defining a sequence of characters (Text span) or cells (Table span) via start and end index.
+For extractive QA: Character where answer starts/ends
+For TableQA: Cell where the answer starts/ends (counted from top left to bottom right of table)
+
+**Arguments**:
+
+- `start`: Position where the span starts
+- `end`: Position where the span ends
+
+
+## Answer
+
+```python
+@dataclass
+class Answer()
+```
+
+
+#### meta
+
+The fundamental object in Haystack to represent any type of Answers (e.g. extractive QA, generative QA or TableQA).
+For example, it's used within some Nodes like the Reader, but also in the REST API.
+
+**Arguments**:
+
+- `answer`: The answer string. If there's no possible answer (aka "no_answer" or "is_impossible") this will be an empty string.
+- `type`: One of ("generative", "extractive", "other"): Whether this answer comes from an extractive model
+ (i.e. we can locate an exact answer string in one of the documents) or from a generative model
+ (i.e. no pointer to a specific document, no offsets ...).
+- `score`: The relevance score of the Answer determined by a model (e.g. Reader or Generator).
+ In the range of [0,1], where 1 means extremely relevant.
+- `context`: The related content that was used to create the answer (i.e. a text passage, part of a table, image ...)
+- `offsets_in_document`: List of `Span` objects with start and end positions of the answer **in the
+ document** (as stored in the document store).
+    For extractive QA: Character where answer starts => `Answer.offsets_in_document[0].start`
+    For TableQA: Cell where the answer starts (counted from top left to bottom right of table) => `Answer.offsets_in_document[0].start`
+ (Note that in TableQA there can be multiple cell ranges that are relevant for the answer, thus there can be multiple `Spans` here)
+- `offsets_in_context`: List of `Span` objects with start and end positions of the answer **in the
+ context** (i.e. the surrounding text/table of a certain window size).
+    For extractive QA: Character where answer starts => `Answer.offsets_in_context[0].start`
+    For TableQA: Cell where the answer starts (counted from top left to bottom right of table) => `Answer.offsets_in_context[0].start`
+ (Note that in TableQA there can be multiple cell ranges that are relevant for the answer, thus there can be multiple `Spans` here)
+- `document_id`: ID of the document that the answer was located in (if any)
+- `meta`: Dict that can be used to associate any kind of custom meta data with the answer.
+ In extractive QA, this will carry the meta data of the document where the answer was found.
+
+
+#### \_\_lt\_\_
+
+```python
+ | __lt__(other)
+```
+
+Enable sorting of Answers by score
+
+
+## Label
+
+```python
+@dataclass
+class Label()
+```
+
+
+#### \_\_init\_\_
+
+```python
+ | __init__(query: str, document: Document, is_correct_answer: bool, is_correct_document: bool, origin: Literal["user-feedback", "gold-label"], answer: Optional[Answer], id: Optional[str] = None, no_answer: Optional[bool] = None, pipeline_id: Optional[str] = None, created_at: Optional[str] = None, updated_at: Optional[str] = None, meta: Optional[dict] = None)
+```
+
+Object used to represent label/feedback in a standardized way within Haystack.
+This includes labels from dataset like SQuAD, annotations from labeling tools,
+or user feedback from the Haystack REST API.
+
+**Arguments**:
+
+- `query`: the question (or query) for finding answers.
+- `document`:
+- `answer`: the answer object.
+- `is_correct_answer`: whether the sample is positive or negative.
+- `is_correct_document`: in case of negative sample(is_correct_answer is False), there could be two cases;
+ incorrect answer but correct document & incorrect document. This flag denotes if
+ the returned document was correct.
+- `origin`: the source for the labels. It can be used to later for filtering.
+- `id`: Unique ID used within the DocumentStore. If not supplied, a uuid will be generated automatically.
+- `no_answer`: whether the question is unanswerable.
+- `pipeline_id`: pipeline identifier (any str) that was involved for generating this label (in-case of user feedback).
+- `created_at`: Timestamp of creation with format yyyy-MM-dd HH:mm:ss.
+ Generate in Python via time.strftime("%Y-%m-%d %H:%M:%S").
+- `updated_at`: Timestamp of update with format yyyy-MM-dd HH:mm:ss.
+ Generate in Python via time.strftime("%Y-%m-%d %H:%M:%S")
+- `meta`: Meta fields like "annotator_name" in the form of a custom dict (any keys and values allowed).
+
+
+## MultiLabel
+
+```python
+@dataclass
+class MultiLabel()
+```
+
+
+#### \_\_init\_\_
+
+```python
+ | __init__(labels: List[Label], drop_negative_labels=False, drop_no_answers=False)
+```
+
+There are often multiple `Labels` associated with a single query. For example, there can be multiple annotated
+answers for one question or multiple documents contain the information you want for a query.
+This class is "syntactic sugar" that simplifies the work with such a list of related Labels.
+It stores the original labels in MultiLabel.labels and provides additional aggregated attributes that are
+automatically created at init time. For example, MultiLabel.no_answer allows you to easily access if any of the
+underlying Labels provided a text answer and therefore demonstrates that there is indeed a possible answer.
+
+**Arguments**:
+
+- `labels`: A list of labels that belong to a similar query and shall be "grouped" together
+- `drop_negative_labels`: Whether to drop negative labels from that group (e.g. thumbs down feedback from UI)
+- `drop_no_answers`: Whether to drop labels that specify the answer is impossible
+
diff --git a/docs/_src/api/api/pydoc-markdown-answer-generator.yml b/docs/_src/api/api/pydoc-markdown-answer-generator.yml
index 623dbd309..a43a0238c 100644
--- a/docs/_src/api/api/pydoc-markdown-answer-generator.yml
+++ b/docs/_src/api/api/pydoc-markdown-answer-generator.yml
@@ -11,7 +11,7 @@ processor:
- skip_empty_modules: true
renderer:
type: markdown
- descriptive_class_title: true
+ descriptive_class_title: false
descriptive_module_title: true
add_method_class_prefix: false
add_member_class_prefix: false
diff --git a/docs/_src/api/api/pydoc-markdown-crawler.yml b/docs/_src/api/api/pydoc-markdown-crawler.yml
index 584010c7e..02c3be3ae 100644
--- a/docs/_src/api/api/pydoc-markdown-crawler.yml
+++ b/docs/_src/api/api/pydoc-markdown-crawler.yml
@@ -11,7 +11,7 @@ processor:
- skip_empty_modules: true
renderer:
type: markdown
- descriptive_class_title: true
+ descriptive_class_title: false
descriptive_module_title: true
add_method_class_prefix: false
add_member_class_prefix: false
diff --git a/docs/_src/api/api/pydoc-markdown-document-classifier.yml b/docs/_src/api/api/pydoc-markdown-document-classifier.yml
index b808a0f68..33f880eeb 100644
--- a/docs/_src/api/api/pydoc-markdown-document-classifier.yml
+++ b/docs/_src/api/api/pydoc-markdown-document-classifier.yml
@@ -11,7 +11,7 @@ processor:
- skip_empty_modules: true
renderer:
type: markdown
- descriptive_class_title: true
+ descriptive_class_title: false
descriptive_module_title: true
add_method_class_prefix: false
add_member_class_prefix: false
diff --git a/docs/_src/api/api/pydoc-markdown-document-store.yml b/docs/_src/api/api/pydoc-markdown-document-store.yml
index 568e6d8a8..b15d81cb6 100644
--- a/docs/_src/api/api/pydoc-markdown-document-store.yml
+++ b/docs/_src/api/api/pydoc-markdown-document-store.yml
@@ -11,7 +11,7 @@ processor:
- skip_empty_modules: true
renderer:
type: markdown
- descriptive_class_title: true
+ descriptive_class_title: false
descriptive_module_title: true
add_method_class_prefix: false
add_member_class_prefix: false
diff --git a/docs/_src/api/api/pydoc-markdown-evaluation.yml b/docs/_src/api/api/pydoc-markdown-evaluation.yml
index 227027338..0b19d3cd4 100644
--- a/docs/_src/api/api/pydoc-markdown-evaluation.yml
+++ b/docs/_src/api/api/pydoc-markdown-evaluation.yml
@@ -11,7 +11,7 @@ processor:
- skip_empty_modules: true
renderer:
type: markdown
- descriptive_class_title: true
+ descriptive_class_title: false
descriptive_module_title: true
add_method_class_prefix: false
add_member_class_prefix: false
diff --git a/docs/_src/api/api/pydoc-markdown-extractor.yml b/docs/_src/api/api/pydoc-markdown-extractor.yml
index 7730a5ebe..7f563ae19 100644
--- a/docs/_src/api/api/pydoc-markdown-extractor.yml
+++ b/docs/_src/api/api/pydoc-markdown-extractor.yml
@@ -11,8 +11,8 @@ processor:
- skip_empty_modules: true
renderer:
type: markdown
- descriptive_class_title: true
+ descriptive_class_title: false
descriptive_module_title: true
add_method_class_prefix: false
add_member_class_prefix: false
- filename: crawler.md
+ filename: extractor.md
diff --git a/docs/_src/api/api/pydoc-markdown-file-classifier.yml b/docs/_src/api/api/pydoc-markdown-file-classifier.yml
index f1012c9ea..3f3a6278c 100644
--- a/docs/_src/api/api/pydoc-markdown-file-classifier.yml
+++ b/docs/_src/api/api/pydoc-markdown-file-classifier.yml
@@ -11,7 +11,7 @@ processor:
- skip_empty_modules: true
renderer:
type: markdown
- descriptive_class_title: true
+ descriptive_class_title: false
descriptive_module_title: true
add_method_class_prefix: false
add_member_class_prefix: false
diff --git a/docs/_src/api/api/pydoc-markdown-file-converters.yml b/docs/_src/api/api/pydoc-markdown-file-converters.yml
index ab3f36c27..831c23536 100644
--- a/docs/_src/api/api/pydoc-markdown-file-converters.yml
+++ b/docs/_src/api/api/pydoc-markdown-file-converters.yml
@@ -11,7 +11,7 @@ processor:
- skip_empty_modules: true
renderer:
type: markdown
- descriptive_class_title: true
+ descriptive_class_title: false
descriptive_module_title: true
add_method_class_prefix: false
add_member_class_prefix: false
diff --git a/docs/_src/api/api/pydoc-markdown-other.yml b/docs/_src/api/api/pydoc-markdown-other.yml
index 00c094806..a2065a708 100644
--- a/docs/_src/api/api/pydoc-markdown-other.yml
+++ b/docs/_src/api/api/pydoc-markdown-other.yml
@@ -11,8 +11,8 @@ processor:
- skip_empty_modules: true
renderer:
type: markdown
- descriptive_class_title: true
+ descriptive_class_title: false
descriptive_module_title: true
add_method_class_prefix: false
add_member_class_prefix: false
- filename: ranker.md
+ filename: other.md
diff --git a/docs/_src/api/api/pydoc-markdown-pipelines.yml b/docs/_src/api/api/pydoc-markdown-pipelines.yml
index ace599894..aa8272680 100644
--- a/docs/_src/api/api/pydoc-markdown-pipelines.yml
+++ b/docs/_src/api/api/pydoc-markdown-pipelines.yml
@@ -11,7 +11,7 @@ processor:
- skip_empty_modules: true
renderer:
type: markdown
- descriptive_class_title: true
+ descriptive_class_title: false
descriptive_module_title: true
add_method_class_prefix: false
add_member_class_prefix: false
diff --git a/docs/_src/api/api/pydoc-markdown-preprocessor.yml b/docs/_src/api/api/pydoc-markdown-preprocessor.yml
index 2d698004b..677bbcbc2 100644
--- a/docs/_src/api/api/pydoc-markdown-preprocessor.yml
+++ b/docs/_src/api/api/pydoc-markdown-preprocessor.yml
@@ -11,7 +11,7 @@ processor:
- skip_empty_modules: true
renderer:
type: markdown
- descriptive_class_title: true
+ descriptive_class_title: false
descriptive_module_title: true
add_method_class_prefix: false
add_member_class_prefix: false
diff --git a/docs/_src/api/api/pydoc-markdown-primitives.yml b/docs/_src/api/api/pydoc-markdown-primitives.yml
new file mode 100644
index 000000000..87a02fab3
--- /dev/null
+++ b/docs/_src/api/api/pydoc-markdown-primitives.yml
@@ -0,0 +1,18 @@
+loaders:
+ - type: python
+ search_path: [../../../../haystack/]
+ modules: ['schema']
+ ignore_when_discovered: ['__init__']
+processor:
+ - type: filter
+ expression: not name.startswith('_') and default()
+ - documented_only: true
+ - do_not_filter_modules: false
+ - skip_empty_modules: true
+renderer:
+ type: markdown
+ descriptive_class_title: false
+ descriptive_module_title: true
+ add_method_class_prefix: false
+ add_member_class_prefix: false
+ filename: primitives.md
diff --git a/docs/_src/api/api/pydoc-markdown-query-classifier.yml b/docs/_src/api/api/pydoc-markdown-query-classifier.yml
new file mode 100644
index 000000000..85a761dd1
--- /dev/null
+++ b/docs/_src/api/api/pydoc-markdown-query-classifier.yml
@@ -0,0 +1,18 @@
+loaders:
+ - type: python
+ search_path: [../../../../haystack/nodes/query_classifier]
+ modules: ['base', 'sklearn', 'transformers']
+ ignore_when_discovered: ['__init__']
+processor:
+ - type: filter
+ expression: not name.startswith('_') and default()
+ - documented_only: true
+ - do_not_filter_modules: false
+ - skip_empty_modules: true
+renderer:
+ type: markdown
+ descriptive_class_title: false
+ descriptive_module_title: true
+ add_method_class_prefix: false
+ add_member_class_prefix: false
+ filename: query_classifier.md
diff --git a/docs/_src/api/api/pydoc-markdown-question-generator.yml b/docs/_src/api/api/pydoc-markdown-question-generator.yml
index b09453450..671313a03 100644
--- a/docs/_src/api/api/pydoc-markdown-question-generator.yml
+++ b/docs/_src/api/api/pydoc-markdown-question-generator.yml
@@ -11,7 +11,7 @@ processor:
- skip_empty_modules: true
renderer:
type: markdown
- descriptive_class_title: true
+ descriptive_class_title: false
descriptive_module_title: true
add_method_class_prefix: false
add_member_class_prefix: false
diff --git a/docs/_src/api/api/pydoc-markdown-ranker.yml b/docs/_src/api/api/pydoc-markdown-ranker.yml
index 6cce982ef..f1a55dddc 100644
--- a/docs/_src/api/api/pydoc-markdown-ranker.yml
+++ b/docs/_src/api/api/pydoc-markdown-ranker.yml
@@ -11,7 +11,7 @@ processor:
- skip_empty_modules: true
renderer:
type: markdown
- descriptive_class_title: true
+ descriptive_class_title: false
descriptive_module_title: true
add_method_class_prefix: false
add_member_class_prefix: false
diff --git a/docs/_src/api/api/pydoc-markdown-reader.yml b/docs/_src/api/api/pydoc-markdown-reader.yml
index 9b9bd3c28..ba2fe47da 100644
--- a/docs/_src/api/api/pydoc-markdown-reader.yml
+++ b/docs/_src/api/api/pydoc-markdown-reader.yml
@@ -11,7 +11,7 @@ processor:
- skip_empty_modules: true
renderer:
type: markdown
- descriptive_class_title: true
+ descriptive_class_title: false
descriptive_module_title: true
add_method_class_prefix: false
add_member_class_prefix: false
diff --git a/docs/_src/api/api/pydoc-markdown-retriever.yml b/docs/_src/api/api/pydoc-markdown-retriever.yml
index b6a6c7870..b0daf5326 100644
--- a/docs/_src/api/api/pydoc-markdown-retriever.yml
+++ b/docs/_src/api/api/pydoc-markdown-retriever.yml
@@ -11,7 +11,7 @@ processor:
- skip_empty_modules: true
renderer:
type: markdown
- descriptive_class_title: true
+ descriptive_class_title: false
descriptive_module_title: true
add_method_class_prefix: false
add_member_class_prefix: false
diff --git a/docs/_src/api/api/pydoc-markdown-summarizer.yml b/docs/_src/api/api/pydoc-markdown-summarizer.yml
index c90744e31..09cee2927 100644
--- a/docs/_src/api/api/pydoc-markdown-summarizer.yml
+++ b/docs/_src/api/api/pydoc-markdown-summarizer.yml
@@ -11,7 +11,7 @@ processor:
- skip_empty_modules: true
renderer:
type: markdown
- descriptive_class_title: true
+ descriptive_class_title: false
descriptive_module_title: true
add_method_class_prefix: false
add_member_class_prefix: false
diff --git a/docs/_src/api/api/pydoc-markdown-translator.yml b/docs/_src/api/api/pydoc-markdown-translator.yml
index f25a7ead9..0bcf3dd65 100644
--- a/docs/_src/api/api/pydoc-markdown-translator.yml
+++ b/docs/_src/api/api/pydoc-markdown-translator.yml
@@ -11,7 +11,7 @@ processor:
- skip_empty_modules: true
renderer:
type: markdown
- descriptive_class_title: true
+ descriptive_class_title: false
descriptive_module_title: true
add_method_class_prefix: false
add_member_class_prefix: false
diff --git a/docs/_src/api/api/query_classifier.md b/docs/_src/api/api/query_classifier.md
new file mode 100644
index 000000000..d1ce77464
--- /dev/null
+++ b/docs/_src/api/api/query_classifier.md
@@ -0,0 +1,143 @@
+
+# Module base
+
+
+## BaseQueryClassifier Objects
+
+```python
+class BaseQueryClassifier(BaseComponent)
+```
+
+Abstract class for Query Classifiers
+
+
+# Module sklearn
+
+
+## SklearnQueryClassifier Objects
+
+```python
+class SklearnQueryClassifier(BaseQueryClassifier)
+```
+
+A node to classify an incoming query into one of two categories using a lightweight sklearn model. Depending on the result, the query flows to a different branch in your pipeline
+and the further processing can be customized. You can define this by connecting the further pipeline to either `output_1` or `output_2` from this node.
+
+**Example**:
+
+ ```python
+ |{
+ |pipe = Pipeline()
+ |pipe.add_node(component=SklearnQueryClassifier(), name="QueryClassifier", inputs=["Query"])
+ |pipe.add_node(component=elastic_retriever, name="ElasticRetriever", inputs=["QueryClassifier.output_2"])
+ |pipe.add_node(component=dpr_retriever, name="DPRRetriever", inputs=["QueryClassifier.output_1"])
+
+ |# Keyword queries will use the ElasticRetriever
+ |pipe.run("kubernetes aws")
+
+ |# Semantic queries (questions, statements, sentences ...) will leverage the DPR retriever
+ |pipe.run("How to manage kubernetes on aws")
+
+ ```
+
+ Models:
+
+ Pass your own `Sklearn` binary classification model or use one of the following pretrained ones:
+ 1) Keywords vs. Questions/Statements (Default)
+ query_classifier can be found [here](https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier/model.pickle)
+ query_vectorizer can be found [here](https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier/vectorizer.pickle)
+ output_1 => question/statement
+ output_2 => keyword query
+ [Readme](https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier/readme.txt)
+
+
+ 2) Questions vs. Statements
+ query_classifier can be found [here](https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier_statements/model.pickle)
+ query_vectorizer can be found [here](https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier_statements/vectorizer.pickle)
+ output_1 => question
+ output_2 => statement
+ [Readme](https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier_statements/readme.txt)
+
+ See also the [tutorial](https://haystack.deepset.ai/tutorials/pipelines) on pipelines.
+
+
+#### \_\_init\_\_
+
+```python
+ | __init__(model_name_or_path: Union[
+ | str, Any
+ | ] = "https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier/model.pickle", vectorizer_name_or_path: Union[
+ | str, Any
+ | ] = "https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier/vectorizer.pickle")
+```
+
+**Arguments**:
+
+- `model_name_or_path`: Gradient boosting based binary classifier to classify between keyword vs statement/question
+queries or statement vs question queries.
+- `vectorizer_name_or_path`: An ngram-based Tfidf vectorizer for extracting features from the query.
+
+
+# Module transformers
+
+
+## TransformersQueryClassifier Objects
+
+```python
+class TransformersQueryClassifier(BaseQueryClassifier)
+```
+
+A node to classify an incoming query into one of two categories using a (small) BERT transformer model.
+Depending on the result, the query flows to a different branch in your pipeline and the further processing
+can be customized. You can define this by connecting the further pipeline to either `output_1` or `output_2`
+from this node.
+
+**Example**:
+
+ ```python
+ |{
+ |pipe = Pipeline()
+ |pipe.add_node(component=TransformersQueryClassifier(), name="QueryClassifier", inputs=["Query"])
+ |pipe.add_node(component=elastic_retriever, name="ElasticRetriever", inputs=["QueryClassifier.output_2"])
+ |pipe.add_node(component=dpr_retriever, name="DPRRetriever", inputs=["QueryClassifier.output_1"])
+
+ |# Keyword queries will use the ElasticRetriever
+ |pipe.run("kubernetes aws")
+
+ |# Semantic queries (questions, statements, sentences ...) will leverage the DPR retriever
+ |pipe.run("How to manage kubernetes on aws")
+
+ ```
+
+ Models:
+
+ Pass your own `Transformer` binary classification model from file/huggingface or use one of the following
+ pretrained ones hosted on Huggingface:
+ 1) Keywords vs. Questions/Statements (Default)
+ model_name_or_path="shahrukhx01/bert-mini-finetune-question-detection"
+ output_1 => question/statement
+ output_2 => keyword query
+ [Readme](https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier/readme.txt)
+
+
+ 2) Questions vs. Statements
+ `model_name_or_path`="shahrukhx01/question-vs-statement-classifier"
+ output_1 => question
+ output_2 => statement
+ [Readme](https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier_statements/readme.txt)
+
+
+ See also the [tutorial](https://haystack.deepset.ai/tutorials/pipelines) on pipelines.
+
+
+#### \_\_init\_\_
+
+```python
+ | __init__(model_name_or_path: Union[Path, str] = "shahrukhx01/bert-mini-finetune-question-detection", use_gpu: bool = True)
+```
+
+**Arguments**:
+
+- `model_name_or_path`: Transformer-based fine-tuned mini BERT model for query classification
+- `use_gpu`: Whether to use GPU (if available).
+
diff --git a/docs/_src/api/api/reader.md b/docs/_src/api/api/reader.md
index 7be90a35c..014d8d133 100644
--- a/docs/_src/api/api/reader.md
+++ b/docs/_src/api/api/reader.md
@@ -2,7 +2,7 @@
# Module base
-## BaseReader Objects
+## BaseReader
```python
class BaseReader(BaseComponent)
@@ -30,7 +30,7 @@ Wrapper method used to time functions.
# Module farm
-## FARMReader Objects
+## FARMReader
```python
class FARMReader(BaseReader)
@@ -361,7 +361,7 @@ Usage:
# Module transformers
-## TransformersReader Objects
+## TransformersReader
```python
class TransformersReader(BaseReader)
@@ -450,7 +450,7 @@ Dict containing query and answers
# Module table
-## TableReader Objects
+## TableReader
```python
class TableReader(BaseReader)
diff --git a/docs/_src/api/api/retriever.md b/docs/_src/api/api/retriever.md
index 4992cd99e..a782904c5 100644
--- a/docs/_src/api/api/retriever.md
+++ b/docs/_src/api/api/retriever.md
@@ -2,7 +2,7 @@
# Module base
-## BaseGraphRetriever Objects
+## BaseGraphRetriever
```python
class BaseGraphRetriever(BaseComponent)
@@ -11,7 +11,7 @@ class BaseGraphRetriever(BaseComponent)
Base classfor knowledge graph retrievers.
-## BaseRetriever Objects
+## BaseRetriever
```python
class BaseRetriever(BaseComponent)
@@ -84,7 +84,7 @@ position in the ranking of documents the correct document is.
# Module sparse
-## ElasticsearchRetriever Objects
+## ElasticsearchRetriever
```python
class ElasticsearchRetriever(BaseRetriever)
@@ -152,7 +152,7 @@ that are most relevant to the query.
- `index`: The name of the index in the DocumentStore from which to retrieve documents
-## ElasticsearchFilterOnlyRetriever Objects
+## ElasticsearchFilterOnlyRetriever
```python
class ElasticsearchFilterOnlyRetriever(ElasticsearchRetriever)
@@ -179,7 +179,7 @@ that are most relevant to the query.
- `index`: The name of the index in the DocumentStore from which to retrieve documents
-## TfidfRetriever Objects
+## TfidfRetriever
```python
class TfidfRetriever(BaseRetriever)
@@ -235,7 +235,7 @@ Performing training on this class according to the TF-IDF algorithm.
# Module dense
-## DensePassageRetriever Objects
+## DensePassageRetriever
```python
class DensePassageRetriever(BaseRetriever)
@@ -426,7 +426,7 @@ None
Load DensePassageRetriever from the specified directory.
-## TableTextRetriever Objects
+## TableTextRetriever
```python
class TableTextRetriever(BaseRetriever)
@@ -595,7 +595,7 @@ None
Load TableTextRetriever from the specified directory.
-## EmbeddingRetriever Objects
+## EmbeddingRetriever
```python
class EmbeddingRetriever(BaseRetriever)
@@ -688,7 +688,7 @@ Embeddings, one per input document
# Module text2sparql
-## Text2SparqlRetriever Objects
+## Text2SparqlRetriever
```python
class Text2SparqlRetriever(BaseGraphRetriever)
diff --git a/docs/_src/api/api/summarizer.md b/docs/_src/api/api/summarizer.md
index 35b00e11e..40cfc61fe 100644
--- a/docs/_src/api/api/summarizer.md
+++ b/docs/_src/api/api/summarizer.md
@@ -2,7 +2,7 @@
# Module base
-## BaseSummarizer Objects
+## BaseSummarizer
```python
class BaseSummarizer(BaseComponent)
@@ -37,7 +37,7 @@ List of Documents, where Document.text contains the summarization and Document.m
# Module transformers
-## TransformersSummarizer Objects
+## TransformersSummarizer
```python
class TransformersSummarizer(BaseSummarizer)
diff --git a/docs/_src/api/api/translator.md b/docs/_src/api/api/translator.md
index 0481bc3bd..0c6db3079 100644
--- a/docs/_src/api/api/translator.md
+++ b/docs/_src/api/api/translator.md
@@ -2,7 +2,7 @@
# Module base
-## BaseTranslator Objects
+## BaseTranslator
```python
class BaseTranslator(BaseComponent)
@@ -33,7 +33,7 @@ Method that gets executed when this class is used as a Node in a Haystack Pipeli
# Module transformers
-## TransformersTranslator Objects
+## TransformersTranslator
```python
class TransformersTranslator(BaseTranslator)