diff --git a/docs/_src/api/api/crawler.md b/docs/_src/api/api/crawler.md index 66f3452d3..c3d0db4a9 100644 --- a/docs/_src/api/api/crawler.md +++ b/docs/_src/api/api/crawler.md @@ -1,53 +1,95 @@ - -# Module entity + +# Module crawler - -## EntityExtractor Objects + +## Crawler ```python -class EntityExtractor(BaseComponent) +class Crawler(BaseComponent) ``` -This node is used to extract entities out of documents. -The most common use case for this would be as a named entity extractor. -The default model used is dslim/bert-base-NER. -This node can be placed in a querying pipeline to perform entity extraction on retrieved documents only, -or it can be placed in an indexing pipeline so that all documents in the document store have extracted entities. -The entities extracted by this Node will populate Document.entities +Crawl texts from a website so that we can use them later in Haystack as a corpus for search / question answering etc. - +**Example:** +```python +| from haystack.nodes.connector import Crawler +| +| crawler = Crawler(output_dir="crawled_files") +| # crawl Haystack docs, i.e. all pages that include haystack.deepset.ai/overview/ +| docs = crawler.crawl(urls=["https://haystack.deepset.ai/overview/get-started"], +|                      filter_urls=["haystack\.deepset\.ai\/overview\/"]) +``` + + #### \_\_init\_\_ + +```python + | __init__(output_dir: str, urls: Optional[List[str]] = None, crawler_depth: int = 1, filter_urls: Optional[List] = None, overwrite_existing_files=True) +``` + +Init object with basic params for crawling (can be overwritten later). + +**Arguments**: + +- `output_dir`: Path for the directory to store files +- `urls`: List of http(s) address(es) (can also be supplied later when calling crawl()) +- `crawler_depth`: How many sublinks to follow from the initial list of URLs. Current options: + 0: Only initial list of urls + 1: Follow links found on the initial URLs (but no further) +- `filter_urls`: Optional list of regular expressions that the crawled URLs must comply with. + All URLs not matching at least one of the regular expressions will be dropped. +- `overwrite_existing_files`: Whether to overwrite existing files in output_dir with new content + + #### crawl + +```python + | crawl(output_dir: Union[str, Path, None] = None, urls: Optional[List[str]] = None, crawler_depth: Optional[int] = None, filter_urls: Optional[List] = None, overwrite_existing_files: Optional[bool] = None) -> List[Path] +``` + +Crawl URL(s), extract the text from the HTML, create a Haystack Document object out of it and save it (one JSON +file per URL, including text and basic meta data). +You can optionally specify via `filter_urls` to only crawl URLs that match a certain pattern. +All parameters are optional here and only meant to overwrite instance attributes at runtime. +If no parameters are provided to this method, the instance attributes that were passed during __init__ will be used. + +**Arguments**: + +- `output_dir`: Path for the directory to store files +- `urls`: List of http addresses or single http address +- `crawler_depth`: How many sublinks to follow from the initial list of URLs. Current options: + 0: Only initial list of urls + 1: Follow links found on the initial URLs (but no further) +- `filter_urls`: Optional list of regular expressions that the crawled URLs must comply with. + All URLs not matching at least one of the regular expressions will be dropped.
+- `overwrite_existing_files`: Whether to overwrite existing files in output_dir with new content + +**Returns**: + +List of paths where the crawled webpages got stored + + #### run ```python - | run(documents: Optional[Union[List[Document], List[dict]]] = None) -> Tuple[Dict, str] + | run(output_dir: Union[str, Path, None] = None, urls: Optional[List[str]] = None, crawler_depth: Optional[int] = None, filter_urls: Optional[List] = None, overwrite_existing_files: Optional[bool] = None, return_documents: Optional[bool] = False) -> Tuple[Dict, str] ``` -This is the method called when this node is used in a pipeline +Method to be executed when the Crawler is used as a Node within a Haystack pipeline. - -#### extract +**Arguments**: -```python - | extract(text) -``` +- `output_dir`: Path for the directory to store files +- `urls`: List of http addresses or single http address +- `crawler_depth`: How many sublinks to follow from the initial list of URLs. Current options: + 0: Only initial list of urls + 1: Follow links found on the initial URLs (but no further) +- `filter_urls`: Optional list of regular expressions that the crawled URLs must comply with. + All URLs not matching at least one of the regular expressions will be dropped. +- `overwrite_existing_files`: Whether to overwrite existing files in output_dir with new content +- `return_documents`: Whether to return the content of the crawled JSON files instead of only their paths -This function can be called to perform entity extraction when using the node in isolation. +**Returns**: - -#### simplify\_ner\_for\_qa - -```python -simplify_ner_for_qa(output) -``` - -Returns a simplified version of the output dictionary -with the following structure: -[ - { - answer: { ... } - entities: [ { ... }, {} ] - } -] -The entities included are only the ones that overlap with -the answer itself. +Tuple({"paths": List of filepaths, ...}, Name of output edge) diff --git a/docs/_src/api/api/document_store.md b/docs/_src/api/api/document_store.md index 53fb9b3fc..3db306c5b 100644 --- a/docs/_src/api/api/document_store.md +++ b/docs/_src/api/api/document_store.md @@ -2,7 +2,7 @@ # Module base -## BaseKnowledgeGraph Objects +## BaseKnowledgeGraph ```python class BaseKnowledgeGraph(BaseComponent) ``` Base class for implementing Knowledge Graphs. -## BaseDocumentStore Objects +## BaseDocumentStore ```python class BaseDocumentStore(BaseComponent) ``` @@ -150,7 +150,7 @@ Batch elements of an iterable into fixed-length chunks or blocks. # Module elasticsearch -## ElasticsearchDocumentStore Objects +## ElasticsearchDocumentStore ```python class ElasticsearchDocumentStore(BaseDocumentStore) ``` @@ -530,7 +530,7 @@ Delete labels in an index. All labels are deleted if no filters are passed. None -## OpenSearchDocumentStore Objects +## OpenSearchDocumentStore ```python class OpenSearchDocumentStore(ElasticsearchDocumentStore) ``` @@ -564,7 +564,7 @@ Find the document that is most similar to the provided `query_emb` by using a ve -## OpenDistroElasticsearchDocumentStore Objects +## OpenDistroElasticsearchDocumentStore ```python class OpenDistroElasticsearchDocumentStore(OpenSearchDocumentStore) ``` A DocumentStore which has an Open Distro for Elasticsearch service behind it.
# Module memory -## InMemoryDocumentStore Objects +## InMemoryDocumentStore ```python class InMemoryDocumentStore(BaseDocumentStore) @@ -857,7 +857,7 @@ None # Module sql -## SQLDocumentStore Objects +## SQLDocumentStore ```python class SQLDocumentStore(BaseDocumentStore) @@ -1099,7 +1099,7 @@ None # Module faiss -## FAISSDocumentStore Objects +## FAISSDocumentStore ```python class FAISSDocumentStore(SQLDocumentStore) @@ -1368,7 +1368,7 @@ Note: In order to have a correct mapping from FAISS to SQL, # Module milvus -## MilvusDocumentStore Objects +## MilvusDocumentStore ```python class MilvusDocumentStore(SQLDocumentStore) @@ -1660,7 +1660,7 @@ Return the count of embeddings in the document store. # Module weaviate -## WeaviateDocumentStore Objects +## WeaviateDocumentStore ```python class WeaviateDocumentStore(BaseDocumentStore) @@ -1947,7 +1947,7 @@ None # Module graphdb -## GraphDBKnowledgeGraph Objects +## GraphDBKnowledgeGraph ```python class GraphDBKnowledgeGraph(BaseKnowledgeGraph) diff --git a/docs/_src/api/api/extractor.md b/docs/_src/api/api/extractor.md new file mode 100644 index 000000000..5e443c6d6 --- /dev/null +++ b/docs/_src/api/api/extractor.md @@ -0,0 +1,53 @@ + +# Module entity + + +## EntityExtractor + +```python +class EntityExtractor(BaseComponent) +``` + +This node is used to extract entities out of documents. +The most common use case for this would be as a named entity extractor. +The default model used is dslim/bert-base-NER. +This node can be placed in a querying pipeline to perform entity extraction on retrieved documents only, +or it can be placed in an indexing pipeline so that all documents in the document store have extracted entities. +The entities extracted by this Node will populate Document.entities + + +#### run + +```python + | run(documents: Optional[Union[List[Document], List[dict]]] = None) -> Tuple[Dict, str] +``` + +This is the method called when this node is used in a pipeline + + +#### extract + +```python + | extract(text) +``` + +This function can be called to perform entity extraction when using the node in isolation. + + +#### simplify\_ner\_for\_qa + +```python +simplify_ner_for_qa(output) +``` + +Returns a simplified version of the output dictionary +with the following structure: +[ + { + answer: { ... } + entities: [ { ... }, {} ] + } +] +The entities included are only the ones that overlap with +the answer itself. 
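To make the `EntityExtractor` docs above concrete, here is a minimal usage sketch. It assumes the node is importable as `haystack.nodes.EntityExtractor` and that `model_name_or_path` selects the NER model (suggested by the docstring's mention of dslim/bert-base-NER); treat it as an illustration, not verbatim library code.

```python
from haystack.nodes import EntityExtractor  # import path assumed

# Load the default NER model named in the docstring.
extractor = EntityExtractor(model_name_or_path="dslim/bert-base-NER")

# Use the node in isolation: extract entities from a raw string.
entities = extractor.extract("Angela Merkel flew to Paris to visit the Louvre.")
print(entities)  # expected: a list of entity dicts (text, type, offsets)
```

Inside an indexing pipeline, the same node would instead populate `Document.entities` for every document passing through it, as described above.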
+ diff --git a/docs/_src/api/api/file_converter.md b/docs/_src/api/api/file_converter.md index a70d31c3b..5df5b8ed7 100644 --- a/docs/_src/api/api/file_converter.md +++ b/docs/_src/api/api/file_converter.md @@ -2,7 +2,7 @@ # Module file\_type -## FileTypeClassifier Objects +## FileTypeClassifier ```python class FileTypeClassifier(BaseComponent) ``` diff --git a/docs/_src/api/api/generate_docstrings.sh b/docs/_src/api/api/generate_docstrings.sh index 54b2ec0c2..de013a7ea 100755 --- a/docs/_src/api/api/generate_docstrings.sh +++ b/docs/_src/api/api/generate_docstrings.sh @@ -2,6 +2,7 @@ # Purpose : Automate the generation of docstrings +pydoc-markdown pydoc-markdown-primitives.yml pydoc-markdown pydoc-markdown-document-store.yml pydoc-markdown pydoc-markdown-file-converters.yml pydoc-markdown pydoc-markdown-file-classifier.yml @@ -18,5 +19,6 @@ pydoc-markdown pydoc-markdown-pipelines.yml pydoc-markdown pydoc-markdown-evaluation.yml pydoc-markdown pydoc-markdown-ranker.yml pydoc-markdown pydoc-markdown-question-generator.yml +pydoc-markdown pydoc-markdown-query-classifier.yml pydoc-markdown pydoc-markdown-document-classifier.yml diff --git a/docs/_src/api/api/generator.md b/docs/_src/api/api/generator.md index 64b1924b5..f19eec743 100644 --- a/docs/_src/api/api/generator.md +++ b/docs/_src/api/api/generator.md @@ -2,7 +2,7 @@ # Module base -## BaseGenerator Objects +## BaseGenerator ```python class BaseGenerator(BaseComponent) ``` @@ -34,7 +34,7 @@ Generated answers plus additional infos in a dict # Module transformers -## RAGenerator Objects +## RAGenerator ```python class RAGenerator(BaseGenerator) ``` @@ -140,7 +140,7 @@ Generated answers plus additional infos in a dict like this: ``` -## Seq2SeqGenerator Objects +## Seq2SeqGenerator ```python class Seq2SeqGenerator(BaseGenerator) ``` diff --git a/docs/_src/api/api/other.md b/docs/_src/api/api/other.md new file mode 100644 index 000000000..a55694762 --- /dev/null +++ b/docs/_src/api/api/other.md @@ -0,0 +1,47 @@ + +# Module docs2answers + + +## Docs2Answers + +```python +class Docs2Answers(BaseComponent) +``` + +This Node is used to convert retrieved documents into the predicted answers format. +It is useful for situations where you are calling a Retriever-only pipeline via the REST API. +This ensures that your output is in a compatible format. + + +# Module join\_docs + + +## JoinDocuments + +```python +class JoinDocuments(BaseComponent) +``` + +A node to join documents outputted by multiple retriever nodes. + +The node allows multiple join modes: +* concatenate: combine the documents from multiple nodes. Any duplicate documents are discarded. +* merge: merge scores of documents from multiple nodes. Optionally, each input score can be given a different + `weight` & a `top_k` limit can be set. This mode can also be used for "reranking" retrieved documents. + + +#### \_\_init\_\_ + +```python + | __init__(join_mode: str = "concatenate", weights: Optional[List[float]] = None, top_k_join: Optional[int] = None) +``` + +**Arguments**: + +- `join_mode`: `concatenate` to combine documents from multiple retrievers or `merge` to aggregate scores of + individual documents. +- `weights`: A node-wise list (its length must be equal to the number of input nodes) of weights for + adjusting document scores when using the `merge` join_mode. By default, equal weight is given + to each retriever score. This param is not compatible with the `concatenate` join_mode. +- `top_k_join`: Limit documents to top_k based on the resulting scores of the join.
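As a sketch of how `JoinDocuments` combines two retrievers in the `merge` mode described above — `es_retriever` and `dpr_retriever` are placeholders for retriever nodes you would configure yourself, and the import path is assumed:

```python
from haystack import Pipeline
from haystack.nodes import JoinDocuments  # import path assumed

pipe = Pipeline()
# Both retrievers receive the query in parallel.
pipe.add_node(component=es_retriever, name="ESRetriever", inputs=["Query"])
pipe.add_node(component=dpr_retriever, name="DPRRetriever", inputs=["Query"])
# Weight sparse scores at 0.7 and dense scores at 0.3, then keep only the
# 10 best documents after the merge.
pipe.add_node(
    component=JoinDocuments(join_mode="merge", weights=[0.7, 0.3], top_k_join=10),
    name="JoinResults",
    inputs=["ESRetriever", "DPRRetriever"],
)
result = pipe.run(query="How to manage kubernetes on aws")
```

Note that `weights` only applies to the `merge` mode; with `concatenate`, duplicate documents are simply discarded.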
+ diff --git a/docs/_src/api/api/preprocessor.md b/docs/_src/api/api/preprocessor.md index 2ac35ef7f..30c17182b 100644 --- a/docs/_src/api/api/preprocessor.md +++ b/docs/_src/api/api/preprocessor.md @@ -2,7 +2,7 @@ # Module base -## BasePreProcessor Objects +## BasePreProcessor ```python class BasePreProcessor(BaseComponent) ``` @@ -21,7 +21,7 @@ Perform document cleaning and splitting. Takes a single document as input and re # Module preprocessor -## PreProcessor Objects +## PreProcessor ```python class PreProcessor(BasePreProcessor) ``` diff --git a/docs/_src/api/api/primitives.md b/docs/_src/api/api/primitives.md new file mode 100644 index 000000000..294d75afa --- /dev/null +++ b/docs/_src/api/api/primitives.md @@ -0,0 +1,232 @@ + +# Module schema + + +## Document + +```python +@dataclass +class Document() +``` + + +#### \_\_init\_\_ + +```python + | __init__(content: Union[str, pd.DataFrame], content_type: Literal["text", "table", "image"] = "text", id: Optional[str] = None, score: Optional[float] = None, meta: Dict[str, Any] = None, embedding: Optional[np.ndarray] = None, id_hash_keys: Optional[List[str]] = None) +``` + +One of the core data classes in Haystack. It's used to represent documents / passages in a standardized way within Haystack. +Documents are stored in DocumentStores, are returned by Retrievers, are the input for Readers and are used in +many other places that manipulate or interact with document-level data. + +Note: There can be multiple Documents originating from one file (e.g. PDF), if you split the text +into smaller passages. We'll have one Document per passage in this case. + +Each document has a unique ID. This can be supplied by the user or generated automatically. +It's particularly helpful for handling duplicates and referencing documents in other objects (e.g. Labels). + +There's an easy option to convert from/to dicts via `from_dict()` and `to_dict()`. + +**Arguments**: + +- `content`: Content of the document. For most cases, this will be text, but it can be a table or image. +- `content_type`: One of "text", "table" or "image". Haystack components can use this to adjust their + handling of Documents and check compatibility. +- `id`: Unique ID for the document. If not supplied by the user, we'll generate one automatically by + creating a hash from the supplied text. This behaviour can be further adjusted by `id_hash_keys`. +- `score`: The relevance score of the Document determined by a model (e.g. Retriever or Re-Ranker). + In the range of [0,1], where 1 means extremely relevant. +- `meta`: Meta fields for a document like name, url, or author in the form of a custom dict (any keys and values allowed). +- `embedding`: Vector encoding of the text +- `id_hash_keys`: Generate the document id from a custom list of strings. + If you want to ensure you don't have duplicate documents in your DocumentStore but texts are + not unique, you can provide custom strings here that will be used (e.g. ["filename_xy", "text_of_doc"]). + + +#### to\_dict + +```python + | to_dict(field_map={}) -> Dict +``` + +Convert Document to dict. An optional field_map can be supplied to change the names of the keys in the +resulting dict. This way you can work with standardized Document objects in Haystack, but adjust the format that +they are serialized / stored in other places (e.g.
elasticsearch) +Example: +| doc = Document(content="some text", content_type="text") +| doc.to_dict(field_map={"custom_content_field": "content"}) +| >>> {"custom_content_field": "some text", "content_type": "text"} + +**Arguments**: + +- `field_map`: Dict with keys being the custom target keys and values being the standard Document attributes + +**Returns**: + +dict with content of the Document + + +#### from\_dict + +```python + | @classmethod + | from_dict(cls, dict, field_map={}) +``` + +Create Document from dict. An optional field_map can be supplied to adjust for custom names of the keys in the +input dict. This way you can work with standardized Document objects in Haystack, but adjust the format that +they are serialized / stored in other places (e.g. elasticsearch) +Example: +| my_dict = {"custom_content_field": "some text", "content_type": "text"} +| Document.from_dict(my_dict, field_map={"custom_content_field": "content"}) + +**Arguments**: + +- `field_map`: Dict with keys being the custom target keys and values being the standard Document attributes + +**Returns**: + +Document created from the input dict + + +#### \_\_lt\_\_ + +```python + | __lt__(other) +``` + +Enable sorting of Documents by score + + +## Span + +```python +@dataclass +class Span() +``` + + +#### end + +Defining a sequence of characters (Text span) or cells (Table span) via start and end index. +For extractive QA: Character where answer starts/ends +For TableQA: Cell where the answer starts/ends (counted from top left to bottom right of table) + +**Arguments**: + +- `start`: Position where the span starts +- `end`: Position where the span ends + + +## Answer + +```python +@dataclass +class Answer() +``` + + +#### meta + +The fundamental object in Haystack to represent any type of Answers (e.g. extractive QA, generative QA or TableQA). +For example, it's used within some Nodes like the Reader, but also in the REST API. + +**Arguments**: + +- `answer`: The answer string. If there's no possible answer (aka "no_answer" or "is_impossible") this will be an empty string. +- `type`: One of ("generative", "extractive", "other"): Whether this answer comes from an extractive model + (i.e. we can locate an exact answer string in one of the documents) or from a generative model + (i.e. no pointer to a specific document, no offsets ...). +- `score`: The relevance score of the Answer determined by a model (e.g. Reader or Generator). + In the range of [0,1], where 1 means extremely relevant. +- `context`: The related content that was used to create the answer (i.e. a text passage, part of a table, image ...) +- `offsets_in_document`: List of `Span` objects with start and end positions of the answer **in the + document** (as stored in the document store). + For extractive QA: Character where answer starts => `Answer.offsets_in_document[0].start` + For TableQA: Cell where the answer starts (counted from top left to bottom right of table) => `Answer.offsets_in_document[0].start` + (Note that in TableQA there can be multiple cell ranges that are relevant for the answer, thus there can be multiple `Spans` here) +- `offsets_in_context`: List of `Span` objects with start and end positions of the answer **in the + context** (i.e. the surrounding text/table of a certain window size).
+ For extractive QA: Character where answer starts => `Answer.offsets_in_context[0].start` + For TableQA: Cell where the answer starts (counted from top left to bottom right of table) => `Answer.offsets_in_context[0].start` + (Note that in TableQA there can be multiple cell ranges that are relevant for the answer, thus there can be multiple `Spans` here) +- `document_id`: ID of the document that the answer was located in (if any) +- `meta`: Dict that can be used to associate any kind of custom meta data with the answer. + In extractive QA, this will carry the meta data of the document where the answer was found. + + +#### \_\_lt\_\_ + +```python + | __lt__(other) +``` + +Enable sorting of Answers by score + + +## Label + +```python +@dataclass +class Label() +``` + + +#### \_\_init\_\_ + +```python + | __init__(query: str, document: Document, is_correct_answer: bool, is_correct_document: bool, origin: Literal["user-feedback", "gold-label"], answer: Optional[Answer], id: Optional[str] = None, no_answer: Optional[bool] = None, pipeline_id: Optional[str] = None, created_at: Optional[str] = None, updated_at: Optional[str] = None, meta: Optional[dict] = None) +``` + +Object used to represent label/feedback in a standardized way within Haystack. +This includes labels from datasets like SQuAD, annotations from labeling tools, +or user feedback from the Haystack REST API. + +**Arguments**: + +- `query`: the question (or query) for finding answers. +- `document`: +- `answer`: the answer object. +- `is_correct_answer`: whether the sample is positive or negative. +- `is_correct_document`: in case of negative sample (is_correct_answer is False), there could be two cases; + incorrect answer but correct document & incorrect document. This flag denotes if + the returned document was correct. +- `origin`: the source for the labels. It can later be used for filtering. +- `id`: Unique ID used within the DocumentStore. If not supplied, a uuid will be generated automatically. +- `no_answer`: whether the question is unanswerable. +- `pipeline_id`: pipeline identifier (any str) that was involved for generating this label (in case of user feedback). +- `created_at`: Timestamp of creation with format yyyy-MM-dd HH:mm:ss. + Generate in Python via time.strftime("%Y-%m-%d %H:%M:%S"). +- `updated_at`: Timestamp of update with format yyyy-MM-dd HH:mm:ss. + Generate in Python via time.strftime("%Y-%m-%d %H:%M:%S") +- `meta`: Meta fields like "annotator_name" in the form of a custom dict (any keys and values allowed). + + +## MultiLabel + +```python +@dataclass +class MultiLabel() +``` + + +#### \_\_init\_\_ + +```python + | __init__(labels: List[Label], drop_negative_labels=False, drop_no_answers=False) +``` + +There are often multiple `Labels` associated with a single query. For example, there can be multiple annotated +answers for one question or multiple documents that contain the information you want for a query. +This class is "syntactic sugar" that simplifies the work with such a list of related Labels. +It stores the original labels in MultiLabel.labels and provides additional aggregated attributes that are +automatically created at init time. For example, MultiLabel.no_answer allows you to easily check whether any of the +underlying Labels provides a text answer and therefore whether there is indeed a possible answer.
+ +**Arguments**: + +- `labels`: A list of labels that belong to the same query and shall be "grouped" together +- `drop_negative_labels`: Whether to drop negative labels from that group (e.g. thumbs down feedback from UI) +- `drop_no_answers`: Whether to drop labels that specify the answer is impossible + diff --git a/docs/_src/api/api/pydoc-markdown-answer-generator.yml b/docs/_src/api/api/pydoc-markdown-answer-generator.yml index 623dbd309..a43a0238c 100644 --- a/docs/_src/api/api/pydoc-markdown-answer-generator.yml +++ b/docs/_src/api/api/pydoc-markdown-answer-generator.yml @@ -11,7 +11,7 @@ processor: - skip_empty_modules: true renderer: type: markdown - descriptive_class_title: true + descriptive_class_title: false descriptive_module_title: true add_method_class_prefix: false add_member_class_prefix: false diff --git a/docs/_src/api/api/pydoc-markdown-crawler.yml b/docs/_src/api/api/pydoc-markdown-crawler.yml index 584010c7e..02c3be3ae 100644 --- a/docs/_src/api/api/pydoc-markdown-crawler.yml +++ b/docs/_src/api/api/pydoc-markdown-crawler.yml @@ -11,7 +11,7 @@ processor: - skip_empty_modules: true renderer: type: markdown - descriptive_class_title: true + descriptive_class_title: false descriptive_module_title: true add_method_class_prefix: false add_member_class_prefix: false diff --git a/docs/_src/api/api/pydoc-markdown-document-classifier.yml b/docs/_src/api/api/pydoc-markdown-document-classifier.yml index b808a0f68..33f880eeb 100644 --- a/docs/_src/api/api/pydoc-markdown-document-classifier.yml +++ b/docs/_src/api/api/pydoc-markdown-document-classifier.yml @@ -11,7 +11,7 @@ processor: - skip_empty_modules: true renderer: type: markdown - descriptive_class_title: true + descriptive_class_title: false descriptive_module_title: true add_method_class_prefix: false add_member_class_prefix: false diff --git a/docs/_src/api/api/pydoc-markdown-document-store.yml b/docs/_src/api/api/pydoc-markdown-document-store.yml index 568e6d8a8..b15d81cb6 100644 --- a/docs/_src/api/api/pydoc-markdown-document-store.yml +++ b/docs/_src/api/api/pydoc-markdown-document-store.yml @@ -11,7 +11,7 @@ processor: - skip_empty_modules: true renderer: type: markdown - descriptive_class_title: true + descriptive_class_title: false descriptive_module_title: true add_method_class_prefix: false add_member_class_prefix: false diff --git a/docs/_src/api/api/pydoc-markdown-evaluation.yml b/docs/_src/api/api/pydoc-markdown-evaluation.yml index 227027338..0b19d3cd4 100644 --- a/docs/_src/api/api/pydoc-markdown-evaluation.yml +++ b/docs/_src/api/api/pydoc-markdown-evaluation.yml @@ -11,7 +11,7 @@ processor: - skip_empty_modules: true renderer: type: markdown - descriptive_class_title: true + descriptive_class_title: false descriptive_module_title: true add_method_class_prefix: false add_member_class_prefix: false diff --git a/docs/_src/api/api/pydoc-markdown-extractor.yml b/docs/_src/api/api/pydoc-markdown-extractor.yml index 7730a5ebe..7f563ae19 100644 --- a/docs/_src/api/api/pydoc-markdown-extractor.yml +++ b/docs/_src/api/api/pydoc-markdown-extractor.yml @@ -11,8 +11,8 @@ processor: - skip_empty_modules: true renderer: type: markdown - descriptive_class_title: true + descriptive_class_title: false descriptive_module_title: true add_method_class_prefix: false add_member_class_prefix: false - filename: crawler.md + filename: extractor.md diff --git a/docs/_src/api/api/pydoc-markdown-file-classifier.yml b/docs/_src/api/api/pydoc-markdown-file-classifier.yml index f1012c9ea..3f3a6278c 100644 ---
a/docs/_src/api/api/pydoc-markdown-file-classifier.yml +++ b/docs/_src/api/api/pydoc-markdown-file-classifier.yml @@ -11,7 +11,7 @@ processor: - skip_empty_modules: true renderer: type: markdown - descriptive_class_title: true + descriptive_class_title: false descriptive_module_title: true add_method_class_prefix: false add_member_class_prefix: false diff --git a/docs/_src/api/api/pydoc-markdown-file-converters.yml b/docs/_src/api/api/pydoc-markdown-file-converters.yml index ab3f36c27..831c23536 100644 --- a/docs/_src/api/api/pydoc-markdown-file-converters.yml +++ b/docs/_src/api/api/pydoc-markdown-file-converters.yml @@ -11,7 +11,7 @@ processor: - skip_empty_modules: true renderer: type: markdown - descriptive_class_title: true + descriptive_class_title: false descriptive_module_title: true add_method_class_prefix: false add_member_class_prefix: false diff --git a/docs/_src/api/api/pydoc-markdown-other.yml b/docs/_src/api/api/pydoc-markdown-other.yml index 00c094806..a2065a708 100644 --- a/docs/_src/api/api/pydoc-markdown-other.yml +++ b/docs/_src/api/api/pydoc-markdown-other.yml @@ -11,8 +11,8 @@ processor: - skip_empty_modules: true renderer: type: markdown - descriptive_class_title: true + descriptive_class_title: false descriptive_module_title: true add_method_class_prefix: false add_member_class_prefix: false - filename: ranker.md + filename: other.md diff --git a/docs/_src/api/api/pydoc-markdown-pipelines.yml b/docs/_src/api/api/pydoc-markdown-pipelines.yml index ace599894..aa8272680 100644 --- a/docs/_src/api/api/pydoc-markdown-pipelines.yml +++ b/docs/_src/api/api/pydoc-markdown-pipelines.yml @@ -11,7 +11,7 @@ processor: - skip_empty_modules: true renderer: type: markdown - descriptive_class_title: true + descriptive_class_title: false descriptive_module_title: true add_method_class_prefix: false add_member_class_prefix: false diff --git a/docs/_src/api/api/pydoc-markdown-preprocessor.yml b/docs/_src/api/api/pydoc-markdown-preprocessor.yml index 2d698004b..677bbcbc2 100644 --- a/docs/_src/api/api/pydoc-markdown-preprocessor.yml +++ b/docs/_src/api/api/pydoc-markdown-preprocessor.yml @@ -11,7 +11,7 @@ processor: - skip_empty_modules: true renderer: type: markdown - descriptive_class_title: true + descriptive_class_title: false descriptive_module_title: true add_method_class_prefix: false add_member_class_prefix: false diff --git a/docs/_src/api/api/pydoc-markdown-primitives.yml b/docs/_src/api/api/pydoc-markdown-primitives.yml new file mode 100644 index 000000000..87a02fab3 --- /dev/null +++ b/docs/_src/api/api/pydoc-markdown-primitives.yml @@ -0,0 +1,18 @@ +loaders: + - type: python + search_path: [../../../../haystack/] + modules: ['schema'] + ignore_when_discovered: ['__init__'] +processor: + - type: filter + expression: not name.startswith('_') and default() + - documented_only: true + - do_not_filter_modules: false + - skip_empty_modules: true +renderer: + type: markdown + descriptive_class_title: false + descriptive_module_title: true + add_method_class_prefix: false + add_member_class_prefix: false + filename: primitives.md diff --git a/docs/_src/api/api/pydoc-markdown-query-classifier.yml b/docs/_src/api/api/pydoc-markdown-query-classifier.yml new file mode 100644 index 000000000..85a761dd1 --- /dev/null +++ b/docs/_src/api/api/pydoc-markdown-query-classifier.yml @@ -0,0 +1,18 @@ +loaders: + - type: python + search_path: [../../../../haystack/nodes/query_classifier] + modules: ['base', 'sklearn', 'transformers'] + ignore_when_discovered: ['__init__'] +processor: + - type: 
filter + expression: not name.startswith('_') and default() + - documented_only: true + - do_not_filter_modules: false + - skip_empty_modules: true +renderer: + type: markdown + descriptive_class_title: false + descriptive_module_title: true + add_method_class_prefix: false + add_member_class_prefix: false + filename: query_classifier.md diff --git a/docs/_src/api/api/pydoc-markdown-question-generator.yml b/docs/_src/api/api/pydoc-markdown-question-generator.yml index b09453450..671313a03 100644 --- a/docs/_src/api/api/pydoc-markdown-question-generator.yml +++ b/docs/_src/api/api/pydoc-markdown-question-generator.yml @@ -11,7 +11,7 @@ processor: - skip_empty_modules: true renderer: type: markdown - descriptive_class_title: true + descriptive_class_title: false descriptive_module_title: true add_method_class_prefix: false add_member_class_prefix: false diff --git a/docs/_src/api/api/pydoc-markdown-ranker.yml b/docs/_src/api/api/pydoc-markdown-ranker.yml index 6cce982ef..f1a55dddc 100644 --- a/docs/_src/api/api/pydoc-markdown-ranker.yml +++ b/docs/_src/api/api/pydoc-markdown-ranker.yml @@ -11,7 +11,7 @@ processor: - skip_empty_modules: true renderer: type: markdown - descriptive_class_title: true + descriptive_class_title: false descriptive_module_title: true add_method_class_prefix: false add_member_class_prefix: false diff --git a/docs/_src/api/api/pydoc-markdown-reader.yml b/docs/_src/api/api/pydoc-markdown-reader.yml index 9b9bd3c28..ba2fe47da 100644 --- a/docs/_src/api/api/pydoc-markdown-reader.yml +++ b/docs/_src/api/api/pydoc-markdown-reader.yml @@ -11,7 +11,7 @@ processor: - skip_empty_modules: true renderer: type: markdown - descriptive_class_title: true + descriptive_class_title: false descriptive_module_title: true add_method_class_prefix: false add_member_class_prefix: false diff --git a/docs/_src/api/api/pydoc-markdown-retriever.yml b/docs/_src/api/api/pydoc-markdown-retriever.yml index b6a6c7870..b0daf5326 100644 --- a/docs/_src/api/api/pydoc-markdown-retriever.yml +++ b/docs/_src/api/api/pydoc-markdown-retriever.yml @@ -11,7 +11,7 @@ processor: - skip_empty_modules: true renderer: type: markdown - descriptive_class_title: true + descriptive_class_title: false descriptive_module_title: true add_method_class_prefix: false add_member_class_prefix: false diff --git a/docs/_src/api/api/pydoc-markdown-summarizer.yml b/docs/_src/api/api/pydoc-markdown-summarizer.yml index c90744e31..09cee2927 100644 --- a/docs/_src/api/api/pydoc-markdown-summarizer.yml +++ b/docs/_src/api/api/pydoc-markdown-summarizer.yml @@ -11,7 +11,7 @@ processor: - skip_empty_modules: true renderer: type: markdown - descriptive_class_title: true + descriptive_class_title: false descriptive_module_title: true add_method_class_prefix: false add_member_class_prefix: false diff --git a/docs/_src/api/api/pydoc-markdown-translator.yml b/docs/_src/api/api/pydoc-markdown-translator.yml index f25a7ead9..0bcf3dd65 100644 --- a/docs/_src/api/api/pydoc-markdown-translator.yml +++ b/docs/_src/api/api/pydoc-markdown-translator.yml @@ -11,7 +11,7 @@ processor: - skip_empty_modules: true renderer: type: markdown - descriptive_class_title: true + descriptive_class_title: false descriptive_module_title: true add_method_class_prefix: false add_member_class_prefix: false diff --git a/docs/_src/api/api/query_classifier.md b/docs/_src/api/api/query_classifier.md new file mode 100644 index 000000000..d1ce77464 --- /dev/null +++ b/docs/_src/api/api/query_classifier.md @@ -0,0 +1,143 @@ + +# Module base + + +## BaseQueryClassifier 
+ +```python +class BaseQueryClassifier(BaseComponent) +``` + +Abstract class for Query Classifiers + + +# Module sklearn + + +## SklearnQueryClassifier + +```python +class SklearnQueryClassifier(BaseQueryClassifier) +``` + +A node to classify an incoming query into one of two categories using a lightweight sklearn model. Depending on the result, the query flows to a different branch in your pipeline +and the further processing can be customized. You can define this by connecting the further pipeline to either `output_1` or `output_2` from this node. + +**Example**: + + ```python + |pipe = Pipeline() + |pipe.add_node(component=SklearnQueryClassifier(), name="QueryClassifier", inputs=["Query"]) + |pipe.add_node(component=elastic_retriever, name="ElasticRetriever", inputs=["QueryClassifier.output_2"]) + |pipe.add_node(component=dpr_retriever, name="DPRRetriever", inputs=["QueryClassifier.output_1"]) + + |# Keyword queries will use the ElasticRetriever + |pipe.run("kubernetes aws") + + |# Semantic queries (questions, statements, sentences ...) will leverage the DPR retriever + |pipe.run("How to manage kubernetes on aws") + + ``` + + Models: + + Pass your own `Sklearn` binary classification model or use one of the following pretrained ones: + 1) Keywords vs. Questions/Statements (Default) + query_classifier can be found [here](https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier/model.pickle) + query_vectorizer can be found [here](https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier/vectorizer.pickle) + output_1 => question/statement + output_2 => keyword query + [Readme](https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier/readme.txt) + + + 2) Questions vs. Statements + query_classifier can be found [here](https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier_statements/model.pickle) + query_vectorizer can be found [here](https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier_statements/vectorizer.pickle) + output_1 => question + output_2 => statement + [Readme](https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier_statements/readme.txt) + + See also the [tutorial](https://haystack.deepset.ai/tutorials/pipelines) on pipelines. + + +#### \_\_init\_\_ + +```python + | __init__(model_name_or_path: Union[ + | str, Any + | ] = "https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier/model.pickle", vectorizer_name_or_path: Union[ + | str, Any + | ] = "https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier/vectorizer.pickle") +``` + +**Arguments**: + +- `model_name_or_path`: Gradient boosting based binary classifier to classify between keyword vs statement/question +queries or statement vs question queries. +- `vectorizer_name_or_path`: An n-gram based TF-IDF vectorizer for extracting features from the query. + + +# Module transformers + + +## TransformersQueryClassifier + +```python +class TransformersQueryClassifier(BaseQueryClassifier) +``` + +A node to classify an incoming query into one of two categories using a (small) BERT transformer model. +Depending on the result, the query flows to a different branch in your pipeline and the further processing +can be customized. You can define this by connecting the further pipeline to either `output_1` or `output_2` +from this node.
+ +**Example**: + + ```python + |pipe = Pipeline() + |pipe.add_node(component=TransformersQueryClassifier(), name="QueryClassifier", inputs=["Query"]) + |pipe.add_node(component=elastic_retriever, name="ElasticRetriever", inputs=["QueryClassifier.output_2"]) + |pipe.add_node(component=dpr_retriever, name="DPRRetriever", inputs=["QueryClassifier.output_1"]) + + |# Keyword queries will use the ElasticRetriever + |pipe.run("kubernetes aws") + + |# Semantic queries (questions, statements, sentences ...) will leverage the DPR retriever + |pipe.run("How to manage kubernetes on aws") + + ``` + + Models: + + Pass your own `Transformer` binary classification model from file/huggingface or use one of the following + pretrained ones hosted on Huggingface: + 1) Keywords vs. Questions/Statements (Default) + model_name_or_path="shahrukhx01/bert-mini-finetune-question-detection" + output_1 => question/statement + output_2 => keyword query + [Readme](https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier/readme.txt) + + + 2) Questions vs. Statements + `model_name_or_path`="shahrukhx01/question-vs-statement-classifier" + output_1 => question + output_2 => statement + [Readme](https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier_statements/readme.txt) + + + See also the [tutorial](https://haystack.deepset.ai/tutorials/pipelines) on pipelines. + + +#### \_\_init\_\_ + +```python + | __init__(model_name_or_path: Union[Path, str] = "shahrukhx01/bert-mini-finetune-question-detection", use_gpu: bool = True) +``` + +**Arguments**: + +- `model_name_or_path`: Transformer-based, fine-tuned mini BERT model for query classification +- `use_gpu`: Whether to use GPU (if available). + diff --git a/docs/_src/api/api/reader.md b/docs/_src/api/api/reader.md index 7be90a35c..014d8d133 100644 --- a/docs/_src/api/api/reader.md +++ b/docs/_src/api/api/reader.md @@ -2,7 +2,7 @@ # Module base -## BaseReader Objects +## BaseReader ```python class BaseReader(BaseComponent) ``` @@ -30,7 +30,7 @@ Wrapper method used to time functions. # Module farm -## FARMReader Objects +## FARMReader ```python class FARMReader(BaseReader) ``` @@ -361,7 +361,7 @@ Usage: # Module transformers -## TransformersReader Objects +## TransformersReader ```python class TransformersReader(BaseReader) ``` @@ -450,7 +450,7 @@ Dict containing query and answers # Module table -## TableReader Objects +## TableReader ```python class TableReader(BaseReader) ``` diff --git a/docs/_src/api/api/retriever.md b/docs/_src/api/api/retriever.md index 4992cd99e..a782904c5 100644 --- a/docs/_src/api/api/retriever.md +++ b/docs/_src/api/api/retriever.md @@ -2,7 +2,7 @@ # Module base -## BaseGraphRetriever Objects +## BaseGraphRetriever ```python class BaseGraphRetriever(BaseComponent) @@ -11,7 +11,7 @@ class BaseGraphRetriever(BaseComponent) Base class for knowledge graph retrievers. -## BaseRetriever Objects +## BaseRetriever ```python class BaseRetriever(BaseComponent) @@ -84,7 +84,7 @@ position in the ranking of documents the correct document is. # Module sparse -## ElasticsearchRetriever Objects +## ElasticsearchRetriever ```python class ElasticsearchRetriever(BaseRetriever) @@ -152,7 +152,7 @@ that are most relevant to the query. - `index`: The name of the index in the DocumentStore from which to retrieve documents -## ElasticsearchFilterOnlyRetriever Objects +## ElasticsearchFilterOnlyRetriever ```python class ElasticsearchFilterOnlyRetriever(ElasticsearchRetriever) @@ -179,7 +179,7 @@ that are most relevant to the query.
- `index`: The name of the index in the DocumentStore from which to retrieve documents -## TfidfRetriever Objects +## TfidfRetriever ```python class TfidfRetriever(BaseRetriever) @@ -235,7 +235,7 @@ Performing training on this class according to the TF-IDF algorithm. # Module dense -## DensePassageRetriever Objects +## DensePassageRetriever ```python class DensePassageRetriever(BaseRetriever) @@ -426,7 +426,7 @@ None Load DensePassageRetriever from the specified directory. -## TableTextRetriever Objects +## TableTextRetriever ```python class TableTextRetriever(BaseRetriever) @@ -595,7 +595,7 @@ None Load TableTextRetriever from the specified directory. -## EmbeddingRetriever Objects +## EmbeddingRetriever ```python class EmbeddingRetriever(BaseRetriever) @@ -688,7 +688,7 @@ Embeddings, one per input document # Module text2sparql -## Text2SparqlRetriever Objects +## Text2SparqlRetriever ```python class Text2SparqlRetriever(BaseGraphRetriever) diff --git a/docs/_src/api/api/summarizer.md b/docs/_src/api/api/summarizer.md index 35b00e11e..40cfc61fe 100644 --- a/docs/_src/api/api/summarizer.md +++ b/docs/_src/api/api/summarizer.md @@ -2,7 +2,7 @@ # Module base -## BaseSummarizer Objects +## BaseSummarizer ```python class BaseSummarizer(BaseComponent) @@ -37,7 +37,7 @@ List of Documents, where Document.text contains the summarization and Document.m # Module transformers -## TransformersSummarizer Objects +## TransformersSummarizer ```python class TransformersSummarizer(BaseSummarizer) diff --git a/docs/_src/api/api/translator.md b/docs/_src/api/api/translator.md index 0481bc3bd..0c6db3079 100644 --- a/docs/_src/api/api/translator.md +++ b/docs/_src/api/api/translator.md @@ -2,7 +2,7 @@ # Module base -## BaseTranslator Objects +## BaseTranslator ```python class BaseTranslator(BaseComponent) @@ -33,7 +33,7 @@ Method that gets executed when this class is used as a Node in a Haystack Pipeli # Module transformers -## TransformersTranslator Objects +## TransformersTranslator ```python class TransformersTranslator(BaseTranslator)
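To round off the translator section, a minimal usage sketch of the `TransformersTranslator` renamed above — the import path, the checkpoint name, and the `translate()` call are assumptions based on common usage rather than taken from this diff:

```python
from haystack.nodes import TransformersTranslator  # import path assumed

# Hypothetical choice of a French-to-English translation checkpoint.
translator = TransformersTranslator(model_name_or_path="Helsinki-NLP/opus-mt-fr-en")

# french_docs is a placeholder for a list of Haystack Documents;
# translate them before handing them to an English reader.
translated_docs = translator.translate(documents=french_docs)
```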