mirror of https://github.com/deepset-ai/haystack.git — synced 2025-12-25 14:08:27 +00:00

Add api md (#631)

This commit is contained in: parent 9fbd845ef3 · commit 5e5dba9587
@ -12,7 +12,7 @@ class ElasticsearchDocumentStore(BaseDocumentStore)

#### \_\_init\_\_

```python
| __init__(host: str = "localhost", port: int = 9200, username: str = "", password: str = "", index: str = "document", label_index: str = "label", search_fields: Union[str, list] = "text", text_field: str = "text", name_field: str = "name", embedding_field: str = "embedding", embedding_dim: int = 768, custom_mapping: Optional[dict] = None, excluded_meta_data: Optional[list] = None, faq_question_field: Optional[str] = None, analyzer: str = "standard", scheme: str = "http", ca_certs: bool = False, verify_certs: bool = True, create_index: bool = True, update_existing_documents: bool = False, refresh_type: str = "wait_for", similarity="dot_product", timeout=30, return_embedding: Optional[bool] = True)
| __init__(host: str = "localhost", port: int = 9200, username: str = "", password: str = "", index: str = "document", label_index: str = "label", search_fields: Union[str, list] = "text", text_field: str = "text", name_field: str = "name", embedding_field: str = "embedding", embedding_dim: int = 768, custom_mapping: Optional[dict] = None, excluded_meta_data: Optional[list] = None, faq_question_field: Optional[str] = None, analyzer: str = "standard", scheme: str = "http", ca_certs: bool = False, verify_certs: bool = True, create_index: bool = True, update_existing_documents: bool = False, refresh_type: str = "wait_for", similarity="dot_product", timeout=30, return_embedding: bool = False)
```

A DocumentStore using Elasticsearch to store and query the documents for our search.

@ -49,20 +49,36 @@ documents. When set as True, any document with an existing ID gets updated.
If set to False, an error is raised if the document ID of the document being
added already exists.
- `refresh_type`: Type of ES refresh used to control when changes made by a request (e.g. bulk) are made visible to search.
Values:
- 'wait_for' => continue only after changes are visible (slow, but safe)
- 'false' => continue directly (fast, but sometimes unintuitive behaviour when docs are not immediately available after ingestion)
If set to 'wait_for', continue only after changes are visible (slow, but safe).
If set to 'false', continue directly (fast, but sometimes unintuitive behaviour when docs are not immediately available after ingestion).
More info at https://www.elastic.co/guide/en/elasticsearch/reference/6.8/docs-refresh.html
- `similarity`: The similarity function used to compare document vectors. 'dot_product' is the default since it is
more performant with DPR embeddings. 'cosine' is recommended if you are using a Sentence BERT model.
- `timeout`: Number of seconds after which an Elasticsearch request times out.
- `return_embedding`: Whether to return the document embedding.
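For orientation, a minimal instantiation sketch against the signature above. It assumes a local Elasticsearch on the default port; the import path is an assumption that may differ across Haystack versions:

```python
from haystack.document_store.elasticsearch import ElasticsearchDocumentStore  # path may vary by version

# Connect to a local Elasticsearch instance using the defaults shown above.
document_store = ElasticsearchDocumentStore(
    host="localhost",
    port=9200,
    index="document",
    similarity="dot_product",   # default; 'cosine' is recommended for Sentence BERT models
    return_embedding=False,     # new default introduced by this commit
)
```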
<a name="elasticsearch.ElasticsearchDocumentStore.get_document_by_id"></a>
|
||||
#### get\_document\_by\_id
|
||||
|
||||
```python
|
||||
| get_document_by_id(id: str, index=None) -> Optional[Document]
|
||||
```
|
||||
|
||||
Fetch a document by specifying its text id string
|
||||
|
||||
<a name="elasticsearch.ElasticsearchDocumentStore.get_documents_by_id"></a>
|
||||
#### get\_documents\_by\_id
|
||||
|
||||
```python
|
||||
| get_documents_by_id(ids: List[str], index=None) -> List[Document]
|
||||
```
|
||||
|
||||
Fetch documents by specifying a list of text id strings
|
||||
|
||||
<a name="elasticsearch.ElasticsearchDocumentStore.write_documents"></a>
|
||||
#### write\_documents
|
||||
|
||||
```python
|
||||
| @abstractmethod
|
||||
| write_documents(documents: Union[List[dict], List[Document]], index: Optional[str] = None)
|
||||
```
|
||||
|
||||
@ -90,6 +106,125 @@ should be changed to what you have set for self.text_field and self.name_field.

None
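The full write_documents() argument docs fall between hunks here; as a hedged illustration of the dict format described elsewhere on this page, a write call might look like this (index name and meta keys are arbitrary examples):

```python
# Documents as plain dicts: "text" is required, "meta" is optional and filterable.
docs = [
    {"text": "Haystack is a framework for building search systems.",
     "meta": {"name": "intro.txt", "author": "somebody"}},
    {"text": "Elasticsearch stores and queries the documents.",
     "meta": {"name": "es.txt", "author": "somebody"}},
]
document_store.write_documents(docs, index="document")
```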
<a name="elasticsearch.ElasticsearchDocumentStore.write_labels"></a>
|
||||
#### write\_labels
|
||||
|
||||
```python
|
||||
| write_labels(labels: Union[List[Label], List[dict]], index: Optional[str] = None)
|
||||
```
|
||||
|
||||
Write annotation labels into document store.
|
||||
|
||||
<a name="elasticsearch.ElasticsearchDocumentStore.update_document_meta"></a>
|
||||
#### update\_document\_meta
|
||||
|
||||
```python
|
||||
| update_document_meta(id: str, meta: Dict[str, str])
|
||||
```
|
||||
|
||||
Update the metadata dictionary of a document by specifying its string id
|
||||
|
||||
<a name="elasticsearch.ElasticsearchDocumentStore.get_document_count"></a>
|
||||
#### get\_document\_count
|
||||
|
||||
```python
|
||||
| get_document_count(filters: Optional[Dict[str, List[str]]] = None, index: Optional[str] = None) -> int
|
||||
```
|
||||
|
||||
Return the number of documents in the document store.
|
||||
|
||||
<a name="elasticsearch.ElasticsearchDocumentStore.get_label_count"></a>
|
||||
#### get\_label\_count
|
||||
|
||||
```python
|
||||
| get_label_count(index: Optional[str] = None) -> int
|
||||
```
|
||||
|
||||
Return the number of labels in the document store
|
||||
|
||||
<a name="elasticsearch.ElasticsearchDocumentStore.get_all_documents"></a>
|
||||
#### get\_all\_documents
|
||||
|
||||
```python
|
||||
| get_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None, return_embedding: Optional[bool] = None) -> List[Document]
|
||||
```
|
||||
|
||||
Get documents from the document store.
|
||||
|
||||
**Arguments**:
|
||||
|
||||
- `index`: Name of the index to get the documents from. If None, the
|
||||
DocumentStore's default index (self.index) will be used.
|
||||
- `filters`: Optional filters to narrow down the documents to return.
|
||||
Example: {"name": ["some", "more"], "category": ["only_one"]}
|
||||
- `return_embedding`: Whether to return the document embeddings.
|
||||
|
||||
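A short sketch of the filter syntax documented above (field names are illustrative):

```python
# Fetch only documents whose meta fields match the given accepted values.
docs = document_store.get_all_documents(
    index="document",
    filters={"name": ["some", "more"], "category": ["only_one"]},
    return_embedding=False,
)
```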
<a name="elasticsearch.ElasticsearchDocumentStore.get_all_labels"></a>
|
||||
#### get\_all\_labels
|
||||
|
||||
```python
|
||||
| get_all_labels(index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None) -> List[Label]
|
||||
```
|
||||
|
||||
Return all labels in the document store
|
||||
|
||||
<a name="elasticsearch.ElasticsearchDocumentStore.get_all_documents_in_index"></a>
|
||||
#### get\_all\_documents\_in\_index
|
||||
|
||||
```python
|
||||
| get_all_documents_in_index(index: str, filters: Optional[Dict[str, List[str]]] = None) -> List[dict]
|
||||
```
|
||||
|
||||
Return all documents in a specific index in the document store
|
||||
|
||||
<a name="elasticsearch.ElasticsearchDocumentStore.query"></a>
|
||||
#### query
|
||||
|
||||
```python
|
||||
| query(query: Optional[str], filters: Optional[Dict[str, List[str]]] = None, top_k: int = 10, custom_query: Optional[str] = None, index: Optional[str] = None) -> List[Document]
|
||||
```
|
||||
|
||||
Scan through documents in DocumentStore and return a small number documents
|
||||
that are most relevant to the query as defined by the BM25 algorithm.
|
||||
|
||||
**Arguments**:
|
||||
|
||||
- `query`: The query
|
||||
- `filters`: A dictionary where the keys specify a metadata field and the value is a list of accepted values for that field
|
||||
- `top_k`: How many documents to return per query.
|
||||
- `index`: The name of the index in the DocumentStore from which to retrieve documents
|
||||
|
||||
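A hedged usage sketch of query(); the query string and filter values are made up:

```python
# BM25 full-text search over the indexed documents.
results = document_store.query(
    query="Who invented the telephone?",
    filters={"category": ["only_one"]},  # optional metadata filter
    top_k=10,
)
for doc in results:
    print(doc.text[:80])
```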
<a name="elasticsearch.ElasticsearchDocumentStore.query_by_embedding"></a>
|
||||
#### query\_by\_embedding
|
||||
|
||||
```python
|
||||
| query_by_embedding(query_emb: np.array, filters: Optional[Dict[str, List[str]]] = None, top_k: int = 10, index: Optional[str] = None, return_embedding: Optional[bool] = None) -> List[Document]
|
||||
```
|
||||
|
||||
Find the document that is most similar to the provided `query_emb` by using a vector similarity metric.
|
||||
|
||||
**Arguments**:
|
||||
|
||||
- `query_emb`: Embedding of the query (e.g. gathered from DPR)
|
||||
- `filters`: Optional filters to narrow down the search space.
|
||||
Example: {"name": ["some", "more"], "category": ["only_one"]}
|
||||
- `top_k`: How many documents to return
|
||||
- `index`: Index name for storing the docs and metadata
|
||||
- `return_embedding`: To return document embedding
|
||||
|
||||
**Returns**:
|
||||
|
||||
|
||||
|
||||
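As a sketch, assuming you already have a query embedding of the right dimensionality (in practice from a DPR query encoder, not random values):

```python
import numpy as np

# query_emb must match the store's embedding_dim (768 by default).
query_emb = np.random.rand(768).astype("float32")  # placeholder; use a real encoder output
similar_docs = document_store.query_by_embedding(query_emb=query_emb, top_k=5)
```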
<a name="elasticsearch.ElasticsearchDocumentStore.describe_documents"></a>
|
||||
#### describe\_documents
|
||||
|
||||
```python
|
||||
| describe_documents(index=None)
|
||||
```
|
||||
|
||||
Return a summary of the documents in the document store
|
||||
|
||||
<a name="elasticsearch.ElasticsearchDocumentStore.update_embeddings"></a>
|
||||
#### update\_embeddings
|
||||
|
||||
@ -181,6 +316,55 @@ separate index than the documents for search.

None
<a name="memory.InMemoryDocumentStore.write_labels"></a>
|
||||
#### write\_labels
|
||||
|
||||
```python
|
||||
| write_labels(labels: Union[List[dict], List[Label]], index: Optional[str] = None)
|
||||
```
|
||||
|
||||
Write annotation labels into document store.
|
||||
|
||||
<a name="memory.InMemoryDocumentStore.get_document_by_id"></a>
|
||||
#### get\_document\_by\_id
|
||||
|
||||
```python
|
||||
| get_document_by_id(id: str, index: Optional[str] = None) -> Optional[Document]
|
||||
```
|
||||
|
||||
Fetch a document by specifying its text id string
|
||||
|
||||
<a name="memory.InMemoryDocumentStore.get_documents_by_id"></a>
|
||||
#### get\_documents\_by\_id
|
||||
|
||||
```python
|
||||
| get_documents_by_id(ids: List[str], index: Optional[str] = None) -> List[Document]
|
||||
```
|
||||
|
||||
Fetch documents by specifying a list of text id strings
|
||||
|
||||
<a name="memory.InMemoryDocumentStore.query_by_embedding"></a>
|
||||
#### query\_by\_embedding
|
||||
|
||||
```python
|
||||
| query_by_embedding(query_emb: List[float], filters: Optional[Dict[str, List[str]]] = None, top_k: int = 10, index: Optional[str] = None, return_embedding: Optional[bool] = None) -> List[Document]
|
||||
```
|
||||
|
||||
Find the document that is most similar to the provided `query_emb` by using a vector similarity metric.
|
||||
|
||||
**Arguments**:
|
||||
|
||||
- `query_emb`: Embedding of the query (e.g. gathered from DPR)
|
||||
- `filters`: Optional filters to narrow down the search space.
|
||||
Example: {"name": ["some", "more"], "category": ["only_one"]}
|
||||
- `top_k`: How many documents to return
|
||||
- `index`: Index name for storing the docs and metadata
|
||||
- `return_embedding`: To return document embedding
|
||||
|
||||
**Returns**:
|
||||
|
||||
|
||||
|
||||
<a name="memory.InMemoryDocumentStore.update_embeddings"></a>
|
||||
#### update\_embeddings
|
||||
|
||||
@ -200,6 +384,50 @@ This can be useful if want to add or change the embeddings for your documents (e
|
||||
|
||||
None
|
||||
|
||||
<a name="memory.InMemoryDocumentStore.get_document_count"></a>
|
||||
#### get\_document\_count
|
||||
|
||||
```python
|
||||
| get_document_count(filters: Optional[Dict[str, List[str]]] = None, index: Optional[str] = None) -> int
|
||||
```
|
||||
|
||||
Return the number of documents in the document store.
|
||||
|
||||
<a name="memory.InMemoryDocumentStore.get_label_count"></a>
|
||||
#### get\_label\_count
|
||||
|
||||
```python
|
||||
| get_label_count(index: Optional[str] = None) -> int
|
||||
```
|
||||
|
||||
Return the number of labels in the document store
|
||||
|
||||
<a name="memory.InMemoryDocumentStore.get_all_documents"></a>
|
||||
#### get\_all\_documents
|
||||
|
||||
```python
|
||||
| get_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None, return_embedding: Optional[bool] = None) -> List[Document]
|
||||
```
|
||||
|
||||
Get documents from the document store.
|
||||
|
||||
**Arguments**:
|
||||
|
||||
- `index`: Name of the index to get the documents from. If None, the
|
||||
DocumentStore's default index (self.index) will be used.
|
||||
- `filters`: Optional filters to narrow down the documents to return.
|
||||
Example: {"name": ["some", "more"], "category": ["only_one"]}
|
||||
- `return_embedding`: Whether to return the document embeddings.
|
||||
|
||||
<a name="memory.InMemoryDocumentStore.get_all_labels"></a>
|
||||
#### get\_all\_labels
|
||||
|
||||
```python
|
||||
| get_all_labels(index: str = None, filters: Optional[Dict[str, List[str]]] = None) -> List[Label]
|
||||
```
|
||||
|
||||
Return all labels in the document store
|
||||
|
||||
<a name="memory.InMemoryDocumentStore.add_eval_data"></a>
|
||||
#### add\_eval\_data
|
||||
|
||||
@ -236,143 +464,6 @@ Delete documents in an index. All documents are deleted if no filters are passed.

None
<a name="sql"></a>
|
||||
# Module sql
|
||||
|
||||
<a name="sql.SQLDocumentStore"></a>
|
||||
## SQLDocumentStore Objects
|
||||
|
||||
```python
|
||||
class SQLDocumentStore(BaseDocumentStore)
|
||||
```
|
||||
|
||||
<a name="sql.SQLDocumentStore.__init__"></a>
|
||||
#### \_\_init\_\_
|
||||
|
||||
```python
|
||||
| __init__(url: str = "sqlite://", index: str = "document", label_index: str = "label", update_existing_documents: bool = False)
|
||||
```
|
||||
|
||||
**Arguments**:
|
||||
|
||||
- `url`: URL for SQL database as expected by SQLAlchemy. More info here: https://docs.sqlalchemy.org/en/13/core/engines.html#database-urls
|
||||
- `index`: The documents are scoped to an index attribute that can be used when writing, querying, or deleting documents.
|
||||
This parameter sets the default value for document index.
|
||||
- `label_index`: The default value of index attribute for the labels.
|
||||
- `update_existing_documents`: Whether to update any existing documents with the same ID when adding
|
||||
documents. When set as True, any document with an existing ID gets updated.
|
||||
If set to False, an error is raised if the document ID of the document being
|
||||
added already exists. Using this parameter coud cause performance degradation for document insertion.
|
||||
|
||||
<a name="sql.SQLDocumentStore.write_documents"></a>
|
||||
#### write\_documents
|
||||
|
||||
```python
|
||||
| write_documents(documents: Union[List[dict], List[Document]], index: Optional[str] = None)
|
||||
```
|
||||
|
||||
Indexes documents for later queries.
|
||||
|
||||
**Arguments**:
|
||||
|
||||
- `documents`: a list of Python dictionaries or a list of Haystack Document objects.
|
||||
For documents as dictionaries, the format is {"text": "<the-actual-text>"}.
|
||||
Optionally: Include meta data via {"text": "<the-actual-text>",
|
||||
"meta": {"name": "<some-document-name>, "author": "somebody", ...}}
|
||||
It can be used for filtering and is accessible in the responses of the Finder.
|
||||
- `index`: add an optional index attribute to documents. It can be later used for filtering. For instance,
|
||||
documents for evaluation can be indexed in a separate index than the documents for search.
|
||||
|
||||
**Returns**:
|
||||
|
||||
None
|
||||
|
||||
<a name="sql.SQLDocumentStore.update_vector_ids"></a>
|
||||
#### update\_vector\_ids
|
||||
|
||||
```python
|
||||
| update_vector_ids(vector_id_map: Dict[str, str], index: Optional[str] = None)
|
||||
```
|
||||
|
||||
Update vector_ids for given document_ids.
|
||||
|
||||
**Arguments**:
|
||||
|
||||
- `vector_id_map`: dict containing mapping of document_id -> vector_id.
|
||||
- `index`: filter documents by the optional index attribute for documents in database.
|
||||
|
||||
<a name="sql.SQLDocumentStore.add_eval_data"></a>
|
||||
#### add\_eval\_data
|
||||
|
||||
```python
|
||||
| add_eval_data(filename: str, doc_index: str = "eval_document", label_index: str = "label")
|
||||
```
|
||||
|
||||
Adds a SQuAD-formatted file to the DocumentStore in order to be able to perform evaluation on it.
|
||||
|
||||
**Arguments**:
|
||||
|
||||
- `filename`: Name of the file containing evaluation data
|
||||
:type filename: str
|
||||
- `doc_index`: Elasticsearch index where evaluation documents should be stored
|
||||
:type doc_index: str
|
||||
- `label_index`: Elasticsearch index where labeled questions should be stored
|
||||
:type label_index: str
|
||||
|
||||
<a name="sql.SQLDocumentStore.delete_all_documents"></a>
|
||||
#### delete\_all\_documents
|
||||
|
||||
```python
|
||||
| delete_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None)
|
||||
```
|
||||
|
||||
Delete documents in an index. All documents are deleted if no filters are passed.
|
||||
|
||||
**Arguments**:
|
||||
|
||||
- `index`: Index name to delete the document from.
|
||||
- `filters`: Optional filters to narrow down the documents to be deleted.
|
||||
|
||||
**Returns**:
|
||||
|
||||
None
|
||||
|
||||
<a name="base"></a>
|
||||
# Module base
|
||||
|
||||
<a name="base.BaseDocumentStore"></a>
|
||||
## BaseDocumentStore Objects
|
||||
|
||||
```python
|
||||
class BaseDocumentStore(ABC)
|
||||
```
|
||||
|
||||
Base class for implementing Document Stores.
|
||||
|
||||
<a name="base.BaseDocumentStore.write_documents"></a>
|
||||
#### write\_documents
|
||||
|
||||
```python
|
||||
| @abstractmethod
|
||||
| write_documents(documents: Union[List[dict], List[Document]], index: Optional[str] = None)
|
||||
```
|
||||
|
||||
Indexes documents for later queries.
|
||||
|
||||
**Arguments**:
|
||||
|
||||
- `documents`: a list of Python dictionaries or a list of Haystack Document objects.
|
||||
For documents as dictionaries, the format is {"text": "<the-actual-text>"}.
|
||||
Optionally: Include meta data via {"text": "<the-actual-text>",
|
||||
"meta":{"name": "<some-document-name>, "author": "somebody", ...}}
|
||||
It can be used for filtering and is accessible in the responses of the Finder.
|
||||
- `index`: Optional name of index where the documents shall be written to.
|
||||
If None, the DocumentStore's default index (self.index) will be used.
|
||||
|
||||
**Returns**:
|
||||
|
||||
None
|
||||
|
||||
<a name="faiss"></a>
|
||||
# Module faiss
|
||||
|
||||
@ -395,7 +486,7 @@ the vector embeddings are indexed in a FAISS Index.
|
||||
#### \_\_init\_\_
|
||||
|
||||
```python
|
||||
| __init__(sql_url: str = "sqlite:///", index_buffer_size: int = 10_000, vector_dim: int = 768, faiss_index_factory_str: str = "Flat", faiss_index: Optional[faiss.swigfaiss.Index] = None, return_embedding: Optional[bool] = True, update_existing_documents: bool = False, index: str = "document", **kwargs, ,)
|
||||
| __init__(sql_url: str = "sqlite:///", index_buffer_size: int = 10_000, vector_dim: int = 768, faiss_index_factory_str: str = "Flat", faiss_index: Optional[faiss.swigfaiss.Index] = None, return_embedding: bool = False, update_existing_documents: bool = False, index: str = "document", **kwargs, ,)
|
||||
```
|
||||
|
||||
**Arguments**:
|
||||
@ -467,6 +558,23 @@ This can be useful if you want to add or change the embeddings for your documents (e

None
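A minimal sketch of the FAISS flow implied above; the constructor arguments mirror the signature in this diff, while the import path and the assumption that update_embeddings() takes a retriever (whose exact signature is elided between hunks) may differ across versions:

```python
from haystack.document_store.faiss import FAISSDocumentStore  # path may vary by version

# The SQL backend holds texts and meta data; the FAISS index holds the vectors.
document_store = FAISSDocumentStore(sql_url="sqlite:///faiss_doc_store.db",
                                    faiss_index_factory_str="Flat")
document_store.write_documents(docs)
# Compute and store embeddings for all documents with a retriever (e.g. DPR).
document_store.update_embeddings(retriever=retriever)
```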
<a name="faiss.FAISSDocumentStore.get_all_documents"></a>
|
||||
#### get\_all\_documents
|
||||
|
||||
```python
|
||||
| get_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None, return_embedding: Optional[bool] = None) -> List[Document]
|
||||
```
|
||||
|
||||
Get documents from the document store.
|
||||
|
||||
**Arguments**:
|
||||
|
||||
- `index`: Name of the index to get the documents from. If None, the
|
||||
DocumentStore's default index (self.index) will be used.
|
||||
- `filters`: Optional filters to narrow down the documents to return.
|
||||
Example: {"name": ["some", "more"], "category": ["only_one"]}
|
||||
- `return_embedding`: Whether to return the document embeddings.
|
||||
|
||||
<a name="faiss.FAISSDocumentStore.train_index"></a>
|
||||
#### train\_index
|
||||
|
||||
@ -487,6 +595,15 @@ You can pass either documents (incl. embeddings) or just the plain embeddings th
|
||||
|
||||
None
|
||||
|
||||
<a name="faiss.FAISSDocumentStore.delete_all_documents"></a>
|
||||
#### delete\_all\_documents
|
||||
|
||||
```python
|
||||
| delete_all_documents(index=None)
|
||||
```
|
||||
|
||||
Delete all documents from the document store.
|
||||
|
||||
<a name="faiss.FAISSDocumentStore.query_by_embedding"></a>
|
||||
#### query\_by\_embedding
|
||||
|
||||
@ -549,3 +666,247 @@ smaller chunks to reduce memory footprint.
|
||||
|
||||
|
||||
|
||||
<a name="sql"></a>
|
||||
# Module sql
|
||||
|
||||
<a name="sql.SQLDocumentStore"></a>
|
||||
## SQLDocumentStore Objects
|
||||
|
||||
```python
|
||||
class SQLDocumentStore(BaseDocumentStore)
|
||||
```
|
||||
|
||||
<a name="sql.SQLDocumentStore.__init__"></a>
|
||||
#### \_\_init\_\_
|
||||
|
||||
```python
|
||||
| __init__(url: str = "sqlite://", index: str = "document", label_index: str = "label", update_existing_documents: bool = False)
|
||||
```
|
||||
|
||||
**Arguments**:
|
||||
|
||||
- `url`: URL for SQL database as expected by SQLAlchemy. More info here: https://docs.sqlalchemy.org/en/13/core/engines.html#database-urls
|
||||
- `index`: The documents are scoped to an index attribute that can be used when writing, querying, or deleting documents.
|
||||
This parameter sets the default value for document index.
|
||||
- `label_index`: The default value of index attribute for the labels.
|
||||
- `update_existing_documents`: Whether to update any existing documents with the same ID when adding
|
||||
documents. When set as True, any document with an existing ID gets updated.
|
||||
If set to False, an error is raised if the document ID of the document being
|
||||
added already exists. Using this parameter coud cause performance degradation for document insertion.
|
||||
|
||||
<a name="sql.SQLDocumentStore.get_document_by_id"></a>
|
||||
#### get\_document\_by\_id
|
||||
|
||||
```python
|
||||
| get_document_by_id(id: str, index: Optional[str] = None) -> Optional[Document]
|
||||
```
|
||||
|
||||
Fetch a document by specifying its text id string
|
||||
|
||||
<a name="sql.SQLDocumentStore.get_documents_by_id"></a>
|
||||
#### get\_documents\_by\_id
|
||||
|
||||
```python
|
||||
| get_documents_by_id(ids: List[str], index: Optional[str] = None) -> List[Document]
|
||||
```
|
||||
|
||||
Fetch documents by specifying a list of text id strings
|
||||
|
||||
<a name="sql.SQLDocumentStore.get_documents_by_vector_ids"></a>
|
||||
#### get\_documents\_by\_vector\_ids
|
||||
|
||||
```python
|
||||
| get_documents_by_vector_ids(vector_ids: List[str], index: Optional[str] = None)
|
||||
```
|
||||
|
||||
Fetch documents by specifying a list of text vector id strings
|
||||
|
||||
<a name="sql.SQLDocumentStore.get_all_documents"></a>
|
||||
#### get\_all\_documents
|
||||
|
||||
```python
|
||||
| get_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None, return_embedding: Optional[bool] = None) -> List[Document]
|
||||
```
|
||||
|
||||
Get documents from the document store.
|
||||
|
||||
**Arguments**:
|
||||
|
||||
- `index`: Name of the index to get the documents from. If None, the
|
||||
DocumentStore's default index (self.index) will be used.
|
||||
- `filters`: Optional filters to narrow down the documents to return.
|
||||
Example: {"name": ["some", "more"], "category": ["only_one"]}
|
||||
- `return_embedding`: Whether to return the document embeddings.
|
||||
|
||||
<a name="sql.SQLDocumentStore.get_all_labels"></a>
|
||||
#### get\_all\_labels
|
||||
|
||||
```python
|
||||
| get_all_labels(index=None, filters: Optional[dict] = None)
|
||||
```
|
||||
|
||||
Return all labels in the document store
|
||||
|
||||
<a name="sql.SQLDocumentStore.write_documents"></a>
|
||||
#### write\_documents
|
||||
|
||||
```python
|
||||
| write_documents(documents: Union[List[dict], List[Document]], index: Optional[str] = None)
|
||||
```
|
||||
|
||||
Indexes documents for later queries.
|
||||
|
||||
**Arguments**:
|
||||
|
||||
- `documents`: a list of Python dictionaries or a list of Haystack Document objects.
|
||||
For documents as dictionaries, the format is {"text": "<the-actual-text>"}.
|
||||
Optionally: Include meta data via {"text": "<the-actual-text>",
|
||||
"meta":{"name": "<some-document-name>, "author": "somebody", ...}}
|
||||
It can be used for filtering and is accessible in the responses of the Finder.
|
||||
- `index`: add an optional index attribute to documents. It can be later used for filtering. For instance,
|
||||
documents for evaluation can be indexed in a separate index than the documents for search.
|
||||
|
||||
**Returns**:
|
||||
|
||||
None
|
||||
|
||||
<a name="sql.SQLDocumentStore.write_labels"></a>
|
||||
#### write\_labels
|
||||
|
||||
```python
|
||||
| write_labels(labels, index=None)
|
||||
```
|
||||
|
||||
Write annotation labels into document store.
|
||||
|
||||
<a name="sql.SQLDocumentStore.update_vector_ids"></a>
|
||||
#### update\_vector\_ids
|
||||
|
||||
```python
|
||||
| update_vector_ids(vector_id_map: Dict[str, str], index: Optional[str] = None)
|
||||
```
|
||||
|
||||
Update vector_ids for given document_ids.
|
||||
|
||||
**Arguments**:
|
||||
|
||||
- `vector_id_map`: dict containing mapping of document_id -> vector_id.
|
||||
- `index`: filter documents by the optional index attribute for documents in database.
|
||||
|
||||
<a name="sql.SQLDocumentStore.update_document_meta"></a>
|
||||
#### update\_document\_meta
|
||||
|
||||
```python
|
||||
| update_document_meta(id: str, meta: Dict[str, str])
|
||||
```
|
||||
|
||||
Update the metadata dictionary of a document by specifying its string id
|
||||
|
||||
<a name="sql.SQLDocumentStore.add_eval_data"></a>
|
||||
#### add\_eval\_data
|
||||
|
||||
```python
|
||||
| add_eval_data(filename: str, doc_index: str = "eval_document", label_index: str = "label")
|
||||
```
|
||||
|
||||
Adds a SQuAD-formatted file to the DocumentStore in order to be able to perform evaluation on it.
|
||||
|
||||
**Arguments**:
|
||||
|
||||
- `filename`: Name of the file containing evaluation data
|
||||
:type filename: str
|
||||
- `doc_index`: Elasticsearch index where evaluation documents should be stored
|
||||
:type doc_index: str
|
||||
- `label_index`: Elasticsearch index where labeled questions should be stored
|
||||
:type label_index: str
|
||||
|
||||
<a name="sql.SQLDocumentStore.get_document_count"></a>
|
||||
#### get\_document\_count
|
||||
|
||||
```python
|
||||
| get_document_count(filters: Optional[Dict[str, List[str]]] = None, index: Optional[str] = None) -> int
|
||||
```
|
||||
|
||||
Return the number of documents in the document store.
|
||||
|
||||
<a name="sql.SQLDocumentStore.get_label_count"></a>
|
||||
#### get\_label\_count
|
||||
|
||||
```python
|
||||
| get_label_count(index: Optional[str] = None) -> int
|
||||
```
|
||||
|
||||
Return the number of labels in the document store
|
||||
|
||||
<a name="sql.SQLDocumentStore.delete_all_documents"></a>
|
||||
#### delete\_all\_documents
|
||||
|
||||
```python
|
||||
| delete_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None)
|
||||
```
|
||||
|
||||
Delete documents in an index. All documents are deleted if no filters are passed.
|
||||
|
||||
**Arguments**:
|
||||
|
||||
- `index`: Index name to delete the document from.
|
||||
- `filters`: Optional filters to narrow down the documents to be deleted.
|
||||
|
||||
**Returns**:
|
||||
|
||||
None
|
||||
|
||||
<a name="base"></a>
|
||||
# Module base
|
||||
|
||||
<a name="base.BaseDocumentStore"></a>
|
||||
## BaseDocumentStore Objects
|
||||
|
||||
```python
|
||||
class BaseDocumentStore(ABC)
|
||||
```
|
||||
|
||||
Base class for implementing Document Stores.
|
||||
|
||||
<a name="base.BaseDocumentStore.write_documents"></a>
|
||||
#### write\_documents
|
||||
|
||||
```python
|
||||
| @abstractmethod
|
||||
| write_documents(documents: Union[List[dict], List[Document]], index: Optional[str] = None)
|
||||
```
|
||||
|
||||
Indexes documents for later queries.
|
||||
|
||||
**Arguments**:
|
||||
|
||||
- `documents`: a list of Python dictionaries or a list of Haystack Document objects.
|
||||
For documents as dictionaries, the format is {"text": "<the-actual-text>"}.
|
||||
Optionally: Include meta data via {"text": "<the-actual-text>",
|
||||
"meta":{"name": "<some-document-name>, "author": "somebody", ...}}
|
||||
It can be used for filtering and is accessible in the responses of the Finder.
|
||||
- `index`: Optional name of index where the documents shall be written to.
|
||||
If None, the DocumentStore's default index (self.index) will be used.
|
||||
|
||||
**Returns**:
|
||||
|
||||
None
|
||||
|
||||
<a name="base.BaseDocumentStore.get_all_documents"></a>
|
||||
#### get\_all\_documents
|
||||
|
||||
```python
|
||||
| @abstractmethod
|
||||
| get_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None, return_embedding: Optional[bool] = None) -> List[Document]
|
||||
```
|
||||
|
||||
Get documents from the document store.
|
||||
|
||||
**Arguments**:
|
||||
|
||||
- `index`: Name of the index to get the documents from. If None, the
|
||||
DocumentStore's default index (self.index) will be used.
|
||||
- `filters`: Optional filters to narrow down the documents to return.
|
||||
Example: {"name": ["some", "more"], "category": ["only_one"]}
|
||||
- `return_embedding`: Whether to return the document embeddings.
|
||||
|
||||
|
||||
@ -1,87 +1,3 @@

<a name="txt"></a>
# Module txt

<a name="txt.TextConverter"></a>
## TextConverter Objects

```python
class BaseConverter()
```

Base class for implementing file converters to transform input documents to text format for ingestion in DocumentStore.

<a name="base.BaseConverter.__init__"></a>
#### \_\_init\_\_

```python
| __init__(remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None)
```

**Arguments**:

- `remove_numeric_tables`: This option uses heuristics to remove numeric rows from the tables.
The tabular structures in documents might be noise for the reader model if it
does not have table parsing capability for finding answers. However, tables
may also have long strings that could be possible candidates for searching answers.
The rows containing strings are thus retained in this option.
- `valid_languages`: validate languages from a list of languages specified in the ISO 639-1
(https://en.wikipedia.org/wiki/ISO_639-1) format.
This option can be used to add tests for encoding errors. If the extracted text is
not one of the valid languages, then it is likely an encoding error resulting
in garbled text.
<a name="base.BaseConverter.convert"></a>
|
||||
#### convert
|
||||
|
||||
```python
|
||||
| @abstractmethod
|
||||
| convert(file_path: Path, meta: Optional[Dict[str, str]]) -> Dict[str, Any]
|
||||
```
|
||||
|
||||
Convert a file to a dictionary containing the text and any associated meta data.
|
||||
|
||||
File converters may extract file meta like name or size. In addition to it, user
|
||||
supplied meta data like author, url, external IDs can be supplied as a dictionary.
|
||||
|
||||
**Arguments**:
|
||||
|
||||
- `file_path`: path of the file to convert
|
||||
- `meta`: dictionary of meta data key-value pairs to append in the returned document.
|
||||
|
||||
<a name="base.BaseConverter.validate_language"></a>
|
||||
#### validate\_language
|
||||
|
||||
```python
|
||||
| validate_language(text: str) -> bool
|
||||
```
|
||||
|
||||
Validate if the language of the text is one of valid languages.
|
||||
|
||||
<a name="docx"></a>
|
||||
# docx
|
||||
|
||||
<a name="docx.DocxToTextConverter"></a>
|
||||
## DocxToTextConverter
|
||||
|
||||
```python
|
||||
class DocxToTextConverter(BaseConverter)
|
||||
```
|
||||
|
||||
<a name="docx.DocxToTextConverter.convert"></a>
|
||||
#### convert
|
||||
|
||||
```python
|
||||
| convert(file_path: Path, meta: Optional[Dict[str, str]] = None) -> Dict[str, Any]
|
||||
```
|
||||
|
||||
Extract text from a .docx file.
|
||||
Note: As docx doesn't contain "page" information, we actually extract and return a list of paragraphs here.
|
||||
For compliance with other converters we nevertheless opted for keeping the methods name.
|
||||
|
||||
**Arguments**:
|
||||
|
||||
- `file_path`: Path to the .docx file you want to convert
|
||||
|
||||
<a name="docx"></a>
|
||||
# Module docx
|
||||
|
||||
@ -153,11 +69,11 @@ in garbled text.
|
||||
|
||||
a list of pages and the extracted meta data of the file.
|
||||
|
||||
<a name="base"></a>
|
||||
# Module base
|
||||
<a name="txt"></a>
|
||||
# Module txt
|
||||
|
||||
<a name="base.BaseConverter"></a>
|
||||
## BaseConverter Objects
|
||||
<a name="txt.TextConverter"></a>
|
||||
## TextConverter Objects
|
||||
|
||||
```python
|
||||
class TextConverter(BaseConverter)
|
||||
@ -232,3 +148,75 @@ This option can be used to add test for encoding errors. If the extracted text i
|
||||
not one of the valid languages, then it might likely be encoding error resulting
|
||||
in garbled text.
|
||||
|
||||
<a name="pdf.PDFToTextConverter.convert"></a>
|
||||
#### convert
|
||||
|
||||
```python
|
||||
| convert(file_path: Path, meta: Optional[Dict[str, str]] = None) -> Dict[str, Any]
|
||||
```
|
||||
|
||||
Extract text from a .pdf file.
|
||||
|
||||
**Arguments**:
|
||||
|
||||
- `file_path`: Path to the .pdf file you want to convert
|
||||
|
||||
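A hedged sketch of the converter API shown above; the file paths are illustrative and the import path may differ across versions:

```python
from pathlib import Path
from haystack.file_converter.txt import TextConverter  # path may vary by version

converter = TextConverter(remove_numeric_tables=True, valid_languages=["en"])
# convert() returns a dict with the extracted "text" plus any merged meta data.
doc = converter.convert(file_path=Path("data/intro.txt"), meta={"author": "somebody"})
print(doc["text"][:100])
```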
<a name="base"></a>
|
||||
# Module base
|
||||
|
||||
<a name="base.BaseConverter"></a>
|
||||
## BaseConverter Objects
|
||||
|
||||
```python
|
||||
class BaseConverter()
|
||||
```
|
||||
|
||||
Base class for implementing file converts to transform input documents to text format for ingestion in DocumentStore.
|
||||
|
||||
<a name="base.BaseConverter.__init__"></a>
|
||||
#### \_\_init\_\_
|
||||
|
||||
```python
|
||||
| __init__(remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None)
|
||||
```
|
||||
|
||||
**Arguments**:
|
||||
|
||||
- `remove_numeric_tables`: This option uses heuristics to remove numeric rows from the tables.
|
||||
The tabular structures in documents might be noise for the reader model if it
|
||||
does not have table parsing capability for finding answers. However, tables
|
||||
may also have long strings that could possible candidate for searching answers.
|
||||
The rows containing strings are thus retained in this option.
|
||||
- `valid_languages`: validate languages from a list of languages specified in the ISO 639-1
|
||||
(https://en.wikipedia.org/wiki/ISO_639-1) format.
|
||||
This option can be used to add test for encoding errors. If the extracted text is
|
||||
not one of the valid languages, then it might likely be encoding error resulting
|
||||
in garbled text.
|
||||
|
||||
<a name="base.BaseConverter.convert"></a>
|
||||
#### convert
|
||||
|
||||
```python
|
||||
| @abstractmethod
|
||||
| convert(file_path: Path, meta: Optional[Dict[str, str]]) -> Dict[str, Any]
|
||||
```
|
||||
|
||||
Convert a file to a dictionary containing the text and any associated meta data.
|
||||
|
||||
File converters may extract file meta like name or size. In addition to it, user
|
||||
supplied meta data like author, url, external IDs can be supplied as a dictionary.
|
||||
|
||||
**Arguments**:
|
||||
|
||||
- `file_path`: path of the file to convert
|
||||
- `meta`: dictionary of meta data key-value pairs to append in the returned document.
|
||||
|
||||
<a name="base.BaseConverter.validate_language"></a>
|
||||
#### validate\_language
|
||||
|
||||
```python
|
||||
| validate_language(text: str) -> bool
|
||||
```
|
||||
|
||||
Validate if the language of the text is one of valid languages.
|
||||
|
||||
|
||||
@ -22,27 +22,30 @@ i.e. the model can easily adjust to domain documents even after training has fin
**Example**

```python
> question = "who got the first nobel prize in physics?"

# Retrieve related documents from retriever
> retrieved_docs = retriever.retrieve(query=question)

> # Now generate answer from question and retrieved documents
> generator.predict(
>    question=question,
>    documents=retrieved_docs,
>    top_k=1
> )
{'question': 'who got the first nobel prize in physics',
 'answers':
    [{'question': 'who got the first nobel prize in physics',
      'answer': ' albert einstein',
      'meta': { 'doc_ids': [...],
                'doc_scores': [80.42758 ...],
                'doc_probabilities': [40.71379089355469, ...
                'texts': ['Albert Einstein was a ...]
                'titles': ['"Albert Einstein"', ...]
    }}]}
| question = "who got the first nobel prize in physics?"
|
| # Retrieve related documents from retriever
| retrieved_docs = retriever.retrieve(query=question)
|
| # Now generate answer from question and retrieved documents
| generator.predict(
|    question=question,
|    documents=retrieved_docs,
|    top_k=1
| )
|
| # Answer
|
| {'question': 'who got the first nobel prize in physics',
|  'answers':
|      [{'question': 'who got the first nobel prize in physics',
|        'answer': ' albert einstein',
|        'meta': { 'doc_ids': [...],
|                  'doc_scores': [80.42758 ...],
|                  'doc_probabilities': [40.71379089355469, ...
|                  'texts': ['Albert Einstein was a ...]
|                  'titles': ['"Albert Einstein"', ...]
|        }}]}
```
<a name="transformers.RAGenerator.__init__"></a>
|
||||
@ -91,16 +94,16 @@ These document can for example be retrieved via the Retriever.
|
||||
Generated answers plus additional infos in a dict like this:
|
||||
|
||||
```python
|
||||
> {'question': 'who got the first nobel prize in physics',
|
||||
> 'answers':
|
||||
> [{'question': 'who got the first nobel prize in physics',
|
||||
> 'answer': ' albert einstein',
|
||||
> 'meta': { 'doc_ids': [...],
|
||||
> 'doc_scores': [80.42758 ...],
|
||||
> 'doc_probabilities': [40.71379089355469, ...
|
||||
> 'texts': ['Albert Einstein was a ...]
|
||||
> 'titles': ['"Albert Einstein"', ...]
|
||||
> }}]}
|
||||
| {'question': 'who got the first nobel prize in physics',
|
||||
| 'answers':
|
||||
| [{'question': 'who got the first nobel prize in physics',
|
||||
| 'answer': ' albert einstein',
|
||||
| 'meta': { 'doc_ids': [...],
|
||||
| 'doc_scores': [80.42758 ...],
|
||||
| 'doc_probabilities': [40.71379089355469, ...
|
||||
| 'texts': ['Albert Einstein was a ...]
|
||||
| 'titles': ['"Albert Einstein"', ...]
|
||||
| }}]}
|
||||
```
|
||||
|
||||
<a name="base"></a>
|
||||
|
||||
@ -34,8 +34,26 @@ Set the value to None to disable striding behaviour.
to True, the individual split will always have complete sentences &
the number of words will be <= split_length.

<a name="cleaning"></a>
# Module cleaning
<a name="preprocessor.PreProcessor.clean"></a>
#### clean

```python
| clean(document: dict) -> dict
```

Perform document cleaning on a single document and return a single document. This method will deal with whitespaces, headers, footers
and empty lines. Its exact functionality is defined by the parameters passed into PreProcessor.__init__().

<a name="preprocessor.PreProcessor.split"></a>
#### split

```python
| split(document: dict) -> List[dict]
```

Perform document splitting on a single document. This method can split on different units, at different lengths,
with different strides. It can also respect sentence boundaries. Its exact functionality is defined by
the parameters passed into PreProcessor.__init__(). Takes a single document as input and returns a list of documents.
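As a hedged illustration of how clean() and split() chain together — the constructor parameters below are assumptions inferred from the splitting options mentioned in this diff, not confirmed by it:

```python
from haystack.preprocessor.preprocessor import PreProcessor  # path may vary by version

processor = PreProcessor(clean_whitespace=True,          # assumed parameter names
                         clean_empty_lines=True,
                         split_by="word",
                         split_length=200,
                         split_respect_sentence_boundary=True)
doc = {"text": "Some long raw text ...", "meta": {"name": "raw.txt"}}
cleaned = processor.clean(doc)     # one document in, one document out
splits = processor.split(cleaned)  # one document in, a list of smaller documents out
```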
<a name="utils"></a>
|
||||
# Module utils
|
||||
@ -126,3 +144,32 @@ bool if anything got fetched

<a name="base"></a>
# Module base

<a name="base.BasePreProcessor"></a>
## BasePreProcessor Objects

```python
class BasePreProcessor()
```

<a name="base.BasePreProcessor.process"></a>
#### process

```python
| process(document: dict) -> List[dict]
```

Perform document cleaning and splitting. Takes a single document as input and returns a list of documents.

<a name="cleaning"></a>
# Module cleaning

<a name="cleaning.clean_wiki_text"></a>
#### clean\_wiki\_text

```python
clean_wiki_text(text: str) -> str
```

Clean wikipedia text by removing multiple new lines, removing extremely short lines,
adding paragraph breaks and removing empty paragraphs.
@ -1,5 +1,90 @@

<a name="base"></a>
# base
<a name="transformers"></a>
# Module transformers

<a name="transformers.TransformersReader"></a>
## TransformersReader Objects

```python
class TransformersReader(BaseReader)
```

Transformer based model for extractive Question Answering using HuggingFace's transformers framework
(https://github.com/huggingface/transformers).
While the underlying model can vary (BERT, Roberta, DistilBERT, ...), the interface remains the same.
With this reader, you can directly get predictions via predict()

<a name="transformers.TransformersReader.__init__"></a>
#### \_\_init\_\_

```python
| __init__(model_name_or_path: str = "distilbert-base-uncased-distilled-squad", tokenizer: Optional[str] = None, context_window_size: int = 70, use_gpu: int = 0, top_k_per_candidate: int = 4, return_no_answers: bool = True, max_seq_len: int = 256, doc_stride: int = 128)
```

Load a QA model from Transformers.
Available models include:

- 'distilbert-base-uncased-distilled-squad'
- 'bert-large-cased-whole-word-masking-finetuned-squad'
- 'bert-large-uncased-whole-word-masking-finetuned-squad'

See https://huggingface.co/models for full list of available QA models
**Arguments**:

- `model_name_or_path`: Directory of a saved model or the name of a public model e.g. 'bert-base-cased',
'deepset/bert-base-cased-squad2', 'deepset/bert-base-cased-squad2', 'distilbert-base-uncased-distilled-squad'.
See https://huggingface.co/models for full list of available models.
- `tokenizer`: Name of the tokenizer (usually the same as model)
- `context_window_size`: Num of chars (before and after the answer) to return as "context" for each answer.
The context usually helps users to understand if the answer really makes sense.
- `use_gpu`: If < 0, then use cpu. If >= 0, this is the ordinal of the gpu to use
- `top_k_per_candidate`: How many answers to extract for each candidate doc that is coming from the retriever (might be a long text).
Note that this is not the number of "final answers" you will receive
(see `top_k` in TransformersReader.predict() or Finder.get_answers() for that)
and that no_answer can be included in the sorted list of predictions.
- `return_no_answers`: If True, the HuggingFace Transformers model could return a "no_answer" (i.e. when there is an unanswerable question)
If False, it cannot return a "no_answer". Note that `no_answer_boost` is unfortunately not available with TransformersReader.
If you would like to set no_answer_boost, use a `FARMReader`.
- `max_seq_len`: max sequence length of one input text for the model
- `doc_stride`: length of striding window for splitting long texts (used if len(text) > max_seq_len)
<a name="transformers.TransformersReader.predict"></a>
|
||||
#### predict
|
||||
|
||||
```python
|
||||
| predict(question: str, documents: List[Document], top_k: Optional[int] = None)
|
||||
```
|
||||
|
||||
Use loaded QA model to find answers for a question in the supplied list of Document.
|
||||
|
||||
Returns dictionaries containing answers sorted by (desc.) probability.
|
||||
Example:
|
||||
|
||||
```python
|
||||
|{
|
||||
| 'question': 'Who is the father of Arya Stark?',
|
||||
| 'answers':[
|
||||
| {'answer': 'Eddard,',
|
||||
| 'context': " She travels with her father, Eddard, to King's Landing when he is ",
|
||||
| 'offset_answer_start': 147,
|
||||
| 'offset_answer_end': 154,
|
||||
| 'probability': 0.9787139466668613,
|
||||
| 'score': None,
|
||||
| 'document_id': '1337'
|
||||
| },...
|
||||
| ]
|
||||
|}
|
||||
```
|
||||
|
||||
**Arguments**:
|
||||
|
||||
- `question`: Question string
|
||||
- `documents`: List of Document in which to search for the answer
|
||||
- `top_k`: The maximum number of answers to return
|
||||
|
||||
**Returns**:
|
||||
|
||||
Dict containing question and answers
|
||||
|
||||
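Putting the pieces together, a hedged end-to-end sketch of predict(). The model name matches the default above, the documents are assumed to come from a retriever set up earlier, and the import path may differ across versions:

```python
from haystack.reader.transformers import TransformersReader  # path may vary by version

reader = TransformersReader(model_name_or_path="distilbert-base-uncased-distilled-squad",
                            use_gpu=-1)  # < 0 selects CPU, per the docs above
candidate_docs = retriever.retrieve(query="Who is the father of Arya Stark?", top_k=10)
prediction = reader.predict(question="Who is the father of Arya Stark?",
                            documents=candidate_docs,
                            top_k=3)
for answer in prediction["answers"]:
    print(answer["answer"], answer["probability"])
```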
<a name="farm"></a>
|
||||
# Module farm
|
||||
@ -283,97 +368,6 @@ float32 could still be be more performant.
|
||||
- `task_type`: Type of task for the model. Available options: "question_answering" or "embeddings".
|
||||
- `opset_version`: ONNX opset version
|
||||
|
||||
<a name="transformers"></a>
|
||||
# Module transformers
|
||||
|
||||
<a name="transformers.TransformersReader"></a>
|
||||
## TransformersReader Objects
|
||||
|
||||
```python
|
||||
class TransformersReader(BaseReader)
|
||||
```
|
||||
|
||||
Transformer based model for extractive Question Answering using the HuggingFace's transformers framework
|
||||
(https://github.com/huggingface/transformers).
|
||||
While the underlying model can vary (BERT, Roberta, DistilBERT ...), the interface remains the same.
|
||||
|
||||
| With the reader, you can:
|
||||
|
||||
- directly get predictions via predict()
|
||||
|
||||
<a name="transformers.TransformersReader.__init__"></a>
|
||||
#### \_\_init\_\_
|
||||
|
||||
```python
|
||||
| __init__(model_name_or_path: str = "distilbert-base-uncased-distilled-squad", tokenizer: Optional[str] = None, context_window_size: int = 70, use_gpu: int = 0, top_k_per_candidate: int = 4, return_no_answers: bool = True, max_seq_len: int = 256, doc_stride: int = 128)
|
||||
```
|
||||
|
||||
Load a QA model from Transformers.
|
||||
Available models include:
|
||||
|
||||
- ``'distilbert-base-uncased-distilled-squad`'``
|
||||
- ``'bert-large-cased-whole-word-masking-finetuned-squad``'
|
||||
- ``'bert-large-uncased-whole-word-masking-finetuned-squad``'
|
||||
|
||||
See https://huggingface.co/models for full list of available QA models
|
||||
|
||||
**Arguments**:

- `model_name_or_path`: Directory of a saved model or the name of a public model e.g. 'bert-base-cased',
'deepset/bert-base-cased-squad2', 'deepset/bert-base-cased-squad2', 'distilbert-base-uncased-distilled-squad'.
See https://huggingface.co/models for full list of available models.
- `tokenizer`: Name of the tokenizer (usually the same as model)
- `context_window_size`: Num of chars (before and after the answer) to return as "context" for each answer.
The context usually helps users to understand if the answer really makes sense.
- `use_gpu`: If < 0, then use cpu. If >= 0, this is the ordinal of the gpu to use
- `top_k_per_candidate`: How many answers to extract for each candidate doc that is coming from the retriever (might be a long text).
Note that this is not the number of "final answers" you will receive
(see `top_k` in TransformersReader.predict() or Finder.get_answers() for that)
and that no_answer can be included in the sorted list of predictions.
- `return_no_answers`: If True, the HuggingFace Transformers model could return a "no_answer" (i.e. when there is an unanswerable question)
If False, it cannot return a "no_answer". Note that `no_answer_boost` is unfortunately not available with TransformersReader.
If you would like to set no_answer_boost, use a `FARMReader`.
- `max_seq_len`: max sequence length of one input text for the model
- `doc_stride`: length of striding window for splitting long texts (used if len(text) > max_seq_len)
<a name="transformers.TransformersReader.predict"></a>
|
||||
#### predict
|
||||
|
||||
```python
|
||||
| predict(question: str, documents: List[Document], top_k: Optional[int] = None)
|
||||
```
|
||||
|
||||
Use loaded QA model to find answers for a question in the supplied list of Document.
|
||||
|
||||
Returns dictionaries containing answers sorted by (desc.) probability.
|
||||
Example:
|
||||
|
||||
```python
|
||||
|{
|
||||
| 'question': 'Who is the father of Arya Stark?',
|
||||
| 'answers':[
|
||||
| {'answer': 'Eddard,',
|
||||
| 'context': " She travels with her father, Eddard, to King's Landing when he is ",
|
||||
| 'offset_answer_start': 147,
|
||||
| 'offset_answer_end': 154,
|
||||
| 'probability': 0.9787139466668613,
|
||||
| 'score': None,
|
||||
| 'document_id': '1337'
|
||||
| },...
|
||||
| ]
|
||||
|}
|
||||
```
|
||||
|
||||
**Arguments**:
|
||||
|
||||
- `question`: Question string
|
||||
- `documents`: List of Document in which to search for the answer
|
||||
- `top_k`: The maximum number of answers to return
|
||||
|
||||
**Returns**:
|
||||
|
||||
Dict containing question and answers
|
||||
|
||||
<a name="base"></a>
|
||||
# Module base
|
||||
|
||||
|
||||
@ -1,142 +1,3 @@

<a name="base"></a>
# base

<a name="base.BaseRetriever"></a>
## BaseRetriever

```python
class BaseRetriever(ABC)
```

<a name="base.BaseRetriever.retrieve"></a>
#### retrieve

```python
| @abstractmethod
| retrieve(query: str, filters: dict = None, top_k: int = 10, index: str = None) -> List[Document]
```

Scan through documents in DocumentStore and return a small number of documents
that are most relevant to the query.

**Arguments**:

- `query`: The query
- `filters`: A dictionary where the keys specify a metadata field and the value is a list of accepted values for that field
- `top_k`: How many documents to return per query.
- `index`: The name of the index in the DocumentStore from which to retrieve documents
<a name="base.BaseRetriever.eval"></a>
|
||||
#### eval
|
||||
|
||||
```python
|
||||
| eval(label_index: str = "label", doc_index: str = "eval_document", label_origin: str = "gold_label", top_k: int = 10, open_domain: bool = False, return_preds: bool = False) -> dict
|
||||
```
|
||||
|
||||
Performs evaluation on the Retriever.
|
||||
Retriever is evaluated based on whether it finds the correct document given the question string and at which
|
||||
position in the ranking of documents the correct document is.
|
||||
|
||||
| Returns a dict containing the following metrics:
|
||||
|
||||
- "recall": Proportion of questions for which correct document is among retrieved documents
|
||||
- "mrr": Mean of reciprocal rank. Rewards retrievers that give relevant documents a higher rank.
|
||||
Only considers the highest ranked relevant document.
|
||||
- "map": Mean of average precision for each question. Rewards retrievers that give relevant
|
||||
documents a higher rank. Considers all retrieved relevant documents. If ``open_domain=True``,
|
||||
average precision is normalized by the number of retrieved relevant documents per query.
|
||||
If ``open_domain=False``, average precision is normalized by the number of all relevant documents
|
||||
per query.
|
||||
|
||||
**Arguments**:
|
||||
|
||||
- `label_index`: Index/Table in DocumentStore where labeled questions are stored
|
||||
- `doc_index`: Index/Table in DocumentStore where documents that are used for evaluation are stored
|
||||
- `top_k`: How many documents to return per question
|
||||
- `open_domain`: If ``True``, retrieval will be evaluated by checking if the answer string to a question is
|
||||
contained in the retrieved docs (common approach in open-domain QA).
|
||||
If ``False``, retrieval uses a stricter evaluation that checks if the retrieved document ids
|
||||
are within ids explicitly stated in the labels.
|
||||
- `return_preds`: Whether to add predictions in the returned dictionary. If True, the returned dictionary
|
||||
contains the keys "predictions" and "metrics".
|
||||
|
||||
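A sketch of an eval() call, under the assumption that labels and evaluation documents were previously written via add_eval_data():

```python
# Evaluate retrieval quality against labeled questions stored in the document store.
metrics = retriever.eval(label_index="label",
                         doc_index="eval_document",
                         top_k=10,
                         open_domain=False)
print(metrics["recall"], metrics["mrr"], metrics["map"])
```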
<a name="sparse"></a>
|
||||
# Module sparse
|
||||
|
||||
<a name="sparse.ElasticsearchRetriever"></a>
|
||||
## ElasticsearchRetriever Objects
|
||||
|
||||
```python
|
||||
class ElasticsearchRetriever(BaseRetriever)
|
||||
```
|
||||
|
||||
<a name="sparse.ElasticsearchRetriever.__init__"></a>
|
||||
#### \_\_init\_\_
|
||||
|
||||
```python
|
||||
| __init__(document_store: ElasticsearchDocumentStore, custom_query: str = None)
|
||||
```
|
||||
|
||||
**Arguments**:
|
||||
|
||||
- `document_store`: an instance of a DocumentStore to retrieve documents from.
|
||||
- `custom_query`: query string as per Elasticsearch DSL with a mandatory question placeholder($question).
|
||||
|
||||
Optionally, ES `filter` clause can be added where the values of `terms` are placeholders
|
||||
that get substituted during runtime. The placeholder(${filter_name_1}, ${filter_name_2}..)
|
||||
names must match with the filters dict supplied in self.retrieve().
|
||||
::
|
||||
|
||||
**An example custom_query:**
```python
| {
|     "size": 10,
|     "query": {
|         "bool": {
|             "should": [{"multi_match": {
|                 "query": "${question}", // mandatory $question placeholder
|                 "type": "most_fields",
|                 "fields": ["text", "title"]}}],
|             "filter": [ // optional custom filters
|                 {"terms": {"year": "${years}"}},
|                 {"terms": {"quarter": "${quarters}"}},
|                 {"range": {"date": {"gte": "${date}"}}}
|             ],
|         }
|     },
| }
```

**For this custom_query, a sample retrieve() could be:**
```python
| self.retrieve(query="Why did the revenue increase?",
|               filters={"years": ["2019"], "quarters": ["Q1", "Q2"]})
```
<a name="sparse.ElasticsearchFilterOnlyRetriever"></a>
|
||||
## ElasticsearchFilterOnlyRetriever Objects
|
||||
|
||||
```python
|
||||
class ElasticsearchFilterOnlyRetriever(ElasticsearchRetriever)
|
||||
```
|
||||
|
||||
Naive "Retriever" that returns all documents that match the given filters. No impact of query at all.
|
||||
Helpful for benchmarking, testing and if you want to do QA on small documents without an "active" retriever.
|
||||
|
||||
<a name="sparse.TfidfRetriever"></a>
|
||||
## TfidfRetriever Objects
|
||||
|
||||
```python
|
||||
class TfidfRetriever(BaseRetriever)
|
||||
```
|
||||
|
||||
Read all documents from a SQL backend.
|
||||
|
||||
Split documents into smaller units (eg, paragraphs or pages) to reduce the
|
||||
computations when text is passed on to a Reader for QA.
|
||||
|
||||
It uses sklearn's TfidfVectorizer to compute a tf-idf matrix.
|
||||
|
||||
<a name="dense"></a>
|
||||
# Module dense
|
||||
|
||||
@ -195,6 +56,23 @@ The title is expected to be present in doc.meta["name"] and can be supplied in t
|
||||
before writing them to the DocumentStore like this:
|
||||
{"text": "my text", "meta": {"name": "my title"}}.
|
||||
|
||||
<a name="dense.DensePassageRetriever.retrieve"></a>
|
||||
#### retrieve
|
||||
|
||||
```python
|
||||
| retrieve(query: str, filters: dict = None, top_k: int = 10, index: str = None) -> List[Document]
|
||||
```
|
||||
|
||||
Scan through documents in DocumentStore and return a small number documents
|
||||
that are most relevant to the query.
|
||||
|
||||
**Arguments**:
|
||||
|
||||
- `query`: The query
|
||||
- `filters`: A dictionary where the keys specify a metadata field and the value is a list of accepted values for that field
|
||||
- `top_k`: How many documents to return per query.
|
||||
- `index`: The name of the index in the DocumentStore from which to retrieve documents
|
||||
|
||||
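A hedged sketch of DPR retrieval. The constructor parameter names and encoder checkpoints are assumptions from the public DPR models of this era (the __init__ signature is elided between hunks here), and the import path may differ across versions:

```python
from haystack.retriever.dense import DensePassageRetriever  # path may vary by version

retriever = DensePassageRetriever(
    document_store=document_store,
    query_embedding_model="facebook/dpr-question_encoder-single-nq-base",    # assumed
    passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",       # assumed
)
docs = retriever.retrieve(query="Why did the revenue increase?", top_k=10)
```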
<a name="dense.DensePassageRetriever.embed_queries"></a>
|
||||
#### embed\_queries
|
||||
|
||||
@ -262,6 +140,33 @@ train a DensePassageRetrieval model
|
||||
- `query_encoder_save_dir`: directory inside save_dir where query_encoder model files are saved
|
||||
- `passage_encoder_save_dir`: directory inside save_dir where passage_encoder model files are saved
|
||||
|
||||
<a name="dense.DensePassageRetriever.save"></a>
|
||||
#### save
|
||||
|
||||
```python
|
||||
| save(save_dir: Union[Path, str])
|
||||
```
|
||||
|
||||
Save DensePassageRetriever to the specified directory.
|
||||
|
||||
**Arguments**:
|
||||
|
||||
- `save_dir`: Directory to save to.
|
||||
|
||||
**Returns**:
|
||||
|
||||
None
|
||||
|
||||
<a name="dense.DensePassageRetriever.load"></a>
|
||||
#### load
|
||||
|
||||
```python
|
||||
| @classmethod
|
||||
| load(cls, load_dir: Union[Path, str], document_store: BaseDocumentStore, max_seq_len_query: int = 64, max_seq_len_passage: int = 256, use_gpu: bool = True, batch_size: int = 16, embed_title: bool = True, use_fast_tokenizers: bool = True, similarity_function: str = "dot_product")
|
||||
```
|
||||
|
||||
Load DensePassageRetriever from the specified directory.
|
||||
|
||||
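Continuing the sketch above, a hedged save/load roundtrip (the directory layout is an assumption):

```python
from pathlib import Path

save_dir = Path("saved_models/dpr")

# Persist both encoders, then restore them against the same document store.
retriever.save(save_dir)
reloaded_retriever = DensePassageRetriever.load(load_dir=save_dir,
                                                document_store=document_store)
```
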
<a name="dense.EmbeddingRetriever"></a>
|
||||
## EmbeddingRetriever Objects
|
||||
|
||||
@ -296,6 +201,23 @@ Options:
|
||||
- `emb_extraction_layer`: Number of layer from which the embeddings shall be extracted (for farm / transformers models only).
|
||||
Default: -1 (very last layer).
|
||||
|
||||
<a name="dense.EmbeddingRetriever.retrieve"></a>
|
||||
#### retrieve
|
||||
|
||||
```python
|
||||
| retrieve(query: str, filters: dict = None, top_k: int = 10, index: str = None) -> List[Document]
|
||||
```
|
||||
|
||||
Scan through documents in DocumentStore and return a small number documents
|
||||
that are most relevant to the query.
|
||||
|
||||
**Arguments**:
|
||||
|
||||
- `query`: The query
|
||||
- `filters`: A dictionary where the keys specify a metadata field and the value is a list of accepted values for that field
|
||||
- `top_k`: How many documents to return per query.
|
||||
- `index`: The name of the index in the DocumentStore from which to retrieve documents
|
||||
|
||||
<a name="dense.EmbeddingRetriever.embed"></a>
|
||||
#### embed
|
||||
|
||||
@ -347,6 +269,143 @@ Create embeddings for a list of passages. For this Retriever type: The same as c
|
||||
|
||||
Embeddings, one per input passage
|
||||
|
||||
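As a hedged illustration of embed (the model name and constructor arguments are assumptions; check them against your Haystack version):

```python
from haystack.retriever.dense import EmbeddingRetriever

# Assumes the document_store from the sketches above.
embedding_retriever = EmbeddingRetriever(
    document_store=document_store,
    embedding_model="deepset/sentence_bert",
    use_gpu=False,
)

# One embedding vector per input passage.
vectors = embedding_retriever.embed(["Why did the revenue increase?",
                                     "Revenue grew by 20% in Q1."])
print(len(vectors), len(vectors[0]))
```
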
<a name="sparse"></a>
|
||||
# Module sparse
|
||||
|
||||
<a name="sparse.ElasticsearchRetriever"></a>
|
||||
## ElasticsearchRetriever Objects
|
||||
|
||||
```python
|
||||
class ElasticsearchRetriever(BaseRetriever)
|
||||
```
|
||||
|
||||
<a name="sparse.ElasticsearchRetriever.__init__"></a>
|
||||
#### \_\_init\_\_
|
||||
|
||||
```python
|
||||
| __init__(document_store: ElasticsearchDocumentStore, custom_query: str = None)
|
||||
```
|
||||
|
||||
**Arguments**:

- `document_store`: an instance of a DocumentStore to retrieve documents from.
- `custom_query`: query string as per Elasticsearch DSL, with a mandatory question placeholder ($question).

Optionally, an ES `filter` clause can be added in which the values of `terms` are placeholders
that get substituted at runtime. The placeholder names (${filter_name_1}, ${filter_name_2}, ...)
must match the keys of the filters dict supplied to self.retrieve().

**An example custom_query:**
```python
|    {
|        "size": 10,
|        "query": {
|            "bool": {
|                "should": [{"multi_match": {
|                    "query": "${question}",                 // mandatory $question placeholder
|                    "type": "most_fields",
|                    "fields": ["text", "title"]}}],
|                "filter": [                                 // optional custom filters
|                    {"terms": {"year": "${years}"}},
|                    {"terms": {"quarter": "${quarters}"}},
|                    {"range": {"date": {"gte": "${date}"}}}
|                ],
|            }
|        },
|    }
```

**For this custom_query, a sample retrieve() could be:**
```python
| self.retrieve(query="Why did the revenue increase?",
|               filters={"years": ["2019"], "quarters": ["Q1", "Q2"]})
```

<a name="sparse.ElasticsearchRetriever.retrieve"></a>
#### retrieve

```python
| retrieve(query: str, filters: dict = None, top_k: int = 10, index: str = None) -> List[Document]
```

Scan through documents in the DocumentStore and return a small number of documents
that are most relevant to the query.

**Arguments**:

- `query`: The query
- `filters`: A dictionary where the keys specify a metadata field and the value is a list of accepted values for that field
- `top_k`: How many documents to return per query.
- `index`: The name of the index in the DocumentStore from which to retrieve documents

<a name="sparse.ElasticsearchFilterOnlyRetriever"></a>
|
||||
## ElasticsearchFilterOnlyRetriever Objects
|
||||
|
||||
```python
|
||||
class ElasticsearchFilterOnlyRetriever(ElasticsearchRetriever)
|
||||
```
|
||||
|
||||
Naive "Retriever" that returns all documents that match the given filters. No impact of query at all.
|
||||
Helpful for benchmarking, testing and if you want to do QA on small documents without an "active" retriever.
|
||||
|
||||
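A hedged usage sketch; since the query is ignored, only the filters determine the result set (the metadata field and import path are assumptions):

```python
from haystack.retriever.sparse import ElasticsearchFilterOnlyRetriever

# Assumes the document_store from the sketches above.
filter_retriever = ElasticsearchFilterOnlyRetriever(document_store=document_store)

# The query string has no effect; every document with year == "2019" comes back.
docs = filter_retriever.retrieve(query="", filters={"year": ["2019"]}, top_k=100)
```
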
<a name="sparse.ElasticsearchFilterOnlyRetriever.retrieve"></a>
|
||||
#### retrieve
|
||||
|
||||
```python
|
||||
| retrieve(query: str, filters: dict = None, top_k: int = 10, index: str = None) -> List[Document]
|
||||
```
|
||||
|
||||
Scan through documents in DocumentStore and return a small number documents
|
||||
that are most relevant to the query.
|
||||
|
||||
**Arguments**:
|
||||
|
||||
- `query`: The query
|
||||
- `filters`: A dictionary where the keys specify a metadata field and the value is a list of accepted values for that field
|
||||
- `top_k`: How many documents to return per query.
|
||||
- `index`: The name of the index in the DocumentStore from which to retrieve documents
|
||||
|
||||
<a name="sparse.TfidfRetriever"></a>
|
||||
## TfidfRetriever Objects
|
||||
|
||||
```python
|
||||
class TfidfRetriever(BaseRetriever)
|
||||
```
|
||||
|
||||
Read all documents from a SQL backend.
|
||||
|
||||
Split documents into smaller units (eg, paragraphs or pages) to reduce the
|
||||
computations when text is passed on to a Reader for QA.
|
||||
|
||||
It uses sklearn's TfidfVectorizer to compute a tf-idf matrix.
|
||||
|
||||
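To make the mechanism concrete, here is a self-contained sketch of tf-idf retrieval as the docstring describes it; this illustrates the technique with sklearn directly and is not Haystack's internal code:

```python
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

paragraphs = [
    "Revenue grew by 20% in Q1 due to strong product sales.",
    "The company opened new offices in Berlin.",
    "Operating costs decreased after the restructuring.",
]

# Fit a tf-idf matrix over the paragraph corpus.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(paragraphs)

# Score a query against every paragraph and rank by similarity.
query_vec = vectorizer.transform(["Why did the revenue increase?"])
scores = tfidf_matrix.dot(query_vec.T).toarray().ravel()
for idx in np.argsort(scores)[::-1][:2]:
    print(f"{scores[idx]:.3f}  {paragraphs[idx]}")
```
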
<a name="sparse.TfidfRetriever.retrieve"></a>
|
||||
#### retrieve
|
||||
|
||||
```python
|
||||
| retrieve(query: str, filters: dict = None, top_k: int = 10, index: str = None) -> List[Document]
|
||||
```
|
||||
|
||||
Scan through documents in DocumentStore and return a small number documents
|
||||
that are most relevant to the query.
|
||||
|
||||
**Arguments**:
|
||||
|
||||
- `query`: The query
|
||||
- `filters`: A dictionary where the keys specify a metadata field and the value is a list of accepted values for that field
|
||||
- `top_k`: How many documents to return per query.
|
||||
- `index`: The name of the index in the DocumentStore from which to retrieve documents
|
||||
|
||||
<a name="sparse.TfidfRetriever.fit"></a>
|
||||
#### fit
|
||||
|
||||
```python
|
||||
| fit()
|
||||
```
|
||||
|
||||
Performing training on this class according to the TF-IDF algorithm.
|
||||
|
||||
<a name="base"></a>
|
||||
# Module base
|
||||
|
||||
@ -375,6 +434,15 @@ that are most relevant to the query.
|
||||
- `top_k`: How many documents to return per query.
|
||||
- `index`: The name of the index in the DocumentStore from which to retrieve documents
|
||||
|
||||
<a name="base.BaseRetriever.timing"></a>
|
||||
#### timing
|
||||
|
||||
```python
|
||||
| timing(fn)
|
||||
```
|
||||
|
||||
Wrapper method used to time functions.
|
||||
|
||||
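Conceptually, such a wrapper looks like the following generic sketch (an illustration of the pattern, not the actual Haystack implementation):

```python
import functools
import time

def timing(fn):
    """Return a wrapped version of fn that reports its runtime."""
    @functools.wraps(fn)
    def wrapper(*args, **kwargs):
        start = time.perf_counter()
        result = fn(*args, **kwargs)
        print(f"{fn.__name__} took {time.perf_counter() - start:.4f}s")
        return result
    return wrapper
```
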
<a name="base.BaseRetriever.eval"></a>
|
||||
#### eval
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user