Mirror of https://github.com/deepset-ai/haystack.git (synced 2025-12-27 15:08:43 +00:00)

Update Documentation for Haystack 0.5.0 (#557)

* Add languages and preprocessing pages
* add content
* address review comments
* make link relative
* update api ref with latest docstrings
* move doc readme and update
* add generator API docs
* fix example code
* design and link fix

Co-authored-by: Malte Pietsch <malte.pietsch@deepset.ai>
Co-authored-by: PiffPaffM <markuspaff.mp@gmail.com>

This commit is contained in: parent f94603cbe4, commit 99e924aede
@@ -1,13 +1,19 @@
*******************************************************
# Haystack — Docstrings Generation
*******************************************************
# :ledger: Looking for the docs?
You can find them here:
#### https://haystack.deepset.ai/docs/intromd


# :computer: How to update docs?

## Usage / Guides etc.

Will be automatically deployed with every commit to the master branch

## API Reference

We use Pydoc-Markdown to create markdown files from the docstrings in our code.


Update docs with all latest docstrings?
=======================================
### Update docstrings
Execute this in `/haystack/docs/_src/api/api`:
```
pip install 'pydoc-markdown>=3.0.0,<4.0.0'
@@ -15,22 +21,13 @@ pydoc-markdown pydoc-markdown-document-store.yml
pydoc-markdown pydoc-markdown-file-converters.yml
pydoc-markdown pydoc-markdown-preprocessor.yml
pydoc-markdown pydoc-markdown-reader.yml
pydoc-markdown pydoc-markdown-generator.yml
pydoc-markdown pydoc-markdown-retriever.yml
```

Update Docstrings of individual modules
==========================================
(Or run one of the commands above to update the docstrings only for a single module)

Every .yml file will generate a new markdown file. Run one of the following commands to generate the needed output:

- **Document store**: `pydoc-markdown pydoc-markdown-document-store.yml`
- **File converters**: `pydoc-markdown pydoc-markdown-file-converters.yml`
- **Preprocessor**: `pydoc-markdown pydoc-markdown-preprocessor.yml`
- **Reader**: `pydoc-markdown pydoc-markdown-reader.yml`
- **Retriever**: `pydoc-markdown pydoc-markdown-retriever.yml`

Configuration
============
### Configuration

Pydoc will read the configuration from a `.yml` file which is located in the current working directory. Our files contain three main sections:

@@ -110,7 +110,7 @@ the vector embeddings are indexed in a FAISS Index.
#### \_\_init\_\_

```python
| __init__(sql_url: str = "sqlite:///", index_buffer_size: int = 10_000, vector_dim: int = 768, faiss_index_factory_str: str = "Flat", faiss_index: Optional[faiss.swigfaiss.Index] = None, **kwargs)
| __init__(sql_url: str = "sqlite:///", index_buffer_size: int = 10_000, vector_dim: int = 768, faiss_index_factory_str: str = "Flat", faiss_index: Optional[faiss.swigfaiss.Index] = None, return_embedding: Optional[bool] = True, **kwargs)
```

**Arguments**:
@@ -137,6 +137,7 @@ For more details see:
Benchmarks: XXX
- `faiss_index`: Pass an existing FAISS Index, i.e. an empty one that you configured manually
or one with docs that you used in Haystack before and want to load again.
- `return_embedding`: Whether to return the document embedding with the results

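For illustration, a minimal sketch of constructing the store with the new flag (values mirror the defaults in the signature above; assumes `FAISSDocumentStore` has been imported):

```python
# Keep document embeddings retrievable alongside the documents themselves
document_store = FAISSDocumentStore(sql_url="sqlite:///",
                                    vector_dim=768,
                                    faiss_index_factory_str="Flat",
                                    return_embedding=True)
```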
<a name="faiss.FAISSDocumentStore.write_documents"></a>
|
||||
#### write\_documents
|
||||
@ -200,7 +201,7 @@ None
|
||||
#### query\_by\_embedding
|
||||
|
||||
```python
|
||||
| query_by_embedding(query_emb: np.array, filters: Optional[dict] = None, top_k: int = 10, index: Optional[str] = None) -> List[Document]
|
||||
| query_by_embedding(query_emb: np.array, filters: Optional[dict] = None, top_k: int = 10, index: Optional[str] = None, return_embedding: Optional[bool] = None) -> List[Document]
|
||||
```
|
||||
|
||||
Find the document that is most similar to the provided `query_emb` by using a vector similarity metric.
|
||||
@ -212,6 +213,7 @@ Find the document that is most similar to the provided `query_emb` by using a ve
|
||||
Example: {"name": ["some", "more"], "category": ["only_one"]}
|
||||
- `top_k`: How many documents to return
|
||||
- `index`: (SQL) index name for storing the docs and metadata
|
||||
- `return_embedding`: To return document embedding
|
||||
|
||||
**Returns**:
|
||||
|
||||
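A usage sketch of the updated method (hypothetical setup: assumes an initialised `document_store` and a precomputed query embedding `query_emb` of dimension `vector_dim`; the filter mirrors the example above):

```python
# Retrieve the 5 most similar documents, without shipping their embeddings back
results = document_store.query_by_embedding(query_emb=query_emb,
                                            filters={"category": ["only_one"]},
                                            top_k=5,
                                            return_embedding=False)
```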
@@ -271,7 +273,7 @@ class ElasticsearchDocumentStore(BaseDocumentStore)
#### \_\_init\_\_

```python
| __init__(host: str = "localhost", port: int = 9200, username: str = "", password: str = "", index: str = "document", label_index: str = "label", search_fields: Union[str, list] = "text", text_field: str = "text", name_field: str = "name", embedding_field: str = "embedding", embedding_dim: int = 768, custom_mapping: Optional[dict] = None, excluded_meta_data: Optional[list] = None, faq_question_field: Optional[str] = None, scheme: str = "http", ca_certs: bool = False, verify_certs: bool = True, create_index: bool = True, update_existing_documents: bool = False, refresh_type: str = "wait_for", similarity="dot_product", timeout=30)
| __init__(host: str = "localhost", port: int = 9200, username: str = "", password: str = "", index: str = "document", label_index: str = "label", search_fields: Union[str, list] = "text", text_field: str = "text", name_field: str = "name", embedding_field: str = "embedding", embedding_dim: int = 768, custom_mapping: Optional[dict] = None, excluded_meta_data: Optional[list] = None, faq_question_field: Optional[str] = None, analyzer: str = "standard", scheme: str = "http", ca_certs: bool = False, verify_certs: bool = True, create_index: bool = True, update_existing_documents: bool = False, refresh_type: str = "wait_for", similarity="dot_product", timeout=30, return_embedding: Optional[bool] = True)
```

A DocumentStore using Elasticsearch to store and query the documents for our search.
@@ -294,6 +296,9 @@ If no Reader is used (e.g. in FAQ-Style QA) the plain content of this field will
- `embedding_field`: Name of field containing an embedding vector (Only needed when using a dense retriever (e.g. DensePassageRetriever, EmbeddingRetriever) on top)
- `embedding_dim`: Dimensionality of embedding vector (Only needed when using a dense retriever (e.g. DensePassageRetriever, EmbeddingRetriever) on top)
- `custom_mapping`: If you want to use your own custom mapping for creating a new index in Elasticsearch, you can supply it here as a dictionary.
- `analyzer`: Specify the default analyzer from one of the built-ins when creating a new Elasticsearch Index.
Elasticsearch also has built-in analyzers for different languages (e.g. impacting tokenization). More info at:
https://www.elastic.co/guide/en/elasticsearch/reference/7.9/analysis-analyzers.html
- `excluded_meta_data`: Name of fields in Elasticsearch that should not be returned (e.g. [field_one, field_two]).
Helpful if you have fields with long, irrelevant content that you don't want to display in results (e.g. embedding vectors).
- `scheme`: 'https' or 'http', protocol used to connect to your elasticsearch instance
@@ -312,6 +317,7 @@ More info at https://www.elastic.co/guide/en/elasticsearch/reference/6.8/docs-re
- `similarity`: The similarity function used to compare document vectors. 'dot_product' is the default since it is
more performant with DPR embeddings. 'cosine' is recommended if you are using a Sentence BERT model.
- `timeout`: Number of seconds after which an ElasticSearch request times out.
- `return_embedding`: Whether to return the document embedding with the results

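For illustration, a hedged sketch of the new options (the analyzer name is just one of Elasticsearch's built-ins; assumes a running Elasticsearch instance):

```python
# Use a language-specific analyzer and keep embeddings in the returned documents
document_store = ElasticsearchDocumentStore(host="localhost",
                                            port=9200,
                                            index="document",
                                            analyzer="german",
                                            return_embedding=True)
```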
<a name="elasticsearch.ElasticsearchDocumentStore.write_documents"></a>
|
||||
#### write\_documents
|
||||
|
||||
@@ -5,7 +5,7 @@
#### eval\_data\_from\_file

```python
eval_data_from_file(filename: str) -> Tuple[List[Document], List[Label]]
eval_data_from_file(filename: str, max_docs: Union[int, bool] = None) -> Tuple[List[Document], List[Label]]
```

Read Documents + Labels from a SQuAD-style file.
@@ -14,6 +14,7 @@ Document and Labels can then be indexed to the DocumentStore and be used for eva
**Arguments**:

- `filename`: Path to file in SQuAD format
- `max_docs`: This sets the number of documents that will be loaded. By default, this is set to None, thus reading in all available eval documents.

**Returns**:

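A small usage sketch of the new parameter (the file name is illustrative):

```python
# Read at most 100 documents and their labels from a SQuAD-style file
docs, labels = eval_data_from_file(filename="dev-v2.0.json", max_docs=100)
print(f"Loaded {len(docs)} documents and {len(labels)} labels")
```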
@@ -97,27 +98,27 @@ class PreProcessor(BasePreProcessor)
#### \_\_init\_\_

```python
| __init__(clean_whitespace: Optional[bool] = True, clean_header_footer: Optional[bool] = False, clean_empty_lines: Optional[bool] = True, split_by: Optional[str] = "passage", split_length: Optional[int] = 10, split_stride: Optional[int] = None, split_respect_sentence_boundary: Optional[bool] = False)
| __init__(clean_whitespace: Optional[bool] = True, clean_header_footer: Optional[bool] = False, clean_empty_lines: Optional[bool] = True, split_by: Optional[str] = "word", split_length: Optional[int] = 1000, split_stride: Optional[int] = None, split_respect_sentence_boundary: Optional[bool] = True)
```

**Arguments**:

- `clean_header_footer`: use heuristic to remove footers and headers across different pages by searching
- `clean_header_footer`: Use heuristic to remove footers and headers across different pages by searching
for the longest common string. This heuristic uses exact matches and therefore
works well for footers like "Copyright 2019 by XXX", but won't detect "Page 3 of 4"
or similar.
- `clean_whitespace`: strip whitespaces before or after each line in the text.
- `clean_empty_lines`: remove more than two empty lines in the text.
- `split_by`: split the document by "word", "sentence", or "passage". Set to None to disable splitting.
- `split_length`: n number of splits to merge as a single document. For instance, if n -> 10 & split_by ->
- `clean_whitespace`: Strip whitespaces before or after each line in the text.
- `clean_empty_lines`: Remove more than two empty lines in the text.
- `split_by`: Unit for splitting the document. Can be "word", "sentence", or "passage". Set to None to disable splitting.
- `split_length`: Max. number of the above split unit (e.g. words) that are allowed in one document. For instance, if n -> 10 & split_by ->
"sentence", then each output document will have 10 sentences.
- `split_stride`: length of striding window over the splits. For example, if split_by -> `word`,
- `split_stride`: Length of striding window over the splits. For example, if split_by -> `word`,
split_length -> 5 & split_stride -> 2, then the splits would be like:
[w1 w2 w3 w4 w5, w4 w5 w6 w7 w8, w7 w8 w9 w10 w11].
Set the value to None to disable striding behaviour.
- `split_respect_sentence_boundary`: whether to split in partial sentences when if split_by -> `word`. If set
to True, the individual split would always have complete sentence &
the number of words being less than or equal to the split_length.
- `split_respect_sentence_boundary`: Whether to split in partial sentences if split_by -> `word`. If set
to True, the individual split will always have complete sentences &
the number of words will be <= split_length.

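For illustration, a hedged sketch of how the striding parameters interact (`doc` is a hypothetical converted document dict):

```python
# Windows of 5 words that overlap by split_stride=2 words,
# e.g. [w1..w5], [w4..w8], [w7..w11] as described above
processor = PreProcessor(split_by="word",
                         split_length=5,
                         split_stride=2,
                         split_respect_sentence_boundary=False)
docs = processor.process(doc)
```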
<a name="base"></a>
|
||||
# base
|
||||
|
||||
docs/_src/api/api/pydoc-markdown-generator.yml (new file, 14 lines)
@@ -0,0 +1,14 @@
loaders:
  - type: python
    search_path: [../../../../haystack/generator]
    ignore_when_discovered: ['__init__']
processor:
  - type: filter
    expression: not name.startswith('_') and default()
  - documented_only: true
  - do_not_filter_modules: false
  - skip_empty_modules: true
renderer:
  type: markdown
  descriptive_class_title: false
  filename: generator.md

@@ -314,7 +314,9 @@ See https://huggingface.co/models for full list of available QA models

**Arguments**:

- `model_name_or_path`: Directory of a saved model or the name of a public model e.g. 'bert-base-cased', 'deepset/bert-base-cased-squad2', 'deepset/bert-base-cased-squad2', 'distilbert-base-uncased-distilled-squad'. See https://huggingface.co/models for full list of available models.
- `model_name_or_path`: Directory of a saved model or the name of a public model e.g. 'bert-base-cased',
'deepset/bert-base-cased-squad2', 'deepset/bert-base-cased-squad2', 'distilbert-base-uncased-distilled-squad'.
See https://huggingface.co/models for full list of available models.
- `tokenizer`: Name of the tokenizer (usually the same as model)
- `context_window_size`: Num of chars (before and after the answer) to return as "context" for each answer.
The context usually helps users to understand if the answer really makes sense.

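For illustration, a hedged sketch using the FARMReader (the model name and window size are examples, not prescriptions):

```python
# Load a public QA model from the model hub and return 150 characters
# of surrounding context with each answer
reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2",
                    context_window_size=150)
```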
@@ -94,7 +94,7 @@ Karpukhin, Vladimir, et al. (2020): "Dense Passage Retrieval for Open-Domain Que
#### \_\_init\_\_

```python
| __init__(document_store: BaseDocumentStore, query_embedding_model: str = "facebook/dpr-question_encoder-single-nq-base", passage_embedding_model: str = "facebook/dpr-ctx_encoder-single-nq-base", max_seq_len_query: int = 64, max_seq_len_passage: int = 256, use_gpu: bool = True, batch_size: int = 16, embed_title: bool = True, use_fast_tokenizers: bool = True, similarity_function: str = "dot_product")
| __init__(document_store: BaseDocumentStore, query_embedding_model: Union[Path, str] = "facebook/dpr-question_encoder-single-nq-base", passage_embedding_model: Union[Path, str] = "facebook/dpr-ctx_encoder-single-nq-base", max_seq_len_query: int = 64, max_seq_len_passage: int = 256, use_gpu: bool = True, batch_size: int = 16, embed_title: bool = True, use_fast_tokenizers: bool = True, similarity_function: str = "dot_product")
```

Init the Retriever incl. the two encoder models from a local or remote model checkpoint.

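A sketch of initialising the retriever with the default remote checkpoints, now that local `Path`s are accepted as well (assumes an initialised `document_store`):

```python
# Either a Hugging Face model name (as here) or a local Path can be passed
retriever = DensePassageRetriever(document_store=document_store,
                                  query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
                                  passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
                                  use_gpu=True,
                                  embed_title=True)
```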
@@ -327,8 +327,10 @@ position in the ranking of documents the correct document is.
| Returns a dict containing the following metrics:

- "recall": Proportion of questions for which correct document is among retrieved documents
- "mean avg precision": Mean of average precision for each question. Rewards retrievers that give relevant
documents a higher rank.
- "mrr": Mean of reciprocal rank. Rewards retrievers that give relevant documents a higher rank.
Only considers the highest ranked relevant document.
- "map": Mean of average precision for each question. Rewards retrievers that give relevant
documents a higher rank. Considers all retrieved relevant documents. (only with ``open_domain=False``)

**Arguments**:


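For illustration, a hedged sketch of reading these metrics (index names follow the defaults used elsewhere in this commit; check the full `eval()` signature in the API reference):

```python
# Assumes evaluation documents and labels have already been written to the document store
metrics = retriever.eval(label_index="label", doc_index="eval_document", top_k=10)
print(metrics["recall"], metrics["map"], metrics["mrr"])
```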
@@ -1,16 +1,16 @@
<!---
title: "Database"
title: "Document Store"
metaTitle: "Document Store"
metaDescription: ""
slug: "/docs/database"
slug: "/docs/documentstore"
date: "2020-09-03"
id: "databasemd"
id: "documentstoremd"
--->


# Document Stores
# DocumentStores

You can think of the Document Store as a "database" that:
You can think of the DocumentStore as a "database" that:
- stores your texts and meta data
- provides them to the retriever at query time

@@ -18,7 +18,7 @@ There are different DocumentStores in Haystack to fit different use cases and te

## Initialisation

Initialising a new Document Store is straight forward.
Initialising a new DocumentStore is straightforward.

<div class="tabs tabsdsinstall">

@@ -75,10 +75,13 @@ document_store = SQLDocumentStore()
Each DocumentStore constructor allows for arguments specifying how to connect to existing databases and the names of indexes.
See API documentation for more info.

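For illustration, a hedged sketch of connecting to an existing instance (values mirror the ElasticsearchDocumentStore defaults shown in the API reference earlier in this commit):

```python
# Connect to a running Elasticsearch instance and use a custom index name
document_store = ElasticsearchDocumentStore(host="localhost", port=9200,
                                            username="", password="",
                                            index="document")
```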
## Preparing Documents
## Input Format

DocumentStores expect Documents in dictionary form, like that below.
They are loaded using the `DocumentStore.write_documents()` method.
See [Preprocessing](/docs/latest/preprocessingmd) for more information on how to best prepare your data.

[//]: # (Add link to preprocessing section)

```python
document_store = ElasticsearchDocumentStore()
@@ -91,28 +94,9 @@ dicts = [
document_store.write_documents(dicts)
```

## File Conversion

There are a range of different file converters in Haystack that can help get your data into the right format.
Haystack features support for txt, pdf and docx formats and there is even a converter that leverages Apache Tika.
See the File Converters section in the API docs for more information.

<!-- _comment: !! Code snippets for each type !! -->
Haystack also has a `convert_files_to_dicts()` utility function that will convert
all txt or pdf files in a given folder into this dictionary format.

```python
document_store = ElasticsearchDocumentStore()
dicts = convert_files_to_dicts(dir_path=doc_dir)
document_store.write_documents(dicts)
```

## Writing Documents
## Writing Documents (Sparse Retrievers)

Haystack allows you to write documents into the store in an optimised fashion so that query times can be kept low.

### For Sparse Retrievers

For **sparse**, keyword based retrievers such as BM25 and TF-IDF,
you simply have to call `DocumentStore.write_documents()`.
The creation of the inverted index which optimises querying speed is handled automatically.

@@ -121,7 +105,7 @@ The creation of the inverted index which optimises querying speed is handled aut
document_store.write_documents(dicts)
```

### For Dense Retrievers
## Writing Documents (Dense Retrievers)

For **dense** neural network based retrievers like Dense Passage Retrieval, or Embedding Retrieval,
indexing involves computing the Document embeddings which will be compared against the Query embedding.

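As a hedged sketch of the typical indexing flow for dense retrievers (`dicts` is the list of document dictionaries from above; `update_embeddings` computes and stores the embeddings with the given retriever):

```python
# Write the documents first, then compute and store their embeddings
# with the same retriever that will be used at query time
document_store.write_documents(dicts)
retriever = DensePassageRetriever(document_store=document_store)
document_store.update_embeddings(retriever)
```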
@@ -139,9 +123,9 @@ Having GPU acceleration will significantly speed this up.

<!-- _comment: !! Diagrams of inverted index / document embeds !! -->
<!-- _comment: !! Make this a tab element to show how different datastores are initialized !! -->
## Choosing the right document store
## Choosing the Right Document Store

The Document stores have different characteristics. You should choose one depending on the maturity of your project, the use case and technical environment:
The Document Stores have different characteristics. You should choose one depending on the maturity of your project, the use case and technical environment:

<div class="tabs tabsdschoose">

@@ -213,7 +197,7 @@ The Document stores have different characteristics. You should choose one depend

</div>

#### Our recommendations
#### Our Recommendations

**Restricted environment:** Use the `InMemoryDocumentStore`, if you are just giving Haystack a quick try on a small sample and are working in a restricted environment that complicates running Elasticsearch or other databases

docs/_src/usage/usage/languages.md (new file, 174 lines)
@@ -0,0 +1,174 @@
<!---
title: "Languages Other Than English"
metaTitle: "Languages Other Than English"
metaDescription: ""
slug: "/docs/languages"
date: "2020-11-05"
id: "languagesmd"
--->

# Languages Other Than English

Haystack is well suited to open-domain QA on languages other than English.
While our defaults are tuned for English,
you will find some tips and tricks here for using Haystack in your language.

## Retrievers

The sparse retriever methods themselves (BM25, TF-IDF) are language agnostic.
Their only requirement is that the text be split into words.
The ElasticsearchDocumentStore relies on an analyzer to impose word boundaries,
but also to handle punctuation, casing and stop words.

The default analyzer is an English analyzer.
While it can still work decently for a large range of languages,
you will want to set it to your language's analyzer for optimal performance.
In some cases, such as with Thai, the default analyzer is completely incompatible.
See [this page](https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-lang-analyzer.html)
for the full list of language specific analyzers.

```python
document_store = ElasticsearchDocumentStore(analyzer="thai")
```

The models used in dense retrievers are language specific.
Be sure to check the language of the model used in your EmbeddingRetriever.
The default model that is loaded in the DensePassageRetriever is for English.
We are currently working on training a German DensePassageRetriever model and know other teams who work on further languages.
If you have a language model and a question answering dataset in your own language, you can also train a DPR model using Haystack!
Below is a simplified example.
See the [API reference](/docs/latest/apiretrievermd#train) for `DensePassageRetriever.train()` for more details.

```python
dense_passage_retriever.train(self,
                              data_dir: str,
                              train_filename: str,
                              dev_filename: str = None,
                              test_filename: str = None,
                              batch_size: int = 16,
                              embed_title: bool = True,
                              num_hard_negatives: int = 1,
                              n_epochs: int = 3)
```

## Readers

While models are comparatively more performant on English,
thanks to a wealth of available English training data,
there are a couple of QA models that are directly usable in Haystack.

<div class="tabs tabsreaderlanguage">
|
||||
|
||||
<div class="tab">
|
||||
<input type="radio" id="tab-4-1" name="tab-group-4" checked>
|
||||
<label class="labelouter" for="tab-4-1">FARM</label>
|
||||
<div class="tabcontent">
|
||||
|
||||
<div class="tabs innertabslanguage">
|
||||
|
||||
<div class="tabinner">
|
||||
<input type="radio" id="tab-5-1" name="tab-group-5" checked>
|
||||
<label class="labelinner" for="tab-5-1">French</label>
|
||||
<div class="tabcontentinner">
|
||||
|
||||
```python
|
||||
reader = FARMReader("illuin/camembert-base-fquad")
|
||||
```
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="tabinner">
|
||||
<input type="radio" id="tab-5-2" name="tab-group-5">
|
||||
<label class="labelinner" for="tab-5-2">Italian</label>
|
||||
<div class="tabcontentinner">
|
||||
|
||||
```python
|
||||
reader = FARMReader("mrm8488/bert-italian-finedtuned-squadv1-it-alfa")
|
||||
```
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="tabinner">
|
||||
<input type="radio" id="tab-5-3" name="tab-group-5">
|
||||
<label class="labelinner" for="tab-5-3">Zero-shot</label>
|
||||
<div class="tabcontentinner">
|
||||
|
||||
```python
|
||||
reader = FARMReader("deepset/xlm-roberta-large-squad2")
|
||||
```
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
</div>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="tab">
|
||||
<input type="radio" id="tab-4-2" name="tab-group-4">
|
||||
<label class="labelouter" for="tab-4-2">Transformers</label>
|
||||
<div class="tabcontent">
|
||||
|
||||
<div class="tabs innertabslanguage">
|
||||
|
||||
<div class="tabinner2">
|
||||
<input type="radio" id="tab-6-1" name="tab-group-6" checked>
|
||||
<label class="labelinner" for="tab-6-1">French</label>
|
||||
<div class="tabcontentinner">
|
||||
|
||||
```python
|
||||
reader = TransformersReader("illuin/camembert-base-fquad")
|
||||
```
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="tabinner2">
|
||||
<input type="radio" id="tab-6-2" name="tab-group-6">
|
||||
<label class="labelinner" for="tab-6-2">Italian</label>
|
||||
<div class="tabcontentinner">
|
||||
|
||||
```python
|
||||
reader = TransformersReader("mrm8488/bert-italian-finedtuned-squadv1-it-alfa")
|
||||
```
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="tabinner2">
|
||||
<input type="radio" id="tab-6-3" name="tab-group-6">
|
||||
<label class="labelinner" for="tab-6-3">Zero-shot</label>
|
||||
<div class="tabcontentinner">
|
||||
|
||||
```python
|
||||
reader = TransformersReader("deepset/xlm-roberta-large-squad2")
|
||||
```
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
</div>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
</div>
|
||||
|
||||
The **French** and **Italian models** are both monolingual language models trained on French and Italian versions of the SQuAD dataset
and their authors report decent results in their model cards
[here](https://huggingface.co/illuin/camembert-base-fquad) and [here](https://huggingface.co/mrm8488/bert-italian-finedtuned-squadv1-it-alfa).
Note that there is also a [large variant](https://huggingface.co/illuin/camembert-large-fquad) of the French model available on the model hub.
There also exist Korean QA models on the model hub but their performance is not reported.

The **zero-shot model** that is shown above is a **multilingual XLM-RoBERTa Large** that is trained on English SQuAD.
It is clear, from our [evaluations](https://huggingface.co/deepset/xlm-roberta-large-squad2#model_card),
that the model has been able to transfer some of its English QA capabilities to other languages,
but still its performance lags behind that of the monolingual models.
Nonetheless, if there is not yet a monolingual model for your language and it is one of the 100 supported by XLM-RoBERTa,
this zero-shot model may serve as a decent first baseline.

[//]: # (Add link to Reader training, create section in reader.md on training Reader)

docs/_src/usage/usage/preprocessing.md (new file, 135 lines)
@@ -0,0 +1,135 @@
<!---
title: "Preprocessing"
metaTitle: "Preprocessing"
metaDescription: ""
slug: "/docs/preprocessing"
date: "2020-09-03"
id: "preprocessingmd"
--->

# Preprocessing

Haystack includes a suite of tools to:

* extract text from different file types
* normalize white space
* split text into smaller pieces to optimize retrieval

These data preprocessing steps can have a big impact on the system's performance,
and effective handling of data is key to getting the most out of Haystack.

The Document Store expects its inputs to come in the following format.
The sections below will show you all the tools you'll need to ready your data for storing.

```python
docs = [
    {
        'text': DOCUMENT_TEXT_HERE,
        'meta': {'name': DOCUMENT_NAME, ...}
    }, ...
]
```

## File Conversion

There are a range of different file converters in Haystack that
can extract text from files and cast them into the unified dictionary format shown above.
Haystack features support for txt, pdf and docx files and there is even a converter that leverages Apache Tika.
Please refer to [the API docs](/docs/latest/pdf) to see which converter best suits you.

<div class="tabs tabsconverters">

<div class="tab">
<input type="radio" id="tab-1" name="tab-group-1" checked>
<label class="labelouter" for="tab-1">PDF</label>
<div class="tabcontent">

```python
converter = PDFToTextConverter(remove_numeric_tables=True, valid_languages=["de","en"])
doc = converter.convert(file_path=file, meta=None)
```

</div>
</div>

<div class="tab">
<input type="radio" id="tab-2" name="tab-group-1">
<label class="labelouter" for="tab-2">DOCX</label>
<div class="tabcontent">

```python
converter = DocxToTextConverter(remove_numeric_tables=True, valid_languages=["de","en"])
doc = converter.convert(file_path=file, meta=None)
```

</div>
</div>

<div class="tab">
<input type="radio" id="tab-3" name="tab-group-1">
<label class="labelouter" for="tab-3">From a Directory</label>
<div class="tabcontent">

Haystack also has a `convert_files_to_dicts()` utility function that will convert
all txt or pdf files in a given folder into this dictionary format.

```python
docs = convert_files_to_dicts(dir_path=doc_dir)
```

</div>
</div>

</div>

## PreProcessor

While each of the above conversion methods produces documents that are already in the format expected by the Document Store,
it is recommended that they are further processed in order to ensure optimal Retriever and Reader performance.
The `PreProcessor` takes one of the documents created by the converter as input,
performs various cleaning steps and splits it into multiple smaller documents.

```python
doc = converter.convert(file_path=file, meta=None)
processor = PreProcessor(clean_empty_lines=True,
                         clean_whitespace=True,
                         clean_header_footer=True,
                         split_by="word",
                         split_length=200,
                         split_respect_sentence_boundary=True)
docs = processor.process(doc)
```

* `clean_empty_lines` will normalize 3 or more consecutive empty lines to just two empty lines
* `clean_whitespace` will remove any whitespace at the beginning or end of each line in the text
* `clean_header_footer` will remove any long header or footer texts that are repeated on each page
* `split_by` determines what unit the document is split by: `'word'`, `'sentence'` or `'passage'`
* `split_length` sets a maximum number of `'word'`, `'sentence'` or `'passage'` units per output document
* `split_respect_sentence_boundary` ensures that document boundaries do not fall in the middle of sentences

## Impact of Document Splitting

The File Converters will treat each file as a single document regardless of length.
This is not always ideal as long documents can have negative impacts on both speed and accuracy.

Document length has a very direct impact on the speed of the Reader.
**If you halve the length of your documents, you can expect that the Reader will double in speed.**

It is generally not a good idea to let document boundaries fall in the middle of sentences.
Doing so means that each document will contain incomplete sentence fragments
which may be hard for both retriever and reader to interpret.

For **sparse retrievers**, very long documents pose a challenge since the signal of the relevant section of text
can get washed out by the rest of the document.
We would recommend making sure that **documents are no longer than 10,000 words**.

**Dense retrievers** are limited in the length of text that they can read in one pass.
As such, it is important that documents are not longer than the dense retriever's maximum input length.
By default, Haystack's DensePassageRetriever model has a maximum length of 256 tokens.
As such, we recommend that documents contain significantly fewer words.
We have found decent performance with **documents around 100 words long**.

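For illustration, a hedged PreProcessor configuration along these lines (the values reflect the rough guidelines above, not hard limits; `doc` is a converted document dict as in the earlier example):

```python
# ~100-word documents, without cutting sentences in half,
# stay well below DPR's 256-token input limit
processor = PreProcessor(split_by="word",
                         split_length=100,
                         split_respect_sentence_boundary=True)
docs = processor.process(doc)
```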
@@ -219,129 +219,6 @@ But if performance is your sole concern, and you have the computational resource
you might like to try ALBERT XXL which has set SoTA performance on SQuAD 2.0.

<!-- _comment: !! How good is it? How much computation resource do you need to run it? !! -->
## Languages other than English

Haystack is also very well suited to open-domain QA on languages other than English.
While models are comparatively more performant on English,
thanks to a wealth of available English training data,
there are a couple QA models that are directly usable in Haystack.

<div class="tabs tabsreaderlanguage">
|
||||
|
||||
<div class="tab">
|
||||
<input type="radio" id="tab-4-1" name="tab-group-4" checked>
|
||||
<label class="labelouter" for="tab-4-1">FARM</label>
|
||||
<div class="tabcontent">
|
||||
|
||||
<div class="tabs innertabslanguage">
|
||||
|
||||
<div class="tabinner">
|
||||
<input type="radio" id="tab-5-1" name="tab-group-5" checked>
|
||||
<label class="labelinner" for="tab-5-1">French</label>
|
||||
<div class="tabcontentinner">
|
||||
|
||||
```python
|
||||
reader = FARMReader("illuin/camembert-base-fquad")
|
||||
```
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="tabinner">
|
||||
<input type="radio" id="tab-5-2" name="tab-group-5">
|
||||
<label class="labelinner" for="tab-5-2">Italian</label>
|
||||
<div class="tabcontentinner">
|
||||
|
||||
```python
|
||||
reader = FARMReader("mrm8488/bert-italian-finedtuned-squadv1-it-alfa")
|
||||
```
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="tabinner">
|
||||
<input type="radio" id="tab-5-3" name="tab-group-5">
|
||||
<label class="labelinner" for="tab-5-3">Zero-shot</label>
|
||||
<div class="tabcontentinner">
|
||||
|
||||
```python
|
||||
reader = FARMReader("deepset/xlm-roberta-large-squad2")
|
||||
```
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
</div>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="tab">
|
||||
<input type="radio" id="tab-4-2" name="tab-group-4">
|
||||
<label class="labelouter" for="tab-4-2">Transformers</label>
|
||||
<div class="tabcontent">
|
||||
|
||||
<div class="tabs innertabslanguage">
|
||||
|
||||
<div class="tabinner2">
|
||||
<input type="radio" id="tab-6-1" name="tab-group-6" checked>
|
||||
<label class="labelinner" for="tab-6-1">French</label>
|
||||
<div class="tabcontentinner">
|
||||
|
||||
```python
|
||||
reader = TransformersReader("illuin/camembert-base-fquad")
|
||||
```
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="tabinner2">
|
||||
<input type="radio" id="tab-6-2" name="tab-group-6">
|
||||
<label class="labelinner" for="tab-6-2">Italian</label>
|
||||
<div class="tabcontentinner">
|
||||
|
||||
```python
|
||||
reader = TransformersReader("mrm8488/bert-italian-finedtuned-squadv1-it-alfa")
|
||||
```
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="tabinner2">
|
||||
<input type="radio" id="tab-6-3" name="tab-group-6">
|
||||
<label class="labelinner" for="tab-6-3">Zero-shot</label>
|
||||
<div class="tabcontentinner">
|
||||
|
||||
```python
|
||||
reader = TransformersReader("deepset/xlm-roberta-large-squad2")
|
||||
```
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
</div>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
</div>
|
||||
|
||||
The **French** and **Italian models** are both monolingual language models trained on French and Italian versions of the SQuAD dataset
and their authors report decent results in their model cards
[here](https://huggingface.co/illuin/camembert-base-fquad) and [here](https://huggingface.co/mrm8488/bert-italian-finedtuned-squadv1-it-alfa).
Note that there is also a [large variant](https://huggingface.co/illuin/camembert-large-fquad) of the French model available on the model hub.
There also exist Korean QA models on the model hub but their performance is not reported.

The **zero-shot model** that is shown above is a **multilingual XLM-RoBERTa Large** that is trained on English SQuAD.
It is clear, from our [evaluations](https://huggingface.co/deepset/xlm-roberta-large-squad2#model_card),
that the model has been able to transfer some of its English QA capabilities to other languages,
but still its performance lags behind that of the monolingual models.
Nonetheless, if there is not yet a monolingual model for your language and it is one of the 100 supported by XLM-RoBERTa,
this zero-shot model may serve as a decent first baseline.

When using a Reader of any language, it’s important to ensure that the Retriever is also compatible.
While sparse methods like BM25 and TF-IDF are language agnostic,
dense methods like Dense Passage Retrieval are trained for a particular language.

<!-- farm-vs-trans: -->
## Deeper Dive: FARM vs Transformers

@@ -1,11 +1,21 @@
from abc import ABC, abstractmethod
from typing import List, Optional
from typing import List, Optional, Dict

from haystack import Document


class BaseGenerator(ABC):

    """
    Abstract class for Generators
    """
    @abstractmethod
    def predict(self, question: str, documents: List[Document], top_k: Optional[int]):
    def predict(self, question: str, documents: List[Document], top_k: Optional[int]) -> Dict:
        """
        Abstract method to generate answers.

        :param question: Question
        :param documents: Related documents (e.g. coming from a retriever) that the answer shall be conditioned on.
        :param top_k: Number of returned answers
        :return: Generated answers plus additional infos in a dict
        """
        pass

@@ -23,9 +23,39 @@ class RAGenerator(BaseGenerator):
    Implementation of Facebook's Retrieval-Augmented Generator (https://arxiv.org/abs/2005.11401) based on
    HuggingFace's transformers (https://huggingface.co/transformers/model_doc/rag.html).

    | With the generator, you can:
    Instead of "finding" the answer within a document, these models **generate** the answer.
    In that sense, RAG follows a similar approach as GPT-3 but it comes with two huge advantages
    for real-world applications:
    a) it has a manageable model size
    b) the answer generation is conditioned on retrieved documents,
    i.e. the model can easily adjust to domain documents even after training has finished
    (in contrast: GPT-3 relies on the web data seen during training)

        - directly get generate predictions via predict()
    **Example**

    ```python
    > question = "who got the first nobel prize in physics?"

    # Retrieve related documents from retriever
    > retrieved_docs = retriever.retrieve(query=question)

    > # Now generate answer from question and retrieved documents
    > generator.predict(
    >    question=question,
    >    documents=retrieved_docs,
    >    top_k=1
    > )
    {'question': 'who got the first nobel prize in physics',
     'answers':
         [{'question': 'who got the first nobel prize in physics',
           'answer': ' albert einstein',
           'meta': { 'doc_ids': [...],
                     'doc_scores': [80.42758 ...],
                     'doc_probabilities': [40.71379089355469, ...
                     'texts': ['Albert Einstein was a ...]
                     'titles': ['"Albert Einstein"', ...]
           }}]}
    ```
    """

    def __init__(

@@ -144,7 +174,29 @@ class RAGenerator(BaseGenerator):

        return embeddings_in_tensor

    def predict(self, question: str, documents: List[Document], top_k: Optional[int] = None):
    def predict(self, question: str, documents: List[Document], top_k: Optional[int] = None) -> Dict:
        """
        Generate the answer to the input question. The generation will be conditioned on the supplied documents.
        These document can for example be retrieved via the Retriever.

        :param question: Question
        :param documents: Related documents (e.g. coming from a retriever) that the answer shall be conditioned on.
        :param top_k: Number of returned answers
        :return: Generated answers plus additional infos in a dict like this:

        ```python
        > {'question': 'who got the first nobel prize in physics',
        >  'answers':
        >      [{'question': 'who got the first nobel prize in physics',
        >        'answer': ' albert einstein',
        >        'meta': { 'doc_ids': [...],
        >                  'doc_scores': [80.42758 ...],
        >                  'doc_probabilities': [40.71379089355469, ...
        >                  'texts': ['Albert Einstein was a ...]
        >                  'titles': ['"Albert Einstein"', ...]
        >        }}]}
        ```
        """
        if len(documents) == 0:
            raise AttributeError("generator need documents to predict the answer")


@@ -1,15 +1,17 @@
from utils import get_document_store, index_to_doc_store, get_reader
from haystack.preprocessor.utils import eval_data_from_file
from farm.data_handler.utils import _download_extract_downstream_data

from pathlib import Path
import pandas as pd
from results_to_json import reader as reader_json
from templates import READER_TEMPLATE
import json
import logging

logger = logging.getLogger(__name__)

reader_models_full = ["deepset/roberta-base-squad2", "deepset/minilm-uncased-squad2",
                      "deepset/bert-base-cased-squad2", "deepset/bert-large-uncased-whole-word-masking-squad2",
                      "deepset/xlm-roberta-large-squad2", "distilbert-base-uncased-distilled-squad"]
reader_models_full = ["deepset/roberta-base-squad2"]
reader_models_ci = ["deepset/minilm-uncased-squad2"]

reader_types = ["farm"]
@@ -17,7 +19,8 @@ data_dir = Path("../../data/squad20")
filename = "dev-v2.0.json"
# Note that this number is approximate - it was calculated using Bert Base Cased
# This number could vary when using a different tokenizer
n_passages = 12350
n_total_passages = 12350
n_total_docs = 1204

results_file = "reader_results.csv"

@@ -26,27 +29,34 @@ reader_json_file = "../../docs/_src/benchmarks/reader_performance.json"
doc_index = "eval_document"
label_index = "label"

def benchmark_reader(ci=False, update_json=False, **kwargs):
def benchmark_reader(ci=False, update_json=False, save_markdown=False, **kwargs):
    if ci:
        reader_models = reader_models_ci
        n_docs = 1
        max_docs = 100
        # heuristic to estimate num of passages for the reduced num of docs
        n_passages = n_total_passages * (max_docs / n_total_docs)
    else:
        reader_models = reader_models_full
        n_docs = None
        max_docs = None
        n_passages = n_total_passages
    reader_results = []
    doc_store = get_document_store("elasticsearch")
    docs, labels = eval_data_from_file(data_dir/filename, n_docs)
    # download squad data
    _download_extract_downstream_data(input_file=data_dir/filename)
    docs, labels = eval_data_from_file(data_dir/filename, max_docs)

    index_to_doc_store(doc_store, docs, None, labels)
    for reader_name in reader_models:
        for reader_type in reader_types:
            logger.info(f"##### Start reader run - model:{reader_name}, type: {reader_type} ##### ")
            try:
                reader = get_reader(reader_name, reader_type)
                results = reader.eval(document_store=doc_store,
                                      doc_index=doc_index,
                                      label_index=label_index,
                                      device="cuda")
                # print(results)
                # results = reader.eval_on_file(data_dir, filename, device="cuda")
                print(results)
                results["passages_per_second"] = n_passages / results["reader_time"]
                results["reader"] = reader_name
                results["error"] = ""
@@ -64,10 +74,15 @@ def benchmark_reader(ci=False, update_json=False, **kwargs):
            reader_results.append(results)
    reader_df = pd.DataFrame.from_records(reader_results)
    reader_df.to_csv(results_file)
    if save_markdown:
        md_file = results_file.replace(".csv", ".md")
        with open(md_file, "w") as f:
            f.write(str(reader_df.to_markdown()))
    if update_json:
        populate_reader_json()


def populate_reader_json():
    reader_results = reader_json()
    template = READER_TEMPLATE
@@ -76,4 +91,4 @@ def populate_reader_json():


if __name__ == "__main__":
    benchmark_reader(True, update_json=True)
    benchmark_reader(ci=False, update_json=False, save_markdown=False)
