updated tutorials (#1359)

This commit is contained in:
Markus Paff 2021-08-19 21:16:56 +02:00 committed by GitHub
parent a3c746abf5
commit ff2049cd45
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 10 additions and 14 deletions

View File

@ -133,7 +133,7 @@ dicts = convert_files_to_dicts(dir_path=doc_dir, clean_func=clean_wiki_text, spl
# 'meta': {'name': "<DOCUMENT_NAME_HERE>", ...}
#}
# (Optionally: you can also add more key-value-pairs here, that will be indexed as fields in Elasticsearch and
# can be accessed later for filtering or shown in the responses of the Finder)
# can be accessed later for filtering or shown in the responses of the Pipeline)
# Let's have a look at the first 3 entries:
print(dicts[:3])
@ -142,7 +142,7 @@ print(dicts[:3])
document_store.write_documents(dicts)
```
## Initalize Retriever, Reader, & Finder
## Initialize Retriever, Reader, & Pipeline
### Retriever

View File

@ -34,7 +34,7 @@ These lines are to install Haystack through pip
```python
# Install the latest release of Haystack in your own environment
#! pip install farm-haystack
!pip install farm-haystack
# Install the latest master of Haystack
!pip install grpcio-tools==1.34.1

View File

@ -72,9 +72,9 @@ Then change the `use_gpu` arguments below to `True`
```python
reader = FARMReader(model_name_or_path="distilbert-base-uncased-distilled-squad", use_gpu=True)
train_data = "data/squad20"
# train_data = "PATH/TO_YOUR/TRAIN_DATA"
reader.train(data_dir=train_data, train_filename="dev-v2.0.json", use_gpu=True, n_epochs=1, save_dir="my_model")
data_dir = "data/squad20"
# data_dir = "PATH/TO_YOUR/TRAIN_DATA"
reader.train(data_dir=data_dir, train_filename="dev-v2.0.json", use_gpu=True, n_epochs=1, save_dir="my_model")
```

View File

@ -44,7 +44,6 @@ Make sure you enable the GPU runtime to experience decent speed in this tutorial
```python
from haystack import Finder
from haystack.preprocessor.cleaning import clean_wiki_text
from haystack.preprocessor.utils import convert_files_to_dicts, fetch_archive_from_http
from haystack.reader.farm import FARMReader
@ -102,7 +101,7 @@ print(dicts[:3])
document_store.write_documents(dicts)
```
## Initalize Retriever, Reader, & Finder
## Initialize Retriever, Reader & Pipeline
### Retriever

View File

@ -52,7 +52,6 @@ Make sure you enable the GPU runtime to experience decent speed in this tutorial
```python
from haystack import Finder
from haystack.document_store.elasticsearch import ElasticsearchDocumentStore
from haystack.retriever.dense import EmbeddingRetriever

View File

@ -148,7 +148,7 @@ retriever = ElasticsearchRetriever(document_store=document_store)
# Initialize Reader
from haystack.reader.farm import FARMReader
reader = FARMReader("deepset/roberta-base-squad2", top_k_per_candidate=4, return_no_answer=True)
reader = FARMReader("deepset/roberta-base-squad2", top_k=4, return_no_answer=True)
```

View File

@ -85,7 +85,6 @@ Make sure you enable the GPU runtime to experience decent speed in this tutorial
```python
from haystack import Finder
from haystack.preprocessor.cleaning import clean_wiki_text
from haystack.preprocessor.utils import convert_files_to_dicts, fetch_archive_from_http
from haystack.reader.farm import FARMReader
@ -146,7 +145,7 @@ dicts = convert_files_to_dicts(dir_path=doc_dir, clean_func=clean_wiki_text, spl
document_store.write_documents(dicts)
```
### Initalize Retriever, Reader, & Finder
### Initialize Retriever, Reader & Pipeline
#### Retriever

View File

@ -190,8 +190,7 @@ preprocessor = PreProcessor(
split_length=100,
split_respect_sentence_boundary=True
)
nested_docs = [preprocessor.process(d) for d in all_docs]
docs = [d for x in nested_docs for d in x]
docs = preprocessor.process(all_docs)
print(f"n_files_input: {len(all_docs)}\nn_docs_output: {len(docs)}")
```