Fix Tutorials and Tutorials (nightly) (#2737)

* Remove caching and install audio deps

* Fix `Tutorials` as well

* Run all tutorials even though some fail

* Forgot fi

* fix failure condition

* proper bash string equality

* Enable debug logs

* remove audio files

* Update Documentation & Code Style

* Use the setup action in the Tutorial CI as well

* Try with a file that exists

* Update Documentation & Code Style

* Fix the comments in the tutorials

* Update Documentation & Code Style

* Fix tutorials.sh

* Remove debug logging

* import pprint and try editable install

* Update Documentation & Code Style

* extract no run list

* Add tutorial18 to no run list nightly

* import pprint correctly

* Update Documentation & Code Style

* try making site-packages editable

* Make pythonpath editable every time Tut17 is run on CI

* typo

* fix imports in tut5

* add git clean

* Update Documentation & Code Style

* add comments and remove `-e`

* accidentally deleted a line

* Update .github/utils/tutorials.sh

Co-authored-by: Massimiliano Pippi <mpippi@gmail.com>

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Massimiliano Pippi <mpippi@gmail.com>
This commit is contained in:
Sara Zan 2022-07-12 11:22:17 +02:00 committed by GitHub
parent 4d8f40425b
commit 091711b8c4
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 740 additions and 593 deletions

View File

@ -2,11 +2,11 @@
export LAUNCH_GRAPHDB=0 # See tut 10 - GraphDB is already running in CI
export TIKA_LOG_PATH=$PWD # Avoid permission denied errors while importing tika
set -e # Fails on any error in the following loop
python_path=$1
files_changed=$2
exclusion_list=$3
make_python_path_editable=$4
no_got_tutorials='4_FAQ_style_QA 5_Evaluation 7_RAG_Generator 8_Preprocessing 10_Knowledge_Graph 15_TableQA 16_Document_Classifier_at_Index_Time'
echo "Files changed in this PR: $files_changed"
@ -33,6 +33,7 @@ for script in $files_changed; do
scripts_to_run="$scripts_to_run $script"
done
failed=""
for script in $scripts_to_run; do
echo ""
@ -61,15 +62,54 @@ for script in $scripts_to_run; do
echo "NOT using reduced GoT dataset!"
fi
# FIXME Make the Python path editable
# espnet needs to edit files on the PYTHONPATH during execution. However, by default GH runners don't allow
# workflows to edit files in that directory, so for tutorials that use espnet we need to make PYTHONPATH
# editable first. For now that's only Tutorial 17.
# It is still unclear why this operation needs to be repeated, but if Tutorial 17 is run twice (once for the .py
# and once for the .ipynb version) the error reappears.
if [[ $make_python_path_editable == "EDITABLE" ]] && [[ "$script" == *"Tutorial17_"* ]]; then
sudo find $python_path/lib -type f -exec chmod 777 {} \;
fi
if [[ "$script" == *".py" ]]; then
time python $script
else
sudo $python_path/bin/ipython -c "%run $script"
fi
if [ ! $? -eq 0 ]; then
failed=$failed" "$script
fi
# Clean up datasets and SQLite DBs to avoid crashing the next tutorial
git clean -f
done
# causes permission errors on Post Cache
sudo rm -rf data/
sudo rm -rf /home/runner/work/haystack/haystack/elasticsearch-7.9.2/
sudo rm -rf /home/runner/work/haystack/haystack/elasticsearch-7.9.2/
if [[ $failed == "" ]]; then
echo ""
echo ""
echo "------------------------------------------"
echo " All tutorials were executed successfully "
echo "------------------------------------------"
exit 0
else
echo ""
echo "##################################################################################"
echo "## ##"
echo "## Some tutorials have failed! ##"
echo "## ##"
echo "##################################################################################"
for script in $failed; do
echo "## - $script"
done
echo "##################################################################################"
exit 1
fi

View File

@ -7,6 +7,11 @@ on:
- 'tutorials/*.*'
env:
# Tutorials that require a GPU to run, so can't be run on CI without self-hosted runners
DONT_RUN: Tutorial2_ Tutorial9_ Tutorial13_ Tutorial18_
jobs:
run:
@ -14,21 +19,9 @@ jobs:
steps:
- uses: actions/checkout@v2
with:
fetch-depth: 0
- run: echo "date=$(date +'%Y-%m-%d')" >> $GITHUB_ENV
- name: Set up Python 3.7
uses: actions/setup-python@v2
with:
python-version: 3.7
- name: Cache Python
uses: actions/cache@v2
with:
path: ${{ env.pythonLocation }}
key: linux-${{ env.date }}-${{ hashFiles('**/setup.py') }}-${{ hashFiles('**/setup.cfg') }}-${{ hashFiles('**/pyproject.toml') }}
- name: Setup Python
uses: ./.github/actions/python_cache/
- name: Run Elasticsearch
run: docker run -d -p 9200:9200 -e "discovery.type=single-node" -e "ES_JAVA_OPTS=-Xms128m -Xmx256m" elasticsearch:7.9.2
@ -42,8 +35,16 @@ jobs:
- name: Install pdftotext
run: wget --no-check-certificate https://dl.xpdfreader.com/xpdf-tools-linux-4.04.tar.gz && tar -xvf xpdf-tools-linux-4.04.tar.gz && sudo cp xpdf-tools-linux-4.04/bin64/pdftotext /usr/local/bin
- name: Install graphviz
run: sudo apt install libgraphviz-dev graphviz
# Some tutorials require these libraries to run
# - Tutorial 17 for the audio libs
# - Tutorial 11, 14, 15, 16 for pygraphviz
- name: Install graphviz and audio libs
run: sudo apt install libgraphviz-dev graphviz libsndfile1 ffmpeg
# Some tutorials require these libraries to run
# - Tutorial 15
- name: Install torch-scatter
run: pip install torch-scatter -f https://data.pyg.org/whl/torch-1.12.0+cpu.html
# Haystack needs to be reinstalled at this stage to make sure the current commit's version is the one getting tested.
# The cache can last way longer than a specific action's run, so an older Haystack version could be carried over.
@ -51,7 +52,6 @@ jobs:
run: |
pip install --upgrade pip
pip install .[all]
pip install torch-scatter -f https://data.pyg.org/whl/torch-1.12.0+cpu.html
pip install pygraphviz
pip install ipython nbformat
@ -71,4 +71,4 @@ jobs:
token: ${{ secrets.GITHUB_TOKEN }}
- name: Run tutorials
run: ./.github/utils/tutorials.sh ${{ env.pythonLocation }} "${{ steps.diff.outputs.added_modified }}" "Tutorial2_ Tutorial9_ Tutorial13_ Tutorial18_"
run: ./.github/utils/tutorials.sh ${{ env.pythonLocation }} "${{ steps.diff.outputs.added_modified }}" "${{ env.DONT_RUN }}" "EDITABLE"

View File

@ -6,6 +6,11 @@ on:
- cron: '0 0 * * *'
env:
# Tutorials that require a GPU to run, so can't be run on CI without self-hosted runners
DONT_RUN: Tutorial2_ Tutorial9_ Tutorial13_ Tutorial18_
jobs:
notebooks:
@ -13,19 +18,12 @@ jobs:
steps:
- uses: actions/checkout@v2
- run: echo "date=$(date +'%Y-%m-%d')" >> $GITHUB_ENV
- name: Set up Python 3.7
uses: actions/setup-python@v2
with:
python-version: 3.7
- name: Cache Python
uses: actions/cache@v2
with:
path: ${{ env.pythonLocation }}
key: linux-${{ env.date }}-${{ hashFiles('**/setup.py') }}-${{ hashFiles('**/setup.cfg') }}-${{ hashFiles('**/pyproject.toml') }}
- name: Run Elasticsearch
run: docker run -d -p 9200:9200 -e "discovery.type=single-node" -e "ES_JAVA_OPTS=-Xms128m -Xmx256m" elasticsearch:7.9.2
@ -38,16 +36,21 @@ jobs:
- name: Install pdftotext
run: wget --no-check-certificate https://dl.xpdfreader.com/xpdf-tools-linux-4.04.tar.gz && tar -xvf xpdf-tools-linux-4.04.tar.gz && sudo cp xpdf-tools-linux-4.04/bin64/pdftotext /usr/local/bin
- name: Install graphviz
run: sudo apt install libgraphviz-dev graphviz
# Some tutorials require these libraries to run
# - Tutorial 17 for the audio libs
# - Tutorial 11, 14, 15, 16 for pygraphviz
- name: Install graphviz and audio libs
run: sudo apt install libgraphviz-dev graphviz libsndfile1 ffmpeg
# Some tutorials require these libraries to run
# - Tutorial 15
- name: Install torch-scatter
run: pip install torch-scatter -f https://data.pyg.org/whl/torch-1.12.0+cpu.html
# Haystack needs to be reinstalled at this stage to make sure the current commit's version is the one getting tested.
# The cache can last way longer than a specific action's run, so an older Haystack version could be carried over.
- name: Reinstall Haystack
- name: Install Haystack
run: |
pip install --upgrade pip
pip install .[all]
pip install torch-scatter -f https://data.pyg.org/whl/torch-1.12.0+cpu.html
pip install pygraphviz
pip install ipython nbformat
@ -60,7 +63,7 @@ jobs:
rm wiki_gameofthrones_txt1_mini.zip
- name: Run tutorials
run: ./.github/utils/tutorials.sh ${{ env.pythonLocation }} "tutorials/*.ipynb" "Tutorial2_ Tutorial9_ Tutorial13_"
run: ./.github/utils/tutorials.sh ${{ env.pythonLocation }} "tutorials/*.ipynb" "${{ env.DONT_RUN }}"
scripts:
@ -68,19 +71,12 @@ jobs:
steps:
- uses: actions/checkout@v2
- run: echo "date=$(date +'%Y-%m-%d')" >> $GITHUB_ENV
- name: Set up Python 3.7
uses: actions/setup-python@v2
with:
python-version: 3.7
- name: Cache Python
uses: actions/cache@v2
with:
path: ${{ env.pythonLocation }}
key: linux-${{ env.date }}-${{ hashFiles('**/setup.py') }}-${{ hashFiles('**/setup.cfg') }}-${{ hashFiles('**/pyproject.toml') }}
- name: Run Elasticsearch
run: docker run -d -p 9200:9200 -e "discovery.type=single-node" -e "ES_JAVA_OPTS=-Xms128m -Xmx256m" elasticsearch:7.9.2
@ -92,17 +88,22 @@ jobs:
- name: Install pdftotext
run: wget --no-check-certificate https://dl.xpdfreader.com/xpdf-tools-linux-4.04.tar.gz && tar -xvf xpdf-tools-linux-4.04.tar.gz && sudo cp xpdf-tools-linux-4.04/bin64/pdftotext /usr/local/bin
# Some tutorials require these libraries to run
# - Tutorial 17 for the audio libs
# - Tutorial 11, 14, 15, 16 for pygraphviz
- name: Install graphviz and audio libs
run: sudo apt install libgraphviz-dev graphviz libsndfile1 ffmpeg
# Some tutorials require these libraries to run
# - Tutorial 15
- name: Install torch-scatter
run: pip install torch-scatter -f https://data.pyg.org/whl/torch-1.12.0+cpu.html
- name: Install graphviz
run: sudo apt install libgraphviz-dev graphviz
# Haystack needs to be reinstalled at this stage to make sure the current commit's version is the one getting tested.
# The cache can last way longer than a specific action's run, so an older Haystack version could be carried over.
- name: Reinstall Haystack
- name: Install Haystack
run: |
pip install --upgrade pip
pip install .[all]
pip install torch-scatter -f https://data.pyg.org/whl/torch-1.11.0+cpu.html
pip install pygraphviz
pip install ipython nbformat
@ -115,4 +116,4 @@ jobs:
rm wiki_gameofthrones_txt1_mini.zip
- name: Run tutorials
run: ./.github/utils/tutorials.sh ${{ env.pythonLocation }} "tutorials/*.py" "Tutorial2_ Tutorial9_ Tutorial13_"
run: ./.github/utils/tutorials.sh ${{ env.pythonLocation }} "tutorials/*.py" "${{ env.DONT_RUN }}"

View File

@ -97,7 +97,7 @@ file_paths = [p for p in Path(documents_path).glob("**/*")]
# NOTE: In this example we're going to use only one text file from the wiki, as the DocumentToSpeech node is quite slow
# on CPU machines. Comment out this line to use all documents from the dataset if your machine is powerful enough.
file_paths = [p for p in file_paths if "Arya_Stark" in p.name]
file_paths = [p for p in file_paths if "Stormborn" in p.name]
# Prepare some basic metadata for the files
files_metadata = [{"name": path.name} for path in file_paths]
@ -144,6 +144,37 @@ indexing_pipeline.add_node(document_store, name="document_store", inputs=["doc2s
output = indexing_pipeline.run(file_paths=file_paths, meta=files_metadata)
```
```python
from pprint import pprint
# You can now check the document store and verify that documents have been enriched with a path
# to the generated audio file
document = next(document_store.get_all_documents_generator())
pprint(document)
# Sample output:
#
# <Document: {
# 'content': "'Stormborn' received praise from critics, who considered Euron Greyjoy's raid on Yara's Iron Fleet,
# the assembly of Daenerys' allies at Dragonstone, and Arya's reunion with her direwolf Nymeria as
# highlights of the episode. In the United States, it achieved a viewership of 9.27 million in its
# initial broadcast.",
# 'content_type': 'audio',
# 'score': None,
# 'meta': {
# 'content_audio': './generated_audio_documents/f218707624d9c4f9487f508e4603bf5b.wav',
# '__initialised__': True,
# 'type': 'generative',
# '_split_id': 0,
# 'audio_format': 'wav',
# 'sample_rate': 22050,
# 'name': '2_Stormborn.txt'},
# 'embedding': None,
# 'id': '2733e698301f8f94eb70430b874177fd'
# }>
```
### Querying
Now we will create a pipeline very similar to the basic `ExtractiveQAPipeline` of Tutorial 1,
@ -189,7 +220,7 @@ pprint(prediction)
# {
# 'answers': [ <SpeechAnswer:
# answer_audio=PosixPath('generated_audio_answers/fc704210136643b833515ba628eb4b2a.wav'),
# answer="Eddard",
# answer="Daenerys Targaryen",
# context_audio=PosixPath('generated_audio_answers/8c562ebd7e7f41e1f9208384957df173.wav'),
# context='...'
# type='extractive', score=0.9919578731060028,
@ -197,7 +228,7 @@ pprint(prediction)
# document_id='cc75f739897ecbf8c14657b13dda890e', meta={'name': '43_Arya_Stark.txt'}} >,
# <SpeechAnswer:
# answer_audio=PosixPath('generated_audio_answers/07d6265486b22356362387c5a098ba7d.wav'),
# answer="Ned",
# answer="Daenerys",
# context_audio=PosixPath('generated_audio_answers/3f1ca228d6c4cfb633e55f89e97de7ac.wav'),
# context='...'
# type='extractive', score=0.9767240881919861,
@ -208,17 +239,17 @@ pprint(prediction)
# 'documents': [ <SpeechDocument:
# content_type='text', score=0.8034909798951382, meta={'name': '43_Arya_Stark.txt'}, embedding=None, id=d1f36ec7170e4c46cde65787fe125dfe',
# content_audio=PosixPath('generated_audio_documents/07d6265486b22356362387c5a098ba7d.wav'),
# content='\n===\'\'A Game of Thrones\'\'===\nSansa Stark begins the novel by being betrothed to Crown ...'>,
# content='The title of the episode refers to both Daenerys Targaryen, who was born during a ...'>,
# <SpeechDocument:
# content_type='text', score=0.8002150354529785, meta={'name': '191_Gendry.txt'}, embedding=None, id='dd4e070a22896afa81748d6510006d2',
# content_audio=PosixPath('generated_audio_documents/07d6265486b22356362387c5a098ba7d.wav'),
# content='\n===Season 2===\nGendry travels North with Yoren and other Night's Watch recruits, including Arya ...'>,
# content='"Stormborn" received praise from critics, who considered Euron Greyjoy's raid on ...'>,
# ...
# ],
# 'no_ans_gap': 11.688868522644043,
# 'node_id': 'Reader',
# 'params': {'Reader': {'top_k': 5}, 'Retriever': {'top_k': 5}},
# 'query': 'Who is the father of Arya Stark?',
# 'query': 'Who was born during a storm?',
# 'root_node': 'Query'
# }
```
@ -231,27 +262,48 @@ from haystack.utils import print_answers
# Change `minimum` to `medium` or `all` to raise the level of detail
print_answers(prediction, details="minimum")
# Sample output:
#
# Query: Who is the father of Arya Stark?
# Query: Who was born during a storm?
# Answers:
# [ { 'answer_audio': PosixPath('generated_audio_answers/07d6265486b22356362387c5a098ba7d.wav'),
# 'answer': 'Eddard',
# 'answer': 'Daenerys Targaryen',
# 'context_transcript': PosixPath('generated_audio_answers/3f1ca228d6c4cfb633e55f89e97de7ac.wav'),
# 'context': ' role of Arya Stark in the television series. '
# 'Arya accompanies her father Eddard and her sister '
# 'Sansa to King's Landing. Before their departure, Arya's h'},
# 'context': ' refers to both Daenerys Targaryen, who was born during a terrible storm, and '},
# { 'answer_audio': PosixPath('generated_audio_answers/83c3a02141cac4caffe0718cfd6c405c.wav'),
# 'answer': 'Lord Eddard Stark',
# 'answer': 'Daenerys',
# 'context_audio': PosixPath('generated_audio_answers/8c562ebd7e7f41e1f9208384957df173.wav'),
# 'context': 'ark daughters. During the Tourney of the Hand '
# 'to honour her father Lord Eddard Stark, Sansa '
# 'Stark is enchanted by the knights performing in '
# 'the event.'},
# 'context': 'The title of the episode refers to both Daenerys Targaryen, who was born during a terrible storm'},
# ...
```
```python
# The document the first answer was extracted from
original_document = [doc for doc in prediction["documents"] if doc.id == prediction["answers"][0].document_id][0]
pprint(original_document)
# Sample output
#
# <Document: {
# 'content': '"'''Stormborn'''" is the second episode of the seventh season of HBO's fantasy television
# series ''Game of Thrones'', and the 62nd overall. The episode was written by Bryan Cogman,
# and directed by Mark Mylod. The title of the episode refers to both Daenerys Targaryen,
# who was born during a terrible storm, and Euron Greyjoy, who declares himself to be "the storm".',
# 'content_type': 'audio',
# 'score': 0.6269117688771539,
# 'embedding': None,
# 'id': '9352f650b36f93ab99684fd4746af5c1'
# 'meta': {
# 'content_audio': '/home/sara/work/haystack/generated_audio_documents/2c9223d47801b0918f2db2ad778c3d5a.wav',
# 'type': 'generative',
# '_split_id': 19,
# 'audio_format': 'wav',
# 'sample_rate': 22050,
# 'name': '2_Stormborn.txt'}
# }>
```
### Hear them out!

File diff suppressed because it is too large Load Diff

View File

@ -41,7 +41,7 @@ def tutorial17_audio_features():
# Note: In this example, we're going to use only one text file from the wiki, as the DocumentToSpeech node is relatively slow
# on CPU machines. Comment out this line to use all documents from the dataset if your machine is powerful enough.
file_paths = [p for p in file_paths if "Arya_Stark" in p.name]
file_paths = [p for p in file_paths if "Stormborn" in p.name]
# Prepare some basic metadata for the files
files_metadata = [{"name": path.name} for path in file_paths]
@ -91,10 +91,10 @@ def tutorial17_audio_features():
# Sample output:
#
# <Document: {
# 'content': "\n\n'''Arya Stark''' is a fictional character in American author George R. R. Martin's ''A Song of Ice and Fire'' epic fantasy novel series.
# She is a prominent point of view character in the novels with the third most viewpoint chapters, and is the only viewpoint character to have appeared in every published
# book of the series. Introduced in 1996's ''A Game of Thrones'', Arya is the third child and younger daughter of Lord Eddard Stark and his wife Lady Catelyn Stark. She is tomboyish,
# headstrong, feisty, independent, disdains traditional female pursuits, and is often mistaken for a boy.",
# 'content': "'Stormborn' received praise from critics, who considered Euron Greyjoy's raid on Yara's Iron Fleet,
# the assembly of Daenerys' allies at Dragonstone, and Arya's reunion with her direwolf Nymeria as
# highlights of the episode. In the United States, it achieved a viewership of 9.27 million in its
# initial broadcast.",
# 'content_type': 'audio',
# 'score': None,
# 'meta': {
@ -104,7 +104,7 @@ def tutorial17_audio_features():
# '_split_id': 0,
# 'audio_format': 'wav',
# 'sample_rate': 22050,
# 'name': '43_Arya_Stark.txt'},
# 'name': '2_Stormborn.txt'},
# 'embedding': None,
# 'id': '2733e698301f8f94eb70430b874177fd'
# }>
@ -129,7 +129,7 @@ def tutorial17_audio_features():
audio_pipeline.add_node(answer2speech, name="AnswerToSpeech", inputs=["Reader"])
prediction = audio_pipeline.run(
query="Who is the father of Arya Stark?", params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}}
query="Who was born during a storm?", params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}}
)
# Now you can either print the object directly
@ -140,7 +140,7 @@ def tutorial17_audio_features():
# {
# 'answers': [ <SpeechAnswer:
# answer_audio=PosixPath('generated_audio_answers/fc704210136643b833515ba628eb4b2a.wav'),
# answer="Eddard",
# answer="Daenerys Targaryen",
# context_audio=PosixPath('generated_audio_answers/8c562ebd7e7f41e1f9208384957df173.wav'),
# context='...'
# type='extractive', score=0.9919578731060028,
@ -148,7 +148,7 @@ def tutorial17_audio_features():
# document_id='cc75f739897ecbf8c14657b13dda890e', meta={'name': '43_Arya_Stark.txt'}} >,
# <SpeechAnswer:
# answer_audio=PosixPath('generated_audio_answers/07d6265486b22356362387c5a098ba7d.wav'),
# answer="Ned",
# answer="Daenerys",
# context_audio=PosixPath('generated_audio_answers/3f1ca228d6c4cfb633e55f89e97de7ac.wav'),
# context='...'
# type='extractive', score=0.9767240881919861,
@ -159,17 +159,17 @@ def tutorial17_audio_features():
# 'documents': [ <SpeechDocument:
# content_type='text', score=0.8034909798951382, meta={'name': '43_Arya_Stark.txt'}, embedding=None, id=d1f36ec7170e4c46cde65787fe125dfe',
# content_audio=PosixPath('generated_audio_documents/07d6265486b22356362387c5a098ba7d.wav'),
# content='\n===\'\'A Game of Thrones\'\'===\nSansa Stark begins the novel by being betrothed to Crown ...'>,
# content='The title of the episode refers to both Daenerys Targaryen, who was born during a ...'>,
# <SpeechDocument:
# content_type='text', score=0.8002150354529785, meta={'name': '191_Gendry.txt'}, embedding=None, id='dd4e070a22896afa81748d6510006d2',
# content_audio=PosixPath('generated_audio_documents/07d6265486b22356362387c5a098ba7d.wav'),
# content='\n===Season 2===\nGendry travels North with Yoren and other Night's Watch recruits, including Arya ...'>,
# content='"Stormborn" received praise from critics, who considered Euron Greyjoy's raid on ...'>,
# ...
# ],
# 'no_ans_gap': 11.688868522644043,
# 'node_id': 'Reader',
# 'params': {'Reader': {'top_k': 5}, 'Retriever': {'top_k': 5}},
# 'query': 'Who is the father of Arya Stark?',
# 'query': 'Who was born during a storm?',
# 'root_node': 'Query'
# }
@ -180,36 +180,29 @@ def tutorial17_audio_features():
# Sample output:
#
# Query: Who is the father of Arya Stark?
# Query: Who was born during a storm?
# Answers:
# [ { 'answer_audio': PosixPath('generated_audio_answers/07d6265486b22356362387c5a098ba7d.wav'),
# 'answer': 'Eddard',
# 'answer': 'Daenerys Targaryen',
# 'context_transcript': PosixPath('generated_audio_answers/3f1ca228d6c4cfb633e55f89e97de7ac.wav'),
# 'context': ' role of Arya Stark in the television series. '
# 'Arya accompanies her father Eddard and her sister '
# 'Sansa to King's Landing. Before their departure, Arya's h'},
# 'context': ' refers to both Daenerys Targaryen, who was born during a terrible storm, and '},
# { 'answer_audio': PosixPath('generated_audio_answers/83c3a02141cac4caffe0718cfd6c405c.wav'),
# 'answer': 'Lord Eddard Stark',
# 'answer': 'Daenerys',
# 'context_audio': PosixPath('generated_audio_answers/8c562ebd7e7f41e1f9208384957df173.wav'),
# 'context': 'ark daughters. During the Tourney of the Hand '
# 'to honour her father Lord Eddard Stark, Sansa '
# 'Stark is enchanted by the knights performing in '
# 'the event.'},
# 'context': 'The title of the episode refers to both Daenerys Targaryen, who was born during a terrible storm'},
# ...
# The document the first answer was extracted from
# The document the first answer was extracted from
original_document = [doc for doc in prediction["documents"] if doc.id == prediction["answers"][0].document_id][0]
pprint(original_document)
# Sample output
#
# <Document: {
# 'content': '== Storylines ==\n=== Novels ===\n==== \'\'A Game of Thrones\'\' ====\nCoat of arms of House Stark\n\n
# Arya adopts a direwolf cub, which she names Nymeria after a legendary warrior queen. She travels with
# her father, Eddard, to King\'s Landing when he is made Hand of the King. Before she leaves, her
# half-brother Jon Snow has a smallsword made for her as a parting gift, which she names "Needle" after
# her least favorite ladylike activity. While taking a walk together, Prince Joffrey and her sister Sansa
# happen upon Arya and her friend, the low-born butcher apprentice Mycah, sparring in the woods with broomsticks.',
# 'content': '"'''Stormborn'''" is the second episode of the seventh season of HBO's fantasy television
# series ''Game of Thrones'', and the 62nd overall. The episode was written by Bryan Cogman,
# and directed by Mark Mylod. The title of the episode refers to both Daenerys Targaryen,
# who was born during a terrible storm, and Euron Greyjoy, who declares himself to be "the storm".',
# 'content_type': 'audio',
# 'score': 0.6269117688771539,
# 'embedding': None,
@ -220,13 +213,14 @@ def tutorial17_audio_features():
# '_split_id': 19,
# 'audio_format': 'wav',
# 'sample_rate': 22050,
# 'name': '43_Arya_Stark.txt'}
# 'name': '2_Stormborn.txt'}
# }>
if __name__ == "__main__":
tutorial17_audio_features()
# This Haystack script was made with love by deepset in Berlin, Germany
# Haystack: https://github.com/deepset-ai/haystack
# deepset: https://deepset.ai/

View File

@ -1,10 +1,20 @@
from haystack.document_stores import ElasticsearchDocumentStore
from haystack.nodes import BM25Retriever, DensePassageRetriever, EmbeddingRetriever, FARMReader, PreProcessor
import logging
import tempfile
from pathlib import Path
from haystack.document_stores import ElasticsearchDocumentStore, InMemoryDocumentStore
from haystack.pipelines import Pipeline, ExtractiveQAPipeline, DocumentSearchPipeline
from haystack.nodes import (
BM25Retriever,
DensePassageRetriever,
EmbeddingRetriever,
FARMReader,
PreProcessor,
TextConverter,
)
from haystack.utils import fetch_archive_from_http, launch_es
from haystack.pipelines import ExtractiveQAPipeline, DocumentSearchPipeline
from haystack.schema import Answer, Document, EvaluationResult, Label, MultiLabel, Span
import logging
logger = logging.getLogger(__name__)
@ -62,7 +72,6 @@ def tutorial5_evaluation():
)
# Initialize Retriever
from haystack.nodes import BM25Retriever
retriever = BM25Retriever(document_store=document_store)
@ -277,11 +286,6 @@ def tutorial5_evaluation():
# ### Preprocessing the dataset
# Preprocessing the dataset works a bit differently than before. Instead of directly generating documents (and labels) out of a SQuAD file, we first save them to disk. This is necessary to experiment with different indexing pipelines.
import tempfile
from pathlib import Path
from haystack.nodes import PreProcessor
from haystack.document_stores import InMemoryDocumentStore
document_store = InMemoryDocumentStore()
label_preprocessor = PreProcessor(
@ -323,10 +327,6 @@ def tutorial5_evaluation():
# In this experiment we evaluate extractive QA pipelines with two different retrievers on the evaluation set given the corpus:
# **ElasticsearchRetriever vs. EmbeddingRetriever**
from haystack.nodes import BM25Retriever, EmbeddingRetriever, FARMReader, TextConverter
from haystack.pipelines import Pipeline
from haystack.document_stores import ElasticsearchDocumentStore
# helper function to create query and index pipeline
def create_pipelines(document_store, preprocessor, retriever, reader):
query_pipeline = Pipeline()