mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-12-26 14:38:36 +00:00
Fix Tutorials and Tutorials (nightly) (#2737)
* Remove caching and install audio deps * Fix `Tutorials` as well * Run all tutorials even though some fail * Forgot fi * fix failure condition * proper bash string equality * Enable debug logs * remove audio files * Update Documentation & Code Style * Use the setup action in the Tutorial CI as well * Try with a file that exists * Update Documentation & Code Style * Fix the comments in the tutorials * Update Documentation & Code Style * Fix tutorials.sh * Remove debug logging * import pprint and try editable install * Update Documentation & Code Style * extract no run list * Add tutorial18 to no run list nightly * import pprint correctly * Update Documentation & Code Style * try making site-packages editable * Make pythonpath editable every time Tut17 is run on CI * typo * fix imports in tut5 * add git clean * Update Documentation & Code Style * add comments and remove` -e` * accidentally deleted a line * Update .github/utils/tutorials.sh Co-authored-by: Massimiliano Pippi <mpippi@gmail.com> Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: Massimiliano Pippi <mpippi@gmail.com>
This commit is contained in:
parent
4d8f40425b
commit
091711b8c4
44
.github/utils/tutorials.sh
vendored
44
.github/utils/tutorials.sh
vendored
@ -2,11 +2,11 @@
|
||||
|
||||
export LAUNCH_GRAPHDB=0 # See tut 10 - GraphDB is already running in CI
|
||||
export TIKA_LOG_PATH=$PWD # Avoid permission denied errors while importing tika
|
||||
set -e # Fails on any error in the following loop
|
||||
|
||||
python_path=$1
|
||||
files_changed=$2
|
||||
exclusion_list=$3
|
||||
make_python_path_editable=$4
|
||||
no_got_tutorials='4_FAQ_style_QA 5_Evaluation 7_RAG_Generator 8_Preprocessing 10_Knowledge_Graph 15_TableQA 16_Document_Classifier_at_Index_Time'
|
||||
|
||||
echo "Files changed in this PR: $files_changed"
|
||||
@ -33,6 +33,7 @@ for script in $files_changed; do
|
||||
scripts_to_run="$scripts_to_run $script"
|
||||
done
|
||||
|
||||
failed=""
|
||||
for script in $scripts_to_run; do
|
||||
|
||||
echo ""
|
||||
@ -61,15 +62,54 @@ for script in $scripts_to_run; do
|
||||
echo "NOT using reduced GoT dataset!"
|
||||
fi
|
||||
|
||||
# FIXME Make the Python path editable
|
||||
# espnet needs to edit files on the PYTHONPATH during execution. However, by default GH runners don't allow
|
||||
# workflows to edit files in that directory, so in case of tutorials using espnet, we need to make PYTHONPATH
|
||||
# editable first. For now it's only Tutorial 17.
|
||||
# Still unclear why it's needed to repeat this operation, but if Tutorial 17 is run twice (once for the .py
|
||||
# and once for .ipynb version) the error re-appears.
|
||||
if [[ $make_python_path_editable == "EDITABLE" ]] && [[ "$script" == *"Tutorial17_"* ]]; then
|
||||
sudo find $python_path/lib -type f -exec chmod 777 {} \;
|
||||
fi
|
||||
|
||||
if [[ "$script" == *".py" ]]; then
|
||||
time python $script
|
||||
else
|
||||
sudo $python_path/bin/ipython -c "%run $script"
|
||||
fi
|
||||
|
||||
if [ ! $? -eq 0 ]; then
|
||||
failed=$failed" "$script
|
||||
fi
|
||||
|
||||
# Clean up datasets and SQLite DBs to avoid crashing the next tutorial
|
||||
git clean -f
|
||||
|
||||
done
|
||||
|
||||
# causes permission errors on Post Cache
|
||||
sudo rm -rf data/
|
||||
sudo rm -rf /home/runner/work/haystack/haystack/elasticsearch-7.9.2/
|
||||
sudo rm -rf /home/runner/work/haystack/haystack/elasticsearch-7.9.2/
|
||||
|
||||
|
||||
if [[ $failed == "" ]]; then
|
||||
echo ""
|
||||
echo ""
|
||||
echo "------------------------------------------"
|
||||
echo " All tutorials were executed successfully "
|
||||
echo "------------------------------------------"
|
||||
exit 0
|
||||
|
||||
else
|
||||
echo ""
|
||||
echo "##################################################################################"
|
||||
echo "## ##"
|
||||
echo "## Some tutorials have failed! ##"
|
||||
echo "## ##"
|
||||
echo "##################################################################################"
|
||||
for script in $failed; do
|
||||
echo "## - $script"
|
||||
done
|
||||
echo "##################################################################################"
|
||||
exit 1
|
||||
fi
|
||||
36
.github/workflows/tutorials.yml
vendored
36
.github/workflows/tutorials.yml
vendored
@ -7,6 +7,11 @@ on:
|
||||
- 'tutorials/*.*'
|
||||
|
||||
|
||||
env:
|
||||
# Tutorials that require a GPU to run, so can't be run on CI without self-hosted runners
|
||||
DONT_RUN: Tutorial2_ Tutorial9_ Tutorial13_ Tutorial18_
|
||||
|
||||
|
||||
jobs:
|
||||
|
||||
run:
|
||||
@ -14,21 +19,9 @@ jobs:
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- run: echo "date=$(date +'%Y-%m-%d')" >> $GITHUB_ENV
|
||||
|
||||
- name: Set up Python 3.7
|
||||
uses: actions/setup-python@v2
|
||||
with:
|
||||
python-version: 3.7
|
||||
|
||||
- name: Cache Python
|
||||
uses: actions/cache@v2
|
||||
with:
|
||||
path: ${{ env.pythonLocation }}
|
||||
key: linux-${{ env.date }}-${{ hashFiles('**/setup.py') }}-${{ hashFiles('**/setup.cfg') }}-${{ hashFiles('**/pyproject.toml') }}
|
||||
- name: Setup Python
|
||||
uses: ./.github/actions/python_cache/
|
||||
|
||||
- name: Run Elasticsearch
|
||||
run: docker run -d -p 9200:9200 -e "discovery.type=single-node" -e "ES_JAVA_OPTS=-Xms128m -Xmx256m" elasticsearch:7.9.2
|
||||
@ -42,8 +35,16 @@ jobs:
|
||||
- name: Install pdftotext
|
||||
run: wget --no-check-certificate https://dl.xpdfreader.com/xpdf-tools-linux-4.04.tar.gz && tar -xvf xpdf-tools-linux-4.04.tar.gz && sudo cp xpdf-tools-linux-4.04/bin64/pdftotext /usr/local/bin
|
||||
|
||||
- name: Install graphviz
|
||||
run: sudo apt install libgraphviz-dev graphviz
|
||||
# Some tutorials require these libraries to run
|
||||
# - Tutorial 17 for the audio libs
|
||||
# - Tutorial 11, 14, 15, 16 for pygraphviz
|
||||
- name: Install graphviz and audio libs
|
||||
run: sudo apt install libgraphviz-dev graphviz libsndfile1 ffmpeg
|
||||
|
||||
# Some tutorials require these libraries to run
|
||||
# - Tutorial 15
|
||||
- name: Install torch-scatter
|
||||
run: pip install torch-scatter -f https://data.pyg.org/whl/torch-1.12.0+cpu.html
|
||||
|
||||
# Haystack needs to be reinstalled at this stage to make sure the current commit's version is the one getting tested.
|
||||
# The cache can last way longer than a specific action's run, so an older Haystack version could be carried over.
|
||||
@ -51,7 +52,6 @@ jobs:
|
||||
run: |
|
||||
pip install --upgrade pip
|
||||
pip install .[all]
|
||||
pip install torch-scatter -f https://data.pyg.org/whl/torch-1.12.0+cpu.html
|
||||
pip install pygraphviz
|
||||
pip install ipython nbformat
|
||||
|
||||
@ -71,4 +71,4 @@ jobs:
|
||||
token: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
- name: Run tutorials
|
||||
run: ./.github/utils/tutorials.sh ${{ env.pythonLocation }} "${{ steps.diff.outputs.added_modified }}" "Tutorial2_ Tutorial9_ Tutorial13_ Tutorial18_"
|
||||
run: ./.github/utils/tutorials.sh ${{ env.pythonLocation }} "${{ steps.diff.outputs.added_modified }}" "${{ env.DONT_RUN }}" "EDITABLE"
|
||||
|
||||
59
.github/workflows/tutorials_nightly.yml
vendored
59
.github/workflows/tutorials_nightly.yml
vendored
@ -6,6 +6,11 @@ on:
|
||||
- cron: '0 0 * * *'
|
||||
|
||||
|
||||
env:
|
||||
# Tutorials that require a GPU to run, so can't be run on CI without self-hosted runners
|
||||
DONT_RUN: Tutorial2_ Tutorial9_ Tutorial13_ Tutorial18_
|
||||
|
||||
|
||||
jobs:
|
||||
|
||||
notebooks:
|
||||
@ -13,19 +18,12 @@ jobs:
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
- run: echo "date=$(date +'%Y-%m-%d')" >> $GITHUB_ENV
|
||||
|
||||
- name: Set up Python 3.7
|
||||
uses: actions/setup-python@v2
|
||||
with:
|
||||
python-version: 3.7
|
||||
|
||||
- name: Cache Python
|
||||
uses: actions/cache@v2
|
||||
with:
|
||||
path: ${{ env.pythonLocation }}
|
||||
key: linux-${{ env.date }}-${{ hashFiles('**/setup.py') }}-${{ hashFiles('**/setup.cfg') }}-${{ hashFiles('**/pyproject.toml') }}
|
||||
|
||||
- name: Run Elasticsearch
|
||||
run: docker run -d -p 9200:9200 -e "discovery.type=single-node" -e "ES_JAVA_OPTS=-Xms128m -Xmx256m" elasticsearch:7.9.2
|
||||
|
||||
@ -38,16 +36,21 @@ jobs:
|
||||
- name: Install pdftotext
|
||||
run: wget --no-check-certificate https://dl.xpdfreader.com/xpdf-tools-linux-4.04.tar.gz && tar -xvf xpdf-tools-linux-4.04.tar.gz && sudo cp xpdf-tools-linux-4.04/bin64/pdftotext /usr/local/bin
|
||||
|
||||
- name: Install graphviz
|
||||
run: sudo apt install libgraphviz-dev graphviz
|
||||
# Some tutorials require these libraries to run
|
||||
# - Tutorial 17 for the audio libs
|
||||
# - Tutorial 11, 14, 15, 16 for pygraphviz
|
||||
- name: Install graphviz and audio libs
|
||||
run: sudo apt install libgraphviz-dev graphviz libsndfile1 ffmpeg
|
||||
|
||||
# Some tutorials require these libraries to run
|
||||
# - Tutorial 15
|
||||
- name: Install torch-scatter
|
||||
run: pip install torch-scatter -f https://data.pyg.org/whl/torch-1.12.0+cpu.html
|
||||
|
||||
# Haystack needs to be reinstalled at this stage to make sure the current commit's version is the one getting tested.
|
||||
# The cache can last way longer than a specific action's run, so an older Haystack version could be carried over.
|
||||
- name: Reinstall Haystack
|
||||
- name: Install Haystack
|
||||
run: |
|
||||
pip install --upgrade pip
|
||||
pip install .[all]
|
||||
pip install torch-scatter -f https://data.pyg.org/whl/torch-1.12.0+cpu.html
|
||||
pip install pygraphviz
|
||||
pip install ipython nbformat
|
||||
|
||||
@ -60,7 +63,7 @@ jobs:
|
||||
rm wiki_gameofthrones_txt1_mini.zip
|
||||
|
||||
- name: Run tutorials
|
||||
run: ./.github/utils/tutorials.sh ${{ env.pythonLocation }} "tutorials/*.ipynb" "Tutorial2_ Tutorial9_ Tutorial13_"
|
||||
run: ./.github/utils/tutorials.sh ${{ env.pythonLocation }} "tutorials/*.ipynb" "${{ env.DONT_RUN }}"
|
||||
|
||||
|
||||
scripts:
|
||||
@ -68,19 +71,12 @@ jobs:
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
- run: echo "date=$(date +'%Y-%m-%d')" >> $GITHUB_ENV
|
||||
|
||||
- name: Set up Python 3.7
|
||||
uses: actions/setup-python@v2
|
||||
with:
|
||||
python-version: 3.7
|
||||
|
||||
- name: Cache Python
|
||||
uses: actions/cache@v2
|
||||
with:
|
||||
path: ${{ env.pythonLocation }}
|
||||
key: linux-${{ env.date }}-${{ hashFiles('**/setup.py') }}-${{ hashFiles('**/setup.cfg') }}-${{ hashFiles('**/pyproject.toml') }}
|
||||
|
||||
- name: Run Elasticsearch
|
||||
run: docker run -d -p 9200:9200 -e "discovery.type=single-node" -e "ES_JAVA_OPTS=-Xms128m -Xmx256m" elasticsearch:7.9.2
|
||||
|
||||
@ -92,17 +88,22 @@ jobs:
|
||||
|
||||
- name: Install pdftotext
|
||||
run: wget --no-check-certificate https://dl.xpdfreader.com/xpdf-tools-linux-4.04.tar.gz && tar -xvf xpdf-tools-linux-4.04.tar.gz && sudo cp xpdf-tools-linux-4.04/bin64/pdftotext /usr/local/bin
|
||||
|
||||
# Some tutorials require these libraries to run
|
||||
# - Tutorial 17 for the audio libs
|
||||
# - Tutorial 11, 14, 15, 16 for pygraphviz
|
||||
- name: Install graphviz and audio libs
|
||||
run: sudo apt install libgraphviz-dev graphviz libsndfile1 ffmpeg
|
||||
|
||||
# Some tutorials require these libraries to run
|
||||
# - Tutorial 15
|
||||
- name: Install torch-scatter
|
||||
run: pip install torch-scatter -f https://data.pyg.org/whl/torch-1.12.0+cpu.html
|
||||
|
||||
- name: Install graphviz
|
||||
run: sudo apt install libgraphviz-dev graphviz
|
||||
|
||||
# Haystack needs to be reinstalled at this stage to make sure the current commit's version is the one getting tested.
|
||||
# The cache can last way longer than a specific action's run, so an older Haystack version could be carried over.
|
||||
- name: Reinstall Haystack
|
||||
- name: Install Haystack
|
||||
run: |
|
||||
pip install --upgrade pip
|
||||
pip install .[all]
|
||||
pip install torch-scatter -f https://data.pyg.org/whl/torch-1.11.0+cpu.html
|
||||
pip install pygraphviz
|
||||
pip install ipython nbformat
|
||||
|
||||
@ -115,4 +116,4 @@ jobs:
|
||||
rm wiki_gameofthrones_txt1_mini.zip
|
||||
|
||||
- name: Run tutorials
|
||||
run: ./.github/utils/tutorials.sh ${{ env.pythonLocation }} "tutorials/*.py" "Tutorial2_ Tutorial9_ Tutorial13_"
|
||||
run: ./.github/utils/tutorials.sh ${{ env.pythonLocation }} "tutorials/*.py" "${{ env.DONT_RUN }}"
|
||||
|
||||
@ -97,7 +97,7 @@ file_paths = [p for p in Path(documents_path).glob("**/*")]
|
||||
|
||||
# NOTE: In this example we're going to use only one text file from the wiki, as the DocumentToSpeech node is quite slow
|
||||
# on CPU machines. Comment out this line to use all documents from the dataset if your machine is powerful enough.
|
||||
file_paths = [p for p in file_paths if "Arya_Stark" in p.name]
|
||||
file_paths = [p for p in file_paths if "Stormborn" in p.name]
|
||||
|
||||
# Prepare some basic metadata for the files
|
||||
files_metadata = [{"name": path.name} for path in file_paths]
|
||||
@ -144,6 +144,37 @@ indexing_pipeline.add_node(document_store, name="document_store", inputs=["doc2s
|
||||
output = indexing_pipeline.run(file_paths=file_paths, meta=files_metadata)
|
||||
```
|
||||
|
||||
|
||||
```python
|
||||
from pprint import pprint
|
||||
|
||||
# You can now check the document store and verify that documents have been enriched with a path
|
||||
# to the generated audio file
|
||||
document = next(document_store.get_all_documents_generator())
|
||||
pprint(document)
|
||||
|
||||
# Sample output:
|
||||
#
|
||||
# <Document: {
|
||||
# 'content': "'Stormborn' received praise from critics, who considered Euron Greyjoy's raid on Yara's Iron Fleet,
|
||||
# the assembly of Daenerys' allies at Dragonstone, and Arya's reunion with her direwolf Nymeria as
|
||||
# highlights of the episode. In the United States, it achieved a viewership of 9.27 million in its
|
||||
# initial broadcast.",
|
||||
# 'content_type': 'audio',
|
||||
# 'score': None,
|
||||
# 'meta': {
|
||||
# 'content_audio': './generated_audio_documents/f218707624d9c4f9487f508e4603bf5b.wav',
|
||||
# '__initialised__': True,
|
||||
# 'type': 'generative',
|
||||
# '_split_id': 0,
|
||||
# 'audio_format': 'wav',
|
||||
# 'sample_rate': 22050,
|
||||
# 'name': '2_Stormborn.txt'},
|
||||
# 'embedding': None,
|
||||
# 'id': '2733e698301f8f94eb70430b874177fd'
|
||||
# }>
|
||||
```
|
||||
|
||||
### Querying
|
||||
|
||||
Now we will create a pipeline very similar to the basic `ExtractiveQAPipeline` of Tutorial 1,
|
||||
@ -189,7 +220,7 @@ pprint(prediction)
|
||||
# {
|
||||
# 'answers': [ <SpeechAnswer:
|
||||
# answer_audio=PosixPath('generated_audio_answers/fc704210136643b833515ba628eb4b2a.wav'),
|
||||
# answer="Eddard",
|
||||
# answer="Daenerys Targaryen",
|
||||
# context_audio=PosixPath('generated_audio_answers/8c562ebd7e7f41e1f9208384957df173.wav'),
|
||||
# context='...'
|
||||
# type='extractive', score=0.9919578731060028,
|
||||
@ -197,7 +228,7 @@ pprint(prediction)
|
||||
# document_id='cc75f739897ecbf8c14657b13dda890e', meta={'name': '43_Arya_Stark.txt'}} >,
|
||||
# <SpeechAnswer:
|
||||
# answer_audio=PosixPath('generated_audio_answers/07d6265486b22356362387c5a098ba7d.wav'),
|
||||
# answer="Ned",
|
||||
# answer="Daenerys",
|
||||
# context_audio=PosixPath('generated_audio_answers/3f1ca228d6c4cfb633e55f89e97de7ac.wav'),
|
||||
# context='...'
|
||||
# type='extractive', score=0.9767240881919861,
|
||||
@ -208,17 +239,17 @@ pprint(prediction)
|
||||
# 'documents': [ <SpeechDocument:
|
||||
# content_type='text', score=0.8034909798951382, meta={'name': '43_Arya_Stark.txt'}, embedding=None, id=d1f36ec7170e4c46cde65787fe125dfe',
|
||||
# content_audio=PosixPath('generated_audio_documents/07d6265486b22356362387c5a098ba7d.wav'),
|
||||
# content='\n===\'\'A Game of Thrones\'\'===\nSansa Stark begins the novel by being betrothed to Crown ...'>,
|
||||
# content='The title of the episode refers to both Daenerys Targaryen, who was born during a ...'>,
|
||||
# <SpeechDocument:
|
||||
# content_type='text', score=0.8002150354529785, meta={'name': '191_Gendry.txt'}, embedding=None, id='dd4e070a22896afa81748d6510006d2',
|
||||
# content_audio=PosixPath('generated_audio_documents/07d6265486b22356362387c5a098ba7d.wav'),
|
||||
# content='\n===Season 2===\nGendry travels North with Yoren and other Night's Watch recruits, including Arya ...'>,
|
||||
# content='"Stormborn" received praise from critics, who considered Euron Greyjoy's raid on ...'>,
|
||||
# ...
|
||||
# ],
|
||||
# 'no_ans_gap': 11.688868522644043,
|
||||
# 'node_id': 'Reader',
|
||||
# 'params': {'Reader': {'top_k': 5}, 'Retriever': {'top_k': 5}},
|
||||
# 'query': 'Who is the father of Arya Stark?',
|
||||
# 'query': 'Who was born during a storm?',
|
||||
# 'root_node': 'Query'
|
||||
# }
|
||||
```
|
||||
@ -231,27 +262,48 @@ from haystack.utils import print_answers
|
||||
# Change `minimum` to `medium` or `all` to raise the level of detail
|
||||
print_answers(prediction, details="minimum")
|
||||
|
||||
|
||||
# Sample output:
|
||||
#
|
||||
# Query: Who is the father of Arya Stark?
|
||||
# Query: Who was born during a storm?
|
||||
# Answers:
|
||||
# [ { 'answer_audio': PosixPath('generated_audio_answers/07d6265486b22356362387c5a098ba7d.wav'),
|
||||
# 'answer': 'Eddard',
|
||||
# 'answer': 'Daenerys Targaryen',
|
||||
# 'context_audio': PosixPath('generated_audio_answers/3f1ca228d6c4cfb633e55f89e97de7ac.wav'),
|
||||
# 'context': ' role of Arya Stark in the television series. '
|
||||
# 'Arya accompanies her father Eddard and her sister '
|
||||
# 'Sansa to King's Landing. Before their departure, Arya's h'},
|
||||
# 'context': ' refers to both Daenerys Targaryen, who was born during a terrible storm, and '},
|
||||
# { 'answer_audio': PosixPath('generated_audio_answers/83c3a02141cac4caffe0718cfd6c405c.wav'),
|
||||
# 'answer': 'Lord Eddard Stark',
|
||||
# 'answer': 'Daenerys',
|
||||
# 'context_audio': PosixPath('generated_audio_answers/8c562ebd7e7f41e1f9208384957df173.wav'),
|
||||
# 'context': 'ark daughters. During the Tourney of the Hand '
|
||||
# 'to honour her father Lord Eddard Stark, Sansa '
|
||||
# 'Stark is enchanted by the knights performing in '
|
||||
# 'the event.'},
|
||||
# 'context': 'The title of the episode refers to both Daenerys Targaryen, who was born during a terrible storm'},
|
||||
# ...
|
||||
```
|
||||
|
||||
|
||||
```python
|
||||
# The document the first answer was extracted from
|
||||
original_document = [doc for doc in prediction["documents"] if doc.id == prediction["answers"][0].document_id][0]
|
||||
pprint(original_document)
|
||||
|
||||
# Sample output
|
||||
#
|
||||
# <Document: {
|
||||
# 'content': '"'''Stormborn'''" is the second episode of the seventh season of HBO's fantasy television
|
||||
# series ''Game of Thrones'', and the 62nd overall. The episode was written by Bryan Cogman,
|
||||
# and directed by Mark Mylod. The title of the episode refers to both Daenerys Targaryen,
|
||||
# who was born during a terrible storm, and Euron Greyjoy, who declares himself to be "the storm".',
|
||||
# 'content_type': 'audio',
|
||||
# 'score': 0.6269117688771539,
|
||||
# 'embedding': None,
|
||||
# 'id': '9352f650b36f93ab99684fd4746af5c1'
|
||||
# 'meta': {
|
||||
# 'content_audio': '/home/sara/work/haystack/generated_audio_documents/2c9223d47801b0918f2db2ad778c3d5a.wav',
|
||||
# 'type': 'generative',
|
||||
# '_split_id': 19,
|
||||
# 'audio_format': 'wav',
|
||||
# 'sample_rate': 22050,
|
||||
# 'name': '2_Stormborn.txt'}
|
||||
# }>
|
||||
```
|
||||
|
||||
### Hear them out!
|
||||
|
||||
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@ -41,7 +41,7 @@ def tutorial17_audio_features():
|
||||
|
||||
# Note: In this example, we're going to use only one text file from the wiki, as the DocumentToSpeech node is relatively slow
|
||||
# on CPU machines. Comment out this line to use all documents from the dataset if your machine is powerful enough.
|
||||
file_paths = [p for p in file_paths if "Arya_Stark" in p.name]
|
||||
file_paths = [p for p in file_paths if "Stormborn" in p.name]
|
||||
|
||||
# Prepare some basic metadata for the files
|
||||
files_metadata = [{"name": path.name} for path in file_paths]
|
||||
@ -91,10 +91,10 @@ def tutorial17_audio_features():
|
||||
# Sample output:
|
||||
#
|
||||
# <Document: {
|
||||
# 'content': "\n\n'''Arya Stark''' is a fictional character in American author George R. R. Martin's ''A Song of Ice and Fire'' epic fantasy novel series.
|
||||
# She is a prominent point of view character in the novels with the third most viewpoint chapters, and is the only viewpoint character to have appeared in every published
|
||||
# book of the series. Introduced in 1996's ''A Game of Thrones'', Arya is the third child and younger daughter of Lord Eddard Stark and his wife Lady Catelyn Stark. She is tomboyish,
|
||||
# headstrong, feisty, independent, disdains traditional female pursuits, and is often mistaken for a boy.",
|
||||
# 'content': "'Stormborn' received praise from critics, who considered Euron Greyjoy's raid on Yara's Iron Fleet,
|
||||
# the assembly of Daenerys' allies at Dragonstone, and Arya's reunion with her direwolf Nymeria as
|
||||
# highlights of the episode. In the United States, it achieved a viewership of 9.27 million in its
|
||||
# initial broadcast.",
|
||||
# 'content_type': 'audio',
|
||||
# 'score': None,
|
||||
# 'meta': {
|
||||
@ -104,7 +104,7 @@ def tutorial17_audio_features():
|
||||
# '_split_id': 0,
|
||||
# 'audio_format': 'wav',
|
||||
# 'sample_rate': 22050,
|
||||
# 'name': '43_Arya_Stark.txt'},
|
||||
# 'name': '2_Stormborn.txt'},
|
||||
# 'embedding': None,
|
||||
# 'id': '2733e698301f8f94eb70430b874177fd'
|
||||
# }>
|
||||
@ -129,7 +129,7 @@ def tutorial17_audio_features():
|
||||
audio_pipeline.add_node(answer2speech, name="AnswerToSpeech", inputs=["Reader"])
|
||||
|
||||
prediction = audio_pipeline.run(
|
||||
query="Who is the father of Arya Stark?", params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}}
|
||||
query="Who was born during a storm?", params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}}
|
||||
)
|
||||
|
||||
# Now you can either print the object directly
|
||||
@ -140,7 +140,7 @@ def tutorial17_audio_features():
|
||||
# {
|
||||
# 'answers': [ <SpeechAnswer:
|
||||
# answer_audio=PosixPath('generated_audio_answers/fc704210136643b833515ba628eb4b2a.wav'),
|
||||
# answer="Eddard",
|
||||
# answer="Daenerys Targaryen",
|
||||
# context_audio=PosixPath('generated_audio_answers/8c562ebd7e7f41e1f9208384957df173.wav'),
|
||||
# context='...'
|
||||
# type='extractive', score=0.9919578731060028,
|
||||
@ -148,7 +148,7 @@ def tutorial17_audio_features():
|
||||
# document_id='cc75f739897ecbf8c14657b13dda890e', meta={'name': '43_Arya_Stark.txt'}} >,
|
||||
# <SpeechAnswer:
|
||||
# answer_audio=PosixPath('generated_audio_answers/07d6265486b22356362387c5a098ba7d.wav'),
|
||||
# answer="Ned",
|
||||
# answer="Daenerys",
|
||||
# context_audio=PosixPath('generated_audio_answers/3f1ca228d6c4cfb633e55f89e97de7ac.wav'),
|
||||
# context='...'
|
||||
# type='extractive', score=0.9767240881919861,
|
||||
@ -159,17 +159,17 @@ def tutorial17_audio_features():
|
||||
# 'documents': [ <SpeechDocument:
|
||||
# content_type='text', score=0.8034909798951382, meta={'name': '43_Arya_Stark.txt'}, embedding=None, id=d1f36ec7170e4c46cde65787fe125dfe',
|
||||
# content_audio=PosixPath('generated_audio_documents/07d6265486b22356362387c5a098ba7d.wav'),
|
||||
# content='\n===\'\'A Game of Thrones\'\'===\nSansa Stark begins the novel by being betrothed to Crown ...'>,
|
||||
# content='The title of the episode refers to both Daenerys Targaryen, who was born during a ...'>,
|
||||
# <SpeechDocument:
|
||||
# content_type='text', score=0.8002150354529785, meta={'name': '191_Gendry.txt'}, embedding=None, id='dd4e070a22896afa81748d6510006d2',
|
||||
# content_audio=PosixPath('generated_audio_documents/07d6265486b22356362387c5a098ba7d.wav'),
|
||||
# content='\n===Season 2===\nGendry travels North with Yoren and other Night's Watch recruits, including Arya ...'>,
|
||||
# content='"Stormborn" received praise from critics, who considered Euron Greyjoy's raid on ...'>,
|
||||
# ...
|
||||
# ],
|
||||
# 'no_ans_gap': 11.688868522644043,
|
||||
# 'node_id': 'Reader',
|
||||
# 'params': {'Reader': {'top_k': 5}, 'Retriever': {'top_k': 5}},
|
||||
# 'query': 'Who is the father of Arya Stark?',
|
||||
# 'query': 'Who was born during a storm?',
|
||||
# 'root_node': 'Query'
|
||||
# }
|
||||
|
||||
@ -180,36 +180,29 @@ def tutorial17_audio_features():
|
||||
|
||||
# Sample output:
|
||||
#
|
||||
# Query: Who is the father of Arya Stark?
|
||||
# Query: Who was born during a storm?
|
||||
# Answers:
|
||||
# [ { 'answer_audio': PosixPath('generated_audio_answers/07d6265486b22356362387c5a098ba7d.wav'),
|
||||
# 'answer': 'Eddard',
|
||||
# 'answer': 'Daenerys Targaryen',
|
||||
# 'context_audio': PosixPath('generated_audio_answers/3f1ca228d6c4cfb633e55f89e97de7ac.wav'),
|
||||
# 'context': ' role of Arya Stark in the television series. '
|
||||
# 'Arya accompanies her father Eddard and her sister '
|
||||
# 'Sansa to King's Landing. Before their departure, Arya's h'},
|
||||
# 'context': ' refers to both Daenerys Targaryen, who was born during a terrible storm, and '},
|
||||
# { 'answer_audio': PosixPath('generated_audio_answers/83c3a02141cac4caffe0718cfd6c405c.wav'),
|
||||
# 'answer': 'Lord Eddard Stark',
|
||||
# 'answer': 'Daenerys',
|
||||
# 'context_audio': PosixPath('generated_audio_answers/8c562ebd7e7f41e1f9208384957df173.wav'),
|
||||
# 'context': 'ark daughters. During the Tourney of the Hand '
|
||||
# 'to honour her father Lord Eddard Stark, Sansa '
|
||||
# 'Stark is enchanted by the knights performing in '
|
||||
# 'the event.'},
|
||||
# 'context': 'The title of the episode refers to both Daenerys Targaryen, who was born during a terrible storm'},
|
||||
# ...
|
||||
# The document the first answer was extracted from
|
||||
|
||||
# The document the first answer was extracted from
|
||||
original_document = [doc for doc in prediction["documents"] if doc.id == prediction["answers"][0].document_id][0]
|
||||
pprint(original_document)
|
||||
|
||||
# Sample output
|
||||
#
|
||||
# <Document: {
|
||||
# 'content': '== Storylines ==\n=== Novels ===\n==== \'\'A Game of Thrones\'\' ====\nCoat of arms of House Stark\n\n
|
||||
# Arya adopts a direwolf cub, which she names Nymeria after a legendary warrior queen. She travels with
|
||||
# her father, Eddard, to King\'s Landing when he is made Hand of the King. Before she leaves, her
|
||||
# half-brother Jon Snow has a smallsword made for her as a parting gift, which she names "Needle" after
|
||||
# her least favorite ladylike activity. While taking a walk together, Prince Joffrey and her sister Sansa
|
||||
# happen upon Arya and her friend, the low-born butcher apprentice Mycah, sparring in the woods with broomsticks.',
|
||||
# 'content': '"'''Stormborn'''" is the second episode of the seventh season of HBO's fantasy television
|
||||
# series ''Game of Thrones'', and the 62nd overall. The episode was written by Bryan Cogman,
|
||||
# and directed by Mark Mylod. The title of the episode refers to both Daenerys Targaryen,
|
||||
# who was born during a terrible storm, and Euron Greyjoy, who declares himself to be "the storm".',
|
||||
# 'content_type': 'audio',
|
||||
# 'score': 0.6269117688771539,
|
||||
# 'embedding': None,
|
||||
@ -220,13 +213,14 @@ def tutorial17_audio_features():
|
||||
# '_split_id': 19,
|
||||
# 'audio_format': 'wav',
|
||||
# 'sample_rate': 22050,
|
||||
# 'name': '43_Arya_Stark.txt'}
|
||||
# 'name': '2_Stormborn.txt'}
|
||||
# }>
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
tutorial17_audio_features()
|
||||
|
||||
|
||||
# This Haystack script was made with love by deepset in Berlin, Germany
|
||||
# Haystack: https://github.com/deepset-ai/haystack
|
||||
# deepset: https://deepset.ai/
|
||||
|
||||
@ -1,10 +1,20 @@
|
||||
from haystack.document_stores import ElasticsearchDocumentStore
|
||||
from haystack.nodes import BM25Retriever, DensePassageRetriever, EmbeddingRetriever, FARMReader, PreProcessor
|
||||
import logging
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
from haystack.document_stores import ElasticsearchDocumentStore, InMemoryDocumentStore
|
||||
from haystack.pipelines import Pipeline, ExtractiveQAPipeline, DocumentSearchPipeline
|
||||
from haystack.nodes import (
|
||||
BM25Retriever,
|
||||
DensePassageRetriever,
|
||||
EmbeddingRetriever,
|
||||
FARMReader,
|
||||
PreProcessor,
|
||||
TextConverter,
|
||||
)
|
||||
from haystack.utils import fetch_archive_from_http, launch_es
|
||||
from haystack.pipelines import ExtractiveQAPipeline, DocumentSearchPipeline
|
||||
from haystack.schema import Answer, Document, EvaluationResult, Label, MultiLabel, Span
|
||||
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@ -62,7 +72,6 @@ def tutorial5_evaluation():
|
||||
)
|
||||
|
||||
# Initialize Retriever
|
||||
from haystack.nodes import BM25Retriever
|
||||
|
||||
retriever = BM25Retriever(document_store=document_store)
|
||||
|
||||
@ -277,11 +286,6 @@ def tutorial5_evaluation():
|
||||
# ### Preprocessing the dataset
|
||||
# Preprocessing the dataset works a bit differently than before. Instead of directly generating documents (and labels) out of a SQuAD file, we first save them to disk. This is necessary to experiment with different indexing pipelines.
|
||||
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from haystack.nodes import PreProcessor
|
||||
from haystack.document_stores import InMemoryDocumentStore
|
||||
|
||||
document_store = InMemoryDocumentStore()
|
||||
|
||||
label_preprocessor = PreProcessor(
|
||||
@ -323,10 +327,6 @@ def tutorial5_evaluation():
|
||||
# In this experiment we evaluate extractive QA pipelines with two different retrievers on the evaluation set given the corpus:
|
||||
# **ElasticsearchRetriever vs. EmbeddingRetriever**
|
||||
|
||||
from haystack.nodes import BM25Retriever, EmbeddingRetriever, FARMReader, TextConverter
|
||||
from haystack.pipelines import Pipeline
|
||||
from haystack.document_stores import ElasticsearchDocumentStore
|
||||
|
||||
# helper function to create query and index pipeline
|
||||
def create_pipelines(document_store, preprocessor, retriever, reader):
|
||||
query_pipeline = Pipeline()
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user