mirror of
https://github.com/deepset-ai/haystack.git
synced 2026-01-08 13:06:29 +00:00
Fix tutorial dataset paths (#2340)
* fix tutorial 4 dataset path
* fix tutorial 8 dataset path
* fix tutorial 10 event
* Update Documentation & Code Style
* fix send event for tutorial 15
* Update Documentation & Code Style

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
This commit is contained in:
parent
ba1a85593b
commit
7ffeccece6
@ -26,7 +26,7 @@ This tutorial will show you how to integrate a classification model into your pr
|
||||
|
||||
# Install the latest master of Haystack
|
||||
!pip install --upgrade pip
|
||||
!pip install git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[colab]
|
||||
!pip install git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[colab,ocr]
|
||||
|
||||
!wget --no-check-certificate https://dl.xpdfreader.com/xpdf-tools-linux-4.03.tar.gz
|
||||
!tar -xvf xpdf-tools-linux-4.03.tar.gz && sudo cp xpdf-tools-linux-4.03/bin64/pdftotext /usr/local/bin
|
||||
|
||||
@ -131,7 +131,7 @@ s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/docu
|
||||
fetch_archive_from_http(url=s3_url, output_dir=doc_dir)
|
||||
|
||||
# Get dataframe with columns "question", "answer" and some custom metadata
|
||||
df = pd.read_csv("small_faq_covid.csv")
|
||||
df = pd.read_csv(f"{doc_dir}/small_faq_covid.csv")
|
||||
# Minimal cleaning
|
||||
df.fillna(value="", inplace=True)
|
||||
df["question"] = df["question"].apply(lambda x: x.strip())
|
||||
|
||||
@ -33,6 +33,8 @@ This tutorial will show you all the tools that Haystack provides to help you cas
|
||||
#! pip install farm-haystack
|
||||
|
||||
# Install the latest master of Haystack
|
||||
from pathlib import Path
|
||||
|
||||
!pip install --upgrade pip
|
||||
!pip install git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[colab,ocr]
|
||||
|
||||
@ -69,20 +71,20 @@ For converting PDFs, try changing the encoding to UTF-8 if the conversion isn't
|
||||
# Here are some examples of how you would use file converters
|
||||
|
||||
converter = TextConverter(remove_numeric_tables=True, valid_languages=["en"])
|
||||
doc_txt = converter.convert(file_path="data/preprocessing_tutorial/classics.txt", meta=None)[0]
|
||||
doc_txt = converter.convert(file_path=Path(f"{doc_dir}/classics.txt"), meta=None)[0]
|
||||
|
||||
converter = PDFToTextConverter(remove_numeric_tables=True, valid_languages=["en"])
|
||||
doc_pdf = converter.convert(file_path="data/preprocessing_tutorial/bert.pdf", meta=None)[0]
|
||||
doc_pdf = converter.convert(file_path=Path(f"{doc_dir}/bert.pdf"), meta=None)[0]
|
||||
|
||||
converter = DocxToTextConverter(remove_numeric_tables=False, valid_languages=["en"])
|
||||
doc_docx = converter.convert(file_path="data/preprocessing_tutorial/heavy_metal.docx", meta=None)[0]
|
||||
doc_docx = converter.convert(file_path=Path(f"{doc_dir}/heavy_metal.docx"), meta=None)[0]
|
||||
```
|
||||
|
||||
|
||||
```python
|
||||
# Haystack also has a convenience function that will automatically apply the right converter to each file in a directory.
|
||||
|
||||
all_docs = convert_files_to_dicts(dir_path="data/preprocessing_tutorial")
|
||||
all_docs = convert_files_to_dicts(dir_path=doc_dir)
|
||||
```
|
||||
|
||||
## PreProcessor
|
||||
|
||||
@ -190,7 +190,7 @@ def send_tutorial_event(url: str):
|
||||
"https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt1.zip": "1",
|
||||
"https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/squad_small.json.zip": "2",
|
||||
"https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt3.zip": "3",
|
||||
"https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/faq_covidbert.csv.zip": "4",
|
||||
"https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/small_faq_covid.csv.zip": "4",
|
||||
"https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/nq_dev_subset_v2.json.zip": "5",
|
||||
"https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt6.zip": "6",
|
||||
"https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/small_generator_dataset.csv.zip": "7",
|
||||
@ -202,7 +202,7 @@ def send_tutorial_event(url: str):
|
||||
"https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt12.zip": "12",
|
||||
# Tutorial 13: no dataset available yet
|
||||
"https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt14.zip": "14",
|
||||
"https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/ottqa_tables_sample.json.zip": "15",
|
||||
"https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/ottqa_sample.zip": "15",
|
||||
# "https://nlp.stanford.edu/data/glove.6B.zip": "16",
|
||||
"https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/preprocessing_tutorial16.zip": "16",
|
||||
}
|
||||
|
||||
@ -69,7 +69,7 @@ def fetch_archive_from_http(url: str, output_dir: str, proxies: Optional[dict] =
|
||||
if not path.exists():
|
||||
path.mkdir(parents=True)
|
||||
|
||||
if "deepset.ai-farm-qa/datasets" in url or "dl.fbaipublicfiles.com" in url:
|
||||
if "deepset.ai-farm-qa/datasets" in url or "dl.fbaipublicfiles.com" in url or "fandom-qa.s3" in url:
|
||||
send_tutorial_event(url=url)
|
||||
|
||||
is_not_empty = len(list(Path(path).rglob("*"))) > 0
|
||||
|
||||
@ -46,7 +46,7 @@
|
||||
"\n",
|
||||
"# Install the latest master of Haystack\n",
|
||||
"!pip install --upgrade pip\n",
|
||||
"!pip install git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[colab]\n",
|
||||
"!pip install git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[colab,ocr]\n",
|
||||
"\n",
|
||||
"!wget --no-check-certificate https://dl.xpdfreader.com/xpdf-tools-linux-4.03.tar.gz\n",
|
||||
"!tar -xvf xpdf-tools-linux-4.03.tar.gz && sudo cp xpdf-tools-linux-4.03/bin64/pdftotext /usr/local/bin\n",
|
||||
|
||||
@ -225,7 +225,7 @@
|
||||
"fetch_archive_from_http(url=s3_url, output_dir=doc_dir)\n",
|
||||
"\n",
|
||||
"# Get dataframe with columns \"question\", \"answer\" and some custom metadata\n",
|
||||
"df = pd.read_csv(\"small_faq_covid.csv\")\n",
|
||||
"df = pd.read_csv(f\"{doc_dir}/small_faq_covid.csv\")\n",
|
||||
"# Minimal cleaning\n",
|
||||
"df.fillna(value=\"\", inplace=True)\n",
|
||||
"df[\"question\"] = df[\"question\"].apply(lambda x: x.strip())\n",
|
||||
|
||||
@ -58,7 +58,7 @@ def tutorial4_faq_style_qa():
|
||||
fetch_archive_from_http(url=s3_url, output_dir=doc_dir)
|
||||
|
||||
# Get dataframe with columns "question", "answer" and some custom metadata
|
||||
df = pd.read_csv("small_faq_covid.csv")
|
||||
df = pd.read_csv(f"{doc_dir}/small_faq_covid.csv")
|
||||
# Minimal cleaning
|
||||
df.fillna(value="", inplace=True)
|
||||
df["question"] = df["question"].apply(lambda x: x.strip())
|
||||
|
||||
@ -62,6 +62,8 @@
|
||||
"#! pip install farm-haystack\n",
|
||||
"\n",
|
||||
"# Install the latest master of Haystack\n",
|
||||
"from pathlib import Path\n",
|
||||
"\n",
|
||||
"!pip install --upgrade pip\n",
|
||||
"!pip install git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[colab,ocr]\n",
|
||||
"\n",
|
||||
@ -163,13 +165,13 @@
|
||||
"# Here are some examples of how you would use file converters\n",
|
||||
"\n",
|
||||
"converter = TextConverter(remove_numeric_tables=True, valid_languages=[\"en\"])\n",
|
||||
"doc_txt = converter.convert(file_path=\"data/preprocessing_tutorial/classics.txt\", meta=None)[0]\n",
|
||||
"doc_txt = converter.convert(file_path=Path(f\"{doc_dir}/classics.txt\"), meta=None)[0]\n",
|
||||
"\n",
|
||||
"converter = PDFToTextConverter(remove_numeric_tables=True, valid_languages=[\"en\"])\n",
|
||||
"doc_pdf = converter.convert(file_path=\"data/preprocessing_tutorial/bert.pdf\", meta=None)[0]\n",
|
||||
"doc_pdf = converter.convert(file_path=Path(f\"{doc_dir}/bert.pdf\"), meta=None)[0]\n",
|
||||
"\n",
|
||||
"converter = DocxToTextConverter(remove_numeric_tables=False, valid_languages=[\"en\"])\n",
|
||||
"doc_docx = converter.convert(file_path=\"data/preprocessing_tutorial/heavy_metal.docx\", meta=None)[0]"
|
||||
"doc_docx = converter.convert(file_path=Path(f\"{doc_dir}/heavy_metal.docx\"), meta=None)[0]"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -195,7 +197,7 @@
|
||||
"source": [
|
||||
"# Haystack also has a convenience function that will automatically apply the right converter to each file in a directory.\n",
|
||||
"\n",
|
||||
"all_docs = convert_files_to_dicts(dir_path=\"data/preprocessing_tutorial\")"
|
||||
"all_docs = convert_files_to_dicts(dir_path=doc_dir)"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
||||
@ -18,6 +18,8 @@ This tutorial will show you all the tools that Haystack provides to help you cas
|
||||
"""
|
||||
|
||||
# Here are the imports we need
|
||||
from pathlib import Path
|
||||
|
||||
from haystack.nodes import TextConverter, PDFToTextConverter, DocxToTextConverter, PreProcessor
|
||||
from haystack.utils import convert_files_to_dicts, fetch_archive_from_http
|
||||
|
||||
@ -42,17 +44,17 @@ def tutorial8_preprocessing():
|
||||
# Here are some examples of how you would use file converters
|
||||
|
||||
converter = TextConverter(remove_numeric_tables=True, valid_languages=["en"])
|
||||
doc_txt = converter.convert(file_path="data/preprocessing_tutorial/classics.txt", meta=None)[0]
|
||||
doc_txt = converter.convert(file_path=Path(f"{doc_dir}/classics.txt"), meta=None)[0]
|
||||
|
||||
converter = PDFToTextConverter(remove_numeric_tables=True, valid_languages=["en"])
|
||||
doc_pdf = converter.convert(file_path="data/preprocessing_tutorial/bert.pdf", meta=None)[0]
|
||||
doc_pdf = converter.convert(file_path=Path(f"{doc_dir}/bert.pdf"), meta=None)[0]
|
||||
|
||||
converter = DocxToTextConverter(remove_numeric_tables=False, valid_languages=["en"])
|
||||
doc_docx = converter.convert(file_path="data/preprocessing_tutorial/heavy_metal.docx", meta=None)[0]
|
||||
doc_docx = converter.convert(file_path=Path(f"{doc_dir}/heavy_metal.docx"), meta=None)[0]
|
||||
|
||||
# Haystack also has a convenience function that will automatically apply the right converter to each file in a directory.
|
||||
|
||||
all_docs = convert_files_to_dicts(dir_path="data/preprocessing_tutorial")
|
||||
all_docs = convert_files_to_dicts(dir_path=doc_dir)
|
||||
|
||||
"""
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user