feat: Load documents from remote - helper function (#4545)

* first draft of the load documents from remote function

* resolving comments

* pylint fixes

* pylint fixes

* fixed import

* fixed black

* fixing returned instance

* pythonic list comprehension

* Addressed comments

---------

Co-authored-by: Mayank Jobanputra <mayankjobanputra@gmail.com>
This commit is contained in:
Tuana Çelik 2023-04-06 10:19:35 +02:00 committed by GitHub
parent 52fb935936
commit 1a37caad79
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 43 additions and 2 deletions

View File

@ -62,6 +62,17 @@ class PipelineError(HaystackError):
super().__init__(message=message, docs_link=docs_link)
class DatasetsError(HaystackError):
    """
    Exception for issues raised within a dataset.

    Forwards the message and documentation link to `HaystackError`; when no link is given,
    the default points at the documentation page describing Haystack documents.
    """

    def __init__(
        self,
        message: Optional[str] = None,
        docs_link: Optional[str] = "https://docs.haystack.deepset.ai/docs/documents_answers_labels#document",
    ):
        # Both values are passed by keyword, unchanged, to the base exception.
        super().__init__(message=message, docs_link=docs_link)
class PipelineSchemaError(PipelineError):
"""Exception for issues arising when reading/building the JSON schema of pipelines"""

View File

@ -1,5 +1,3 @@
from typing import Optional, Dict, Union, Tuple
import io
import gzip
import tarfile
@ -8,8 +6,11 @@ import logging
import importlib
import importlib.util
from pathlib import Path
from typing import Optional, Dict, Union, Tuple, List
import requests
from haystack.errors import DatasetsError
from haystack.schema import Document
logger = logging.getLogger(__name__)
@ -62,6 +63,35 @@ def _optional_component_not_installed(component: str, dep_group: str, source_err
) from source_error
def load_documents_from_hf_datasets(dataset_name: str, split: Optional[str] = "train") -> List[Document]:
    """
    Load a list of Haystack Documents from a remote Hugging Face dataset.

    :param dataset_name: A Hugging Face dataset containing Haystack Documents
    :param split: The split of the Hugging Face dataset to load from. By default, this is set to "train".
    :return: a List of Haystack Documents
    :raises ImportError: If the `datasets` library cannot be imported.
    :raises DatasetsError: If the features declared by the dataset do not include a `content` field.
    """
    try:
        # Imported inside the function so that `datasets` is only required when this helper is used.
        from datasets import load_dataset, load_dataset_builder
    except ImportError as import_error:
        # Chain the original failure explicitly so the root cause stays visible in the traceback.
        raise ImportError(
            "Failed to import `datasets`, Run 'pip install datasets>=2.6.0' "
            "to install the datasets library to use this function."
        ) from import_error

    # Check the declared schema through the builder before calling `load_dataset`.
    features = load_dataset_builder(dataset_name).info.features
    # `features` is None when the dataset declares no schema; the up-front check is skipped then,
    # instead of failing with an AttributeError on None.
    if features is not None and "content" not in features:
        raise DatasetsError(
            f"{dataset_name} does not contain a `content` field which is required by Haystack to "
            f"create `Document` objects."
        )

    remote_dataset = load_dataset(dataset_name, split=split)
    return [Document.from_dict(document) for document in remote_dataset]
def fetch_archive_from_http(
url: str,
output_dir: str,