mirror of
https://github.com/deepset-ai/haystack.git
synced 2026-01-06 12:07:04 +00:00
feat: Load documents from remote - helper function (#4545)
* first draft of the load documents from remote function
* resolving comments
* pylint fixes
* pylint fixes
* fixed import
* fixed black
* fixing returned instance
* pythonic list comprehension
* Addressed comments

---------

Co-authored-by: Mayank Jobanputra <mayankjobanputra@gmail.com>
parent 52fb935936
commit 1a37caad79
@@ -62,6 +62,17 @@ class PipelineError(HaystackError):
         super().__init__(message=message, docs_link=docs_link)
 
 
+class DatasetsError(HaystackError):
+    """Exception for issues raised within a dataset"""
+
+    def __init__(
+        self,
+        message: Optional[str] = None,
+        docs_link: Optional[str] = "https://docs.haystack.deepset.ai/docs/documents_answers_labels#document",
+    ):
+        super().__init__(message=message, docs_link=docs_link)
+
+
 class PipelineSchemaError(PipelineError):
     """Exception for issues arising when reading/building the JSON schema of pipelines"""
 
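For orientation (not part of the diff): a minimal sketch of how calling code might catch the new exception. The dataset name is hypothetical, and the import path assumes the helper is re-exported from haystack.utils, which this diff does not show; adjust to wherever it is actually exposed.

from haystack.errors import DatasetsError
from haystack.utils import load_documents_from_hf_datasets  # assumed re-export, not shown in this diff

try:
    # "my-org/my-dataset" is a hypothetical Hugging Face dataset name.
    docs = load_documents_from_hf_datasets("my-org/my-dataset")
except DatasetsError as err:
    # Raised when the dataset schema lacks the `content` field Haystack requires.
    print(f"Cannot build Documents from this dataset: {err}")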
@@ -1,5 +1,3 @@
-from typing import Optional, Dict, Union, Tuple
-
 import io
 import gzip
 import tarfile
@@ -8,8 +6,11 @@ import logging
 import importlib
 import importlib.util
 from pathlib import Path
+from typing import Optional, Dict, Union, Tuple, List
 
 import requests
+from haystack.errors import DatasetsError
+from haystack.schema import Document
 
 
 logger = logging.getLogger(__name__)
@@ -62,6 +63,35 @@ def _optional_component_not_installed(component: str, dep_group: str, source_err
         ) from source_error
 
 
+def load_documents_from_hf_datasets(dataset_name: str, split: Optional[str] = "train") -> List[Document]:
+    """
+    Load a list of Haystack Documents from a remote Hugging Face dataset.
+
+    :param dataset_name: A Hugging Face dataset containing Haystack Documents
+    :param split: The split of the Hugging Face dataset to load from. By default, this is set to "train".
+    :return: a List of Haystack Documents
+    """
+    try:
+        from datasets import load_dataset, load_dataset_builder
+    except ImportError:
+        raise ImportError(
+            "Failed to import `datasets`. Run 'pip install datasets>=2.6.0' "
+            "to install the datasets library to use this function."
+        )
+
+    dataset = load_dataset_builder(dataset_name)
+    if "content" not in dataset.info.features.keys():
+        raise DatasetsError(
+            f"{dataset_name} does not contain a `content` field which is required by Haystack to "
+            f"create `Document` objects."
+        )
+
+    remote_dataset = load_dataset(dataset_name, split=split)
+    documents = [Document.from_dict(document) for document in remote_dataset]
+
+    return documents
+
+
 def fetch_archive_from_http(
     url: str,
     output_dir: str,
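Putting the pieces together, a usage sketch under the same assumptions as above (hypothetical dataset name, helper re-exported from haystack.utils, datasets>=2.6.0 installed). Each dataset row must provide at least a `content` field so that Document.from_dict() can consume it:

from haystack.document_stores import InMemoryDocumentStore
from haystack.utils import load_documents_from_hf_datasets  # assumed re-export, not shown in this diff

# Hypothetical dataset; rows should look like serialized Documents, e.g.
# {"content": "Berlin is the capital of Germany.", "meta": {"source": "wiki"}}
documents = load_documents_from_hf_datasets("my-org/my-docs", split="train")

store = InMemoryDocumentStore()
store.write_documents(documents)
print(f"Indexed {store.get_document_count()} documents")

Note the design choice in the helper: the schema check goes through load_dataset_builder, which fetches only dataset metadata, so an incompatible dataset fails fast before the full split is downloaded by load_dataset.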