feat: Load documents from remote - helper function (#4545)

* first draft of the load documents from remote function
* resolving comments
* pylint fixes
* pylint fixes
* fixed import
* fixed black
* fixing returned instance
* pythonic list comprehension
* Addressed comments

---------

Co-authored-by: Mayank Jobanputra <mayankjobanputra@gmail.com>
2026-01-06 12:07:04 +00:00 · 2023-04-06 10:19:35 +02:00 · 2023-04-06 10:19:35 +02:00 · 1a37caad79
commit 1a37caad79
parent 52fb935936
2 changed files with 43 additions and 2 deletions
--- a/haystack/errors.py
+++ b/haystack/errors.py
@ -62,6 +62,17 @@ class PipelineError(HaystackError):
        super().__init__(message=message, docs_link=docs_link)


+class DatasetsError(HaystackError):
+    """
+    Exception for issues raised within a dataset.
+
+    :param message: The error message, passed through unchanged to `HaystackError`.
+    :param docs_link: Link to the related documentation, passed through unchanged to `HaystackError`.
+        Defaults to the `#document` section of the Documents, Answers and Labels docs page.
+    """
+
+    def __init__(
+        self,
+        message: Optional[str] = None,
+        docs_link: Optional[str] = "https://docs.haystack.deepset.ai/docs/documents_answers_labels#document",
+    ):
+        super().__init__(message=message, docs_link=docs_link)
+
+
 class PipelineSchemaError(PipelineError):
    """Exception for issues arising when reading/building the JSON schema of pipelines"""

--- a/haystack/utils/import_utils.py
+++ b/haystack/utils/import_utils.py
@ -1,5 +1,3 @@
-from typing import Optional, Dict, Union, Tuple
-
 import io
 import gzip
 import tarfile
@ -8,8 +6,11 @@ import logging
 import importlib
 import importlib.util
 from pathlib import Path
+from typing import Optional, Dict, Union, Tuple, List

 import requests
+from haystack.errors import DatasetsError
+from haystack.schema import Document


 logger = logging.getLogger(__name__)
@ -62,6 +63,35 @@ def _optional_component_not_installed(component: str, dep_group: str, source_err
    ) from source_error


+def load_documents_from_hf_datasets(dataset_name: str, split: Optional[str] = "train") -> List[Document]:
+    """
+    Load a list of Haystack Documents from a remote Hugging Face dataset.
+
+    The dataset must expose a `content` column. If the dataset declares its features in its metadata,
+    this is verified before any data is loaded. The loaded data is checked as well, which covers
+    datasets that do not declare their features up front.
+
+    :param dataset_name: A Hugging Face dataset containing Haystack Documents
+    :param split: The split of the Hugging Face dataset to load from. By default, this is set to "train".
+        If set to None, the Documents of all available splits are returned.
+    :return: a List of Haystack Documents
+    :raises ImportError: If the `datasets` library is not installed.
+    :raises DatasetsError: If the dataset does not contain a `content` field.
+    """
+    try:
+        from datasets import load_dataset, load_dataset_builder
+    except ImportError as import_error:
+        # Chain the original error so the root cause of the failed import stays visible.
+        raise ImportError(
+            "Failed to import `datasets`, Run 'pip install datasets>=2.6.0' "
+            "to install the datasets library to use this function."
+        ) from import_error
+
+    missing_content_message = (
+        f"{dataset_name} does not contain a `content` field which is required by Haystack to "
+        f"create `Document` objects."
+    )
+
+    # The builder exposes the dataset's metadata without loading the data itself.
+    # `features` is None when the dataset does not declare them, so only fail early if they are known.
+    declared_features = load_dataset_builder(dataset_name).info.features
+    if declared_features is not None and "content" not in declared_features:
+        raise DatasetsError(missing_content_message)
+
+    loaded = load_dataset(dataset_name, split=split)
+    # With `split=None`, `load_dataset` returns a dict-like mapping of split name -> dataset
+    # instead of a single dataset, so normalize both shapes to a list of datasets.
+    split_datasets = list(loaded.values()) if isinstance(loaded, dict) else [loaded]
+
+    if any("content" not in split_dataset.column_names for split_dataset in split_datasets):
+        raise DatasetsError(missing_content_message)
+
+    documents: List[Document] = []
+    for split_dataset in split_datasets:
+        documents.extend(Document.from_dict(record) for record in split_dataset)
+
+    return documents
+
+
 def fetch_archive_from_http(
    url: str,
    output_dir: str,