@component
class HTMLToDocument:
    """
    Component that turns HTML files into Documents.

    The text of every input file is extracted with boilerpy3's ``ArticleExtractor`` and wrapped in a Document.
    """

    def __init__(self, id_hash_keys: Optional[List[str]] = None):
        """
        Create an HTMLToDocument component.

        :param id_hash_keys: Names of the Document attributes that are forwarded to every created Document as its
            `id_hash_keys`. An empty list is stored when nothing (or an empty value) is given.
            Default: `None`
        """
        # Fail early, at construction time, when the optional boilerpy3 dependency is not installed.
        boilerpy3_import.check()
        self.id_hash_keys = id_hash_keys if id_hash_keys else []

    def to_dict(self) -> Dict[str, Any]:
        """
        Return the dictionary representation of this component.
        """
        init_parameters = {"id_hash_keys": self.id_hash_keys}
        return default_to_dict(self, **init_parameters)

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "HTMLToDocument":
        """
        Build a component instance from its dictionary representation.
        """
        return default_from_dict(cls, data)

    @component.output_types(documents=List[Document])
    def run(self, paths: List[Union[str, Path]]):
        """
        Convert HTML files to Documents.

        A file that cannot be read, or whose text cannot be extracted, is skipped and a warning is logged.

        :param paths: A list of paths to HTML files.
        :return: A dictionary whose `documents` key holds the list of Documents that were created.
        """
        article_extractor = extractors.ArticleExtractor(raise_on_failure=False)
        converted = []

        for source in paths:
            try:
                raw_html = article_extractor.read_from_file(source)
            except Exception as read_error:
                logger.warning("Could not read file %s. Skipping it. Error message: %s", source, read_error)
                continue

            # raise_on_failure=False does not rule out every exception, so the extraction is guarded as well
            try:
                extracted_text = article_extractor.get_content(raw_html)
            except Exception as extraction_error:
                logger.warning(
                    "Could not extract raw txt from %s. Skipping it. Error message: %s", source, extraction_error
                )
            else:
                converted.append(Document(text=extracted_text, id_hash_keys=self.id_hash_keys))

        return {"documents": converted}
class TestHTMLToDocument:
    """
    Unit tests for the HTMLToDocument converter: serialization round trip and the run() behaviour.
    """

    @pytest.mark.unit
    def test_to_dict(self):
        # Default construction serializes an empty id_hash_keys list.
        expected = {"type": "HTMLToDocument", "init_parameters": {"id_hash_keys": []}}
        assert HTMLToDocument().to_dict() == expected

    @pytest.mark.unit
    def test_to_dict_with_custom_init_parameters(self):
        # Custom id_hash_keys must show up unchanged in the serialized form.
        expected = {"type": "HTMLToDocument", "init_parameters": {"id_hash_keys": ["name"]}}
        serialized = HTMLToDocument(id_hash_keys=["name"]).to_dict()
        assert serialized == expected

    @pytest.mark.unit
    def test_from_dict(self):
        # Deserialization restores the init parameters on the instance.
        restored = HTMLToDocument.from_dict(
            {"type": "HTMLToDocument", "init_parameters": {"id_hash_keys": ["name"]}}
        )
        assert restored.id_hash_keys == ["name"]

    @pytest.mark.unit
    def test_run(self, preview_samples_path):
        """
        A valid HTML file yields exactly one Document containing the page text.
        """
        html_file = preview_samples_path / "html" / "what_is_haystack.html"
        result = HTMLToDocument().run(paths=[html_file])
        produced = result["documents"]
        assert len(produced) == 1
        assert "Haystack" in produced[0].text

    @pytest.mark.unit
    def test_run_wrong_file_type(self, preview_samples_path, caplog):
        """
        A file that is not HTML (here: a WAV file) is skipped and a warning is logged.
        """
        wav_file = preview_samples_path / "audio" / "answer.wav"
        html_converter = HTMLToDocument()
        with caplog.at_level(logging.WARNING):
            result = html_converter.run(paths=[wav_file])
            assert "codec can't decode byte" in caplog.text

        produced = result["documents"]
        assert produced == []

    @pytest.mark.unit
    def test_run_error_handling(self, preview_samples_path, caplog):
        """
        A path that does not exist is skipped and a warning is logged.
        """
        missing = ["non_existing_file.html"]
        html_converter = HTMLToDocument()
        with caplog.at_level(logging.WARNING):
            outcome = html_converter.run(paths=missing)
            assert "Could not read file non_existing_file.html" in caplog.text
            assert outcome["documents"] == []
+ 🎃 We're participating in Hacktoberfest 2023! + + + + + +
+
+ + + +
+ +
+
+
+ + + + + +
+

What is Haystack?

+

Haystack is the open source Python framework by deepset for building custom apps with large language models (LLMs). It lets you quickly try out the latest models in natural language processing (NLP) while being flexible and easy to use. Our inspiring community of users and builders has helped shape Haystack into what it is today: a complete framework for building production-ready NLP apps.

+

Building with Haystack

+

Haystack offers comprehensive tooling for developing state-of-the-art NLP systems that use LLMs (such as GPT-4, Falcon and similar) and Transformer models. With Haystack, you can effortlessly experiment with various models hosted on platforms like Hugging Face, OpenAI, Cohere, or even models deployed on SageMaker and your local models to find the perfect fit for your use case.

+ + + + + + + + + + + + + + + + + + + + + Model Providers + + +

Some examples of what you can build include:

+
    +
  • Semantic search on a large collection of documents in any language
  • +
  • Generative question answering on a knowledge base containing mixed types of information: images, text, and tables.
  • +
  • Natural language chatbots powered by cutting-edge generative models like GPT-4
  • +
  • An LLM-based Haystack Agent capable of resolving complex queries
  • +
  • Information extraction from documents to populate your database or build a knowledge graph
  • +
+

This is just a small subset of the kinds of systems that can be created in Haystack.

+

Functionality for all stages of an NLP project

+

A successful NLP project requires more than just the language models. As an end-to-end framework, Haystack assists you in building your system every step of the way, offering tooling for each stage of the NLP project life cycle:

+ +

But that’s not all: +metadata filtering, +model distillation, or the prompt hub, whatever your NLP heart desires, you’re likely to find it in Haystack. And if not? We’ll build it together.

+ + + + + + + + + + + + + + + + + + + + + + + Rest API + + +

Building blocks

+

Haystack uses a few simple but effective concepts to help you build fully functional and customized end-to-end NLP systems.

+

Components

+

At the core of Haystack are its components—fundamental building blocks that can perform tasks like document retrieval, text generation, or summarization. A single component is already quite powerful. It can manage local language models or communicate with a hosted model through an API.

+

While Haystack offers a bunch of components you can use out of the box, it also lets you create your own custom components. Explore the +collection of integrations that includes custom components developed by our community, which you can freely use.

+

You can chain components together to build pipelines, which are the foundation of the NLP app architecture in Haystack.

+

Pipelines

+

Pipelines are powerful structures made up of components, such as a Retriever and Reader, connected to infrastructure building blocks, such as a DocumentStore (for example, Elasticsearch or Weaviate) to form complex systems.

+

Haystack offers ready-made pipelines for most common tasks, such as question answering, document retrieval, or summarization. But it’s just as easy to design and create a custom pipeline for NLP scenarios that are way more complex than question answering.

+

Agents

+

The Haystack Agent makes use of a large language model to resolve complex tasks. When initializing the Agent, you give it a set of tools, which can be pipeline components or whole pipelines. The Agent can use those tools iteratively to arrive at an answer. When given a query, the Agent determines which tools are useful to answer this query and calls them in a loop until it gets the answer. This way, it can achieve much more than extractive or generative question answering pipelines.

+ + + + + + + + + + + + + + + + + + + + + Agent Tools + + +

Who’s it for?

+

Haystack is for everyone looking to build natural language apps—NLP enthusiasts and newbies alike. You don’t need to understand how the models work under the hood. With Haystack’s modular and flexible components, pipelines, and agents, all you need is some basic knowledge of Python to dive right in.

+

Our community

+

At the heart of Haystack is the vibrant open source community that thrives on the diverse backgrounds and skill sets of its members. We value collaboration greatly and encourage our users to shape Haystack actively through GitHub contributions. Our Discord channel is a space where community members can connect, seek help, and learn from each other.

+

We also organize live online and in-person events, webinars, and office hours, which are an opportunity to learn and grow.

+ + + + + + + + +
+ + + +
+ Join Discord +
+ + + +
+
+ +

Enter the Haystack universe

+ + + + +
+ + + +
+ +
+
+
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +