diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 1fa0b2ebb..165f3bb20 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -98,7 +98,7 @@ jobs: python-version: ${{ env.PYTHON_VERSION }} - name: Install Haystack - run: pip install .[dev,audio] langdetect transformers[torch,sentencepiece]==4.35.2 'sentence-transformers>=2.2.0' pypdf markdown-it-py mdit_plain tika 'azure-ai-formrecognizer>=3.2.0b2' boilerpy3 + run: pip install .[dev,audio] langdetect transformers[torch,sentencepiece]==4.35.2 'sentence-transformers>=2.2.0' pypdf markdown-it-py mdit_plain tika 'azure-ai-formrecognizer>=3.2.0b2' - name: Run run: pytest -m "not integration" test @@ -156,7 +156,7 @@ jobs: sudo apt install ffmpeg # for local Whisper tests - name: Install Haystack - run: pip install .[dev,audio] langdetect transformers[torch,sentencepiece]==4.35.2 'sentence-transformers>=2.2.0' pypdf markdown-it-py mdit_plain tika 'azure-ai-formrecognizer>=3.2.0b2' boilerpy3 + run: pip install .[dev,audio] langdetect transformers[torch,sentencepiece]==4.35.2 'sentence-transformers>=2.2.0' pypdf markdown-it-py mdit_plain tika 'azure-ai-formrecognizer>=3.2.0b2' - name: Run run: pytest --maxfail=5 -m "integration" test @@ -212,7 +212,7 @@ jobs: colima start - name: Install Haystack - run: pip install .[dev,audio] langdetect transformers[torch,sentencepiece]==4.35.2 'sentence-transformers>=2.2.0' pypdf markdown-it-py mdit_plain tika 'azure-ai-formrecognizer>=3.2.0b2' boilerpy3 + run: pip install .[dev,audio] langdetect transformers[torch,sentencepiece]==4.35.2 'sentence-transformers>=2.2.0' pypdf markdown-it-py mdit_plain tika 'azure-ai-formrecognizer>=3.2.0b2' - name: Run Tika run: docker run -d -p 9998:9998 apache/tika:2.9.0.0 @@ -263,7 +263,7 @@ jobs: python-version: ${{ env.PYTHON_VERSION }} - name: Install Haystack - run: pip install .[dev,audio] langdetect transformers[torch,sentencepiece]==4.35.2 'sentence-transformers>=2.2.0' pypdf markdown-it-py mdit_plain tika 'azure-ai-formrecognizer>=3.2.0b2' boilerpy3 + run: pip install .[dev,audio] langdetect transformers[torch,sentencepiece]==4.35.2 'sentence-transformers>=2.2.0' pypdf markdown-it-py mdit_plain tika 'azure-ai-formrecognizer>=3.2.0b2' - name: Run run: pytest --maxfail=5 -m "integration" test -k 'not tika' diff --git a/haystack/components/converters/html.py b/haystack/components/converters/html.py index e3219aa9f..f5924cc36 100644 --- a/haystack/components/converters/html.py +++ b/haystack/components/converters/html.py @@ -1,16 +1,13 @@ import logging from pathlib import Path from typing import Any, Dict, List, Optional, Union +from boilerpy3 import extractors from haystack import Document, component from haystack.dataclasses import ByteStream -from haystack.lazy_imports import LazyImport logger = logging.getLogger(__name__) -with LazyImport("Run 'pip install boilerpy3'") as boilerpy3_import: - from boilerpy3 import extractors - @component class HTMLToDocument: @@ -30,12 +27,6 @@ class HTMLToDocument: """ - def __init__(self): - """ - Initializes the HTMLToDocument component. - """ - boilerpy3_import.check() - @component.output_types(documents=List[Document]) def run(self, sources: List[Union[str, Path, ByteStream]], meta: Optional[List[Dict[str, Any]]] = None): """ diff --git a/pyproject.toml b/pyproject.toml index c15a9c3d7..b33455163 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -58,6 +58,7 @@ dependencies = [ "more-itertools", # TextDocumentSplitter "networkx", # Pipeline graphs "typing_extensions", # typing support for Python 3.8 + "boilerpy3", # Fulltext extraction from HTML pages ] [project.optional-dependencies] diff --git a/releasenotes/notes/ship-boilerpy3-0bffbd7955c89dd4.yaml b/releasenotes/notes/ship-boilerpy3-0bffbd7955c89dd4.yaml new file mode 100644 index 000000000..b9349165c --- /dev/null +++ b/releasenotes/notes/ship-boilerpy3-0bffbd7955c89dd4.yaml @@ -0,0 +1,4 @@ +--- +enhancements: + - | + Include 'boilerpy3' in the 'haystack-ai' dependencies.