chore: add boilerpy3 to the core dependencies (#6544)

* add boilerpy3 to the core dependencies

* remove boilerpy3 installation from test workflow

* fix pylint: import order and unused import

* fix import order

* add release note

---------

Co-authored-by: Julian Risch <julian.risch@deepset.ai>
This commit is contained in:
Massimiliano Pippi 2023-12-14 11:53:38 +01:00 committed by GitHub
parent 0ec2801c72
commit bc45170f4e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 10 additions and 14 deletions

View File

@@ -98,7 +98,7 @@ jobs:
python-version: ${{ env.PYTHON_VERSION }}
- name: Install Haystack
run: pip install .[dev,audio] langdetect transformers[torch,sentencepiece]==4.35.2 'sentence-transformers>=2.2.0' pypdf markdown-it-py mdit_plain tika 'azure-ai-formrecognizer>=3.2.0b2' boilerpy3
run: pip install .[dev,audio] langdetect transformers[torch,sentencepiece]==4.35.2 'sentence-transformers>=2.2.0' pypdf markdown-it-py mdit_plain tika 'azure-ai-formrecognizer>=3.2.0b2'
- name: Run
run: pytest -m "not integration" test
@@ -156,7 +156,7 @@ jobs:
sudo apt install ffmpeg # for local Whisper tests
- name: Install Haystack
run: pip install .[dev,audio] langdetect transformers[torch,sentencepiece]==4.35.2 'sentence-transformers>=2.2.0' pypdf markdown-it-py mdit_plain tika 'azure-ai-formrecognizer>=3.2.0b2' boilerpy3
run: pip install .[dev,audio] langdetect transformers[torch,sentencepiece]==4.35.2 'sentence-transformers>=2.2.0' pypdf markdown-it-py mdit_plain tika 'azure-ai-formrecognizer>=3.2.0b2'
- name: Run
run: pytest --maxfail=5 -m "integration" test
@@ -212,7 +212,7 @@ jobs:
colima start
- name: Install Haystack
run: pip install .[dev,audio] langdetect transformers[torch,sentencepiece]==4.35.2 'sentence-transformers>=2.2.0' pypdf markdown-it-py mdit_plain tika 'azure-ai-formrecognizer>=3.2.0b2' boilerpy3
run: pip install .[dev,audio] langdetect transformers[torch,sentencepiece]==4.35.2 'sentence-transformers>=2.2.0' pypdf markdown-it-py mdit_plain tika 'azure-ai-formrecognizer>=3.2.0b2'
- name: Run Tika
run: docker run -d -p 9998:9998 apache/tika:2.9.0.0
@@ -263,7 +263,7 @@ jobs:
python-version: ${{ env.PYTHON_VERSION }}
- name: Install Haystack
run: pip install .[dev,audio] langdetect transformers[torch,sentencepiece]==4.35.2 'sentence-transformers>=2.2.0' pypdf markdown-it-py mdit_plain tika 'azure-ai-formrecognizer>=3.2.0b2' boilerpy3
run: pip install .[dev,audio] langdetect transformers[torch,sentencepiece]==4.35.2 'sentence-transformers>=2.2.0' pypdf markdown-it-py mdit_plain tika 'azure-ai-formrecognizer>=3.2.0b2'
- name: Run
run: pytest --maxfail=5 -m "integration" test -k 'not tika'

View File

@@ -1,16 +1,13 @@
import logging
from pathlib import Path
from typing import Any, Dict, List, Optional, Union
from boilerpy3 import extractors
from haystack import Document, component
from haystack.dataclasses import ByteStream
from haystack.lazy_imports import LazyImport
logger = logging.getLogger(__name__)
with LazyImport("Run 'pip install boilerpy3'") as boilerpy3_import:
from boilerpy3 import extractors
@component
class HTMLToDocument:
@@ -30,12 +27,6 @@ class HTMLToDocument:
"""
def __init__(self):
"""
Initializes the HTMLToDocument component.
"""
boilerpy3_import.check()
@component.output_types(documents=List[Document])
def run(self, sources: List[Union[str, Path, ByteStream]], meta: Optional[List[Dict[str, Any]]] = None):
"""

View File

@@ -58,6 +58,7 @@ dependencies = [
"more-itertools", # TextDocumentSplitter
"networkx", # Pipeline graphs
"typing_extensions", # typing support for Python 3.8
"boilerpy3", # Fulltext extraction from HTML pages
]
[project.optional-dependencies]

View File

@@ -0,0 +1,4 @@
---
enhancements:
- |
Include 'boilerpy3' in the 'haystack-ai' dependencies.