mirror of
https://github.com/deepset-ai/haystack.git
synced 2026-01-07 12:37:27 +00:00
chore: add boilerpy3 to the core dependencies (#6544)
* add boilerpy3 to the core dependencies
* remove boilerpy3 installation from test workflow
* fix pylint: import order and unused import
* fix import order
* add release note

---------

Co-authored-by: Julian Risch <julian.risch@deepset.ai>
This commit is contained in:
parent
0ec2801c72
commit
bc45170f4e
8
.github/workflows/tests.yml
vendored
8
.github/workflows/tests.yml
vendored
@ -98,7 +98,7 @@ jobs:
|
||||
python-version: ${{ env.PYTHON_VERSION }}
|
||||
|
||||
- name: Install Haystack
|
||||
run: pip install .[dev,audio] langdetect transformers[torch,sentencepiece]==4.35.2 'sentence-transformers>=2.2.0' pypdf markdown-it-py mdit_plain tika 'azure-ai-formrecognizer>=3.2.0b2' boilerpy3
|
||||
run: pip install .[dev,audio] langdetect transformers[torch,sentencepiece]==4.35.2 'sentence-transformers>=2.2.0' pypdf markdown-it-py mdit_plain tika 'azure-ai-formrecognizer>=3.2.0b2'
|
||||
|
||||
- name: Run
|
||||
run: pytest -m "not integration" test
|
||||
@ -156,7 +156,7 @@ jobs:
|
||||
sudo apt install ffmpeg # for local Whisper tests
|
||||
|
||||
- name: Install Haystack
|
||||
run: pip install .[dev,audio] langdetect transformers[torch,sentencepiece]==4.35.2 'sentence-transformers>=2.2.0' pypdf markdown-it-py mdit_plain tika 'azure-ai-formrecognizer>=3.2.0b2' boilerpy3
|
||||
run: pip install .[dev,audio] langdetect transformers[torch,sentencepiece]==4.35.2 'sentence-transformers>=2.2.0' pypdf markdown-it-py mdit_plain tika 'azure-ai-formrecognizer>=3.2.0b2'
|
||||
|
||||
- name: Run
|
||||
run: pytest --maxfail=5 -m "integration" test
|
||||
@ -212,7 +212,7 @@ jobs:
|
||||
colima start
|
||||
|
||||
- name: Install Haystack
|
||||
run: pip install .[dev,audio] langdetect transformers[torch,sentencepiece]==4.35.2 'sentence-transformers>=2.2.0' pypdf markdown-it-py mdit_plain tika 'azure-ai-formrecognizer>=3.2.0b2' boilerpy3
|
||||
run: pip install .[dev,audio] langdetect transformers[torch,sentencepiece]==4.35.2 'sentence-transformers>=2.2.0' pypdf markdown-it-py mdit_plain tika 'azure-ai-formrecognizer>=3.2.0b2'
|
||||
|
||||
- name: Run Tika
|
||||
run: docker run -d -p 9998:9998 apache/tika:2.9.0.0
|
||||
@ -263,7 +263,7 @@ jobs:
|
||||
python-version: ${{ env.PYTHON_VERSION }}
|
||||
|
||||
- name: Install Haystack
|
||||
run: pip install .[dev,audio] langdetect transformers[torch,sentencepiece]==4.35.2 'sentence-transformers>=2.2.0' pypdf markdown-it-py mdit_plain tika 'azure-ai-formrecognizer>=3.2.0b2' boilerpy3
|
||||
run: pip install .[dev,audio] langdetect transformers[torch,sentencepiece]==4.35.2 'sentence-transformers>=2.2.0' pypdf markdown-it-py mdit_plain tika 'azure-ai-formrecognizer>=3.2.0b2'
|
||||
|
||||
- name: Run
|
||||
run: pytest --maxfail=5 -m "integration" test -k 'not tika'
|
||||
|
||||
@ -1,16 +1,13 @@
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional, Union
|
||||
from boilerpy3 import extractors
|
||||
|
||||
from haystack import Document, component
|
||||
from haystack.dataclasses import ByteStream
|
||||
from haystack.lazy_imports import LazyImport
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
with LazyImport("Run 'pip install boilerpy3'") as boilerpy3_import:
|
||||
from boilerpy3 import extractors
|
||||
|
||||
|
||||
@component
|
||||
class HTMLToDocument:
|
||||
@ -30,12 +27,6 @@ class HTMLToDocument:
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
"""
|
||||
Initializes the HTMLToDocument component.
|
||||
"""
|
||||
boilerpy3_import.check()
|
||||
|
||||
@component.output_types(documents=List[Document])
|
||||
def run(self, sources: List[Union[str, Path, ByteStream]], meta: Optional[List[Dict[str, Any]]] = None):
|
||||
"""
|
||||
|
||||
@ -58,6 +58,7 @@ dependencies = [
|
||||
"more-itertools", # TextDocumentSplitter
|
||||
"networkx", # Pipeline graphs
|
||||
"typing_extensions", # typing support for Python 3.8
|
||||
"boilerpy3", # Fulltext extraction from HTML pages
|
||||
]
|
||||
|
||||
[project.optional-dependencies]
|
||||
|
||||
4
releasenotes/notes/ship-boilerpy3-0bffbd7955c89dd4.yaml
Normal file
4
releasenotes/notes/ship-boilerpy3-0bffbd7955c89dd4.yaml
Normal file
@ -0,0 +1,4 @@
|
||||
---
|
||||
enhancements:
|
||||
- |
|
||||
Include 'boilerpy3' in the 'haystack-ai' dependencies.
|
||||
Loading…
x
Reference in New Issue
Block a user