chore: Change trafilatura dependency to use lazy import (#7809)

* Change trafilatura dependency to use lazy import

* Add release notes
This commit is contained in:
Silvano Cerza 2024-06-05 18:04:24 +02:00 committed by GitHub
parent d815c78198
commit 23011c215e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 13 additions and 3 deletions

View File

@ -6,14 +6,16 @@ import warnings
from pathlib import Path
from typing import Any, Dict, List, Optional, Union
from trafilatura import extract
from haystack import Document, component, default_from_dict, default_to_dict, logging
from haystack.components.converters.utils import get_bytestream_from_source, normalize_metadata
from haystack.dataclasses import ByteStream
from haystack.lazy_imports import LazyImport
logger = logging.getLogger(__name__)
with LazyImport("Run 'pip install trafilatura'") as trafilatura_import:
from trafilatura import extract
@component
class HTMLToDocument:
@ -49,6 +51,7 @@ class HTMLToDocument:
are passed to the underlying Trafilatura `extract` function. For the full list of available arguments, see
the [Trafilatura documentation](https://trafilatura.readthedocs.io/en/latest/corefunctions.html#extract).
"""
trafilatura_import.check()
if extractor_type is not None:
warnings.warn(
"The `extractor_type` parameter is ignored and will be removed in Haystack 2.4.0. "

View File

@ -57,7 +57,6 @@ dependencies = [
"more-itertools", # TextDocumentSplitter
"networkx", # Pipeline graphs
"typing_extensions>=4.7", # typing support for Python 3.8
"trafilatura", # Fulltext extraction from HTML pages
"requests",
"numpy",
"python-dateutil",
@ -117,6 +116,7 @@ extra-dependencies = [
"langdetect", # TextLanguageRouter and DocumentLanguageClassifier
"sentence-transformers>=2.2.0", # SentenceTransformersTextEmbedder and SentenceTransformersDocumentEmbedder
"openai-whisper>=20231106", # LocalWhisperTranscriber
"trafilatura", # Fulltext extraction from HTML pages
# OpenAPI
"jsonref", # OpenAPIServiceConnector, OpenAPIServiceToFunctions

View File

@ -0,0 +1,7 @@
---
upgrade:
- |
`trafilatura` must now be installed manually with `pip install trafilatura` to use the `HTMLToDocument` component.
enhancements:
- |
Remove `trafilatura` as a direct dependency and make it a lazily imported one.