mirror of https://github.com/deepset-ai/haystack.git (synced 2026-01-07 20:46:31 +00:00)
refactor!: extract preprocessing and file conversion deps (#4605)
* isolate file-conversion deps
* pylint
* add to all extra
* chain was missing
* move langdetect into preprocessing and fix tika
* add file-conversion extra
This commit is contained in:
parent 16091f6ad2
commit d8ac30fa47

.github/workflows/tests.yml (vendored): 21 lines changed
@@ -240,7 +240,7 @@ jobs:
           python-version: ${{ env.PYTHON_VERSION }}

       - name: Install Haystack
-        run: pip install .[dev,docstores]
+        run: pip install .[dev,preprocessing]

       - name: Run tests
         run: |
@@ -318,7 +318,7 @@ jobs:
           python-version: ${{ env.PYTHON_VERSION }}

       - name: Install Haystack
-        run: pip install .[dev,sql]
+        run: pip install .[dev,sql,preprocessing]

       - name: Run tests
         run: |
@@ -404,7 +404,7 @@ jobs:
           python-version: ${{ env.PYTHON_VERSION }}

       - name: Install Haystack
-        run: pip install .[dev,docstores]
+        run: pip install .[dev,opensearch,preprocessing]

       - name: Run tests
         run: |
@@ -482,7 +482,7 @@ jobs:
           python-version: ${{ env.PYTHON_VERSION }}

       - name: Install Haystack
-        run: pip install .[dev]
+        run: pip install .[dev,preprocessing]

       - name: Run tests
         run: |
@@ -560,7 +560,7 @@ jobs:
           python-version: ${{ env.PYTHON_VERSION }}

       - name: Install Haystack
-        run: pip install .[faiss,dev]
+        run: pip install .[faiss,dev,preprocessing]

       - name: Run tests
         run: |
@@ -648,7 +648,7 @@ jobs:
           python-version: ${{ env.PYTHON_VERSION }}

       - name: Install Haystack
-        run: pip install .[dev,docstores]
+        run: pip install .[dev,weaviate,preprocessing]

       - name: Run tests
         run: |
@@ -726,7 +726,7 @@ jobs:
           python-version: ${{ env.PYTHON_VERSION }}

       - name: Install Haystack
-        run: pip install .[dev,pinecone]
+        run: pip install .[dev,pinecone,preprocessing]

       - name: Run tests
         env:
@@ -805,7 +805,7 @@ jobs:
           python-version: ${{ env.PYTHON_VERSION }}

       - name: Install Haystack
-        run: pip install .[dev]
+        run: pip install .[dev,preprocessing]

       - name: Run tests
         run: |
@@ -865,7 +865,6 @@ jobs:
           }

-

   integration-tests-promptnode:
     name: Integration / PromptNode / ${{ matrix.os }}
     needs:
@@ -883,7 +882,7 @@ jobs:
           python-version: ${{ env.PYTHON_VERSION }}

       - name: Install Haystack
-        run: pip install .[dev]
+        run: pip install .[dev,preprocessing]

       - name: Run tests
         run: |
@@ -961,7 +960,7 @@ jobs:
           python-version: ${{ env.PYTHON_VERSION }}

       - name: Install Haystack
-        run: pip install .[dev]
+        run: pip install .[dev,preprocessing]

       - name: Run tests
         run: |
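Note: because nltk and langdetect leave the core dependencies, every CI job above now opts into the preprocessing extra explicitly. A hedged sketch of how a downstream test module can guard itself when an extra is absent; pytest.importorskip is standard pytest API, while the test body itself is hypothetical and not part of this commit:

# Hypothetical test module: skip cleanly when the "preprocessing" extra
# (which provides nltk) is not installed.
import pytest

nltk = pytest.importorskip("nltk")  # skips the whole module if nltk is missing


def test_punkt_sentence_split():
    nltk.download("punkt", quiet=True)  # fetch the tokenizer model if needed
    sentences = nltk.tokenize.sent_tokenize("One sentence. Another sentence.")
    assert len(sentences) == 2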
@@ -45,7 +45,7 @@ target "base-cpu" {
     build_image = "python:3.10-slim"
     base_image = "python:3.10-slim"
     haystack_version = "${HAYSTACK_VERSION}"
-    haystack_extras = notequal("",HAYSTACK_EXTRAS) ? "${HAYSTACK_EXTRAS}" : "[docstores,crawler,preprocessing,ocr,onnx,metrics,beir]"
+    haystack_extras = notequal("",HAYSTACK_EXTRAS) ? "${HAYSTACK_EXTRAS}" : "[docstores,crawler,preprocessing,file-conversion,ocr,onnx,metrics,beir]"
   }
   platforms = ["linux/amd64", "linux/arm64"]
 }
@@ -59,7 +59,7 @@ target "base-gpu" {
     build_image = "pytorch/pytorch:1.13.1-cuda11.6-cudnn8-runtime"
     base_image = "pytorch/pytorch:1.13.1-cuda11.6-cudnn8-runtime"
     haystack_version = "${HAYSTACK_VERSION}"
-    haystack_extras = notequal("",HAYSTACK_EXTRAS) ? "${HAYSTACK_EXTRAS}" : "[docstores-gpu,crawler,preprocessing,ocr,onnx-gpu,metrics]"
+    haystack_extras = notequal("",HAYSTACK_EXTRAS) ? "${HAYSTACK_EXTRAS}" : "[docstores-gpu,crawler,preprocessing,file-conversion,ocr,onnx-gpu,metrics]"
   }
   platforms = ["linux/amd64", "linux/arm64"]
 }
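Note: in the bake targets, notequal("",HAYSTACK_EXTRAS) ? "${HAYSTACK_EXTRAS}" : "[...]" falls back to a default extras list whenever HAYSTACK_EXTRAS is unset, and the new default adds file-conversion. The same selection logic restated in Python purely for illustration; the function name is invented, not repository code:

# Illustrative restatement of the HCL ternary above, not repository code.
def resolve_extras(haystack_extras: str) -> str:
    default = "[docstores,crawler,preprocessing,file-conversion,ocr,onnx,metrics,beir]"
    # notequal("", HAYSTACK_EXTRAS) ? "${HAYSTACK_EXTRAS}" : default
    return haystack_extras if haystack_extras != "" else default


assert resolve_extras("") == "[docstores,crawler,preprocessing,file-conversion,ocr,onnx,metrics,beir]"
assert resolve_extras("[dev]") == "[dev]"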
@@ -1,7 +1,6 @@
 import logging
 from typing import List, Optional

-from langdetect import LangDetectException, detect

 from haystack.nodes.base import Document
 from haystack.nodes.doc_language_classifier.base import BaseDocumentLanguageClassifier
@@ -9,6 +8,16 @@ from haystack.nodes.doc_language_classifier.base import BaseDocumentLanguageClas
 logger = logging.getLogger(__name__)


+try:
+    import langdetect
+except ImportError as exc:
+    logger.debug(
+        "langdetect could not be imported. "
+        "Run 'pip install farm-haystack[file-conversion]' or 'pip install langdetect' to fix this issue."
+    )
+    langdetect = None
+
+
 class LangdetectDocumentLanguageClassifier(BaseDocumentLanguageClassifier):
     """
     A node based on the lightweight and fast [langdetect library](https://github.com/Mimino666/langdetect) for classifying the language of documents.
@@ -47,8 +56,14 @@ class LangdetectDocumentLanguageClassifier(BaseDocumentLanguageClassifier):
     def __init__(self, route_by_language: bool = True, languages_to_route: Optional[List[str]] = None):
         """
         :param route_by_language: Sends Documents to a different output edge depending on their language.
-        :param languages_to_route: A list of languages in ISO code, each corresponding to a different output edge (see [langdetect` documentation](https://github.com/Mimino666/langdetect#languages)).
+        :param languages_to_route: A list of languages in ISO code, each corresponding to a different output edge (see
+            [langdetect` documentation](https://github.com/Mimino666/langdetect#languages)).
         """
+        if not langdetect:
+            raise ImportError(
+                "langdetect could not be imported. "
+                "Run 'pip install farm-haystack[file-conversion]' or 'pip install langdetect' to fix this issue."
+            )
         super().__init__(route_by_language=route_by_language, languages_to_route=languages_to_route)

     def predict(self, documents: List[Document], batch_size: Optional[int] = None) -> List[Document]:
@@ -69,8 +84,8 @@ class LangdetectDocumentLanguageClassifier(BaseDocumentLanguageClassifier):
         documents_with_language = []
         for document in documents:
             try:
-                language = detect(document.content)
-            except LangDetectException:
+                language = langdetect.detect(document.content)
+            except langdetect.LangDetectException:
                 logger.warning("Langdetect cannot detect the language of document: %s", document)
                 language = None
             document.meta["language"] = language
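Note: the net effect is that importing this module no longer requires langdetect; the failure is deferred to node construction. A minimal sketch of that behaviour, assuming Haystack is installed without langdetect; the module path is inferred from the haystack.nodes.doc_language_classifier package imports above:

# Sketch, assuming langdetect is NOT installed: the module import succeeds
# (the try/except above swallows the ImportError), but instantiation fails.
from haystack.nodes.doc_language_classifier.langdetect import (
    LangdetectDocumentLanguageClassifier,
)

try:
    LangdetectDocumentLanguageClassifier()
except ImportError as err:
    print(err)  # "langdetect could not be imported. Run 'pip install ...'"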
@@ -2,7 +2,6 @@ from typing import Dict, Optional, List

 import logging
 from pathlib import Path
-import docx

 from haystack.nodes.file_converter.base import BaseConverter
 from haystack.schema import Document
@@ -11,7 +10,36 @@ from haystack.schema import Document
 logger = logging.getLogger(__name__)


+try:
+    import docx
+except ImportError as exc:
+    logger.debug(
+        "docx could not be imported. "
+        "Run 'pip install farm-haystack[file-conversion]' or 'pip install python-docx' to fix this issue."
+    )
+    docx = None
+
+
 class DocxToTextConverter(BaseConverter):
+    def __init__(
+        self,
+        remove_numeric_tables: bool = False,
+        valid_languages: Optional[List[str]] = None,
+        id_hash_keys: Optional[List[str]] = None,
+        progress_bar: bool = True,
+    ):
+        if not docx:
+            raise ImportError(
+                "docx could not be imported. "
+                "Run 'pip install farm-haystack[file-conversion]' or 'pip install python-docx' to fix this issue."
+            )
+        super().__init__(
+            remove_numeric_tables=remove_numeric_tables,
+            valid_languages=valid_languages,
+            id_hash_keys=id_hash_keys,
+            progress_bar=progress_bar,
+        )
+
     def convert(
         self,
         file_path: Path,
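Note: with the file-conversion extra (or python-docx) installed, the converter behaves as before; the new explicit __init__ only adds the guard. A hypothetical usage sketch: sample.docx is a placeholder, and the module path is an assumption mirroring the haystack.nodes.file_converter package imported above:

# Hypothetical usage, not part of the commit; requires python-docx.
from pathlib import Path

from haystack.nodes.file_converter.docx import DocxToTextConverter

converter = DocxToTextConverter(remove_numeric_tables=False)
documents = converter.convert(file_path=Path("sample.docx"))  # -> List[Document]
print(documents[0].content[:80])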
@@ -7,13 +7,24 @@ import subprocess
 from html.parser import HTMLParser

 import requests
-from tika import parser as tikaparser

 from haystack.nodes.file_converter.base import BaseConverter
 from haystack.schema import Document


 logger = logging.getLogger(__name__)


+try:
+    from tika import parser as tika_parser
+except ImportError as exc:
+    logger.debug(
+        "tika could not be imported. "
+        "Run 'pip install farm-haystack[file-conversion]' or 'pip install tika' to fix this issue."
+    )
+    tika_parser = None
+
+
 TIKA_CONTAINER_NAME = "tika"
@@ -96,6 +107,11 @@ class TikaConverter(BaseConverter):
             as a float, or a :ref:`(connect timeout, read timeout) <timeouts>` tuple.
             Defaults to 10 seconds.
         """
+        if not tika_parser:
+            raise ImportError(
+                "tika could not be imported. "
+                "Run 'pip install farm-haystack[file-conversion]' or 'pip install tika' to fix this issue."
+            )
         super().__init__(
             remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages, id_hash_keys=id_hash_keys
         )
@@ -146,7 +162,7 @@ class TikaConverter(BaseConverter):
         if id_hash_keys is None:
             id_hash_keys = self.id_hash_keys

-        parsed = tikaparser.from_file(file_path.as_posix(), self.tika_url, xmlContent=True)
+        parsed = tika_parser.from_file(file_path.as_posix(), self.tika_url, xmlContent=True)
         parser = TikaXHTMLParser()
         parser.feed(parsed["content"])
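Note: the rename from tikaparser to tika_parser keeps the guarded alias consistent with the other modules (this is the "fix tika" item from the commit message). A hedged usage sketch; the URL, file name, and module path are placeholders or assumptions, while tika_url matches the self.tika_url attribute used in the diff and 9998 is Tika's default port:

# Hypothetical usage: requires the tika client and a reachable Tika server
# (TIKA_CONTAINER_NAME above suggests a containerized server in tests).
from pathlib import Path

from haystack.nodes.file_converter.tika import TikaConverter

converter = TikaConverter(tika_url="http://localhost:9998")  # placeholder URL
documents = converter.convert(file_path=Path("sample.pdf"))
print(documents[0].content[:80])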
@@ -1,21 +1,16 @@
+from typing import List, Optional, Generator, Set, Union, Tuple, Dict, Literal
+
 import logging
 import re
 from copy import deepcopy
 from functools import partial, reduce
 from itertools import chain
-from typing import List, Optional, Generator, Set, Union, Tuple, Dict
-
-try:
-    from typing import Literal
-except ImportError:
-    from typing_extensions import Literal  # type: ignore
 import warnings
 from pathlib import Path
 from pickle import UnpicklingError

-import nltk
-from more_itertools import windowed
 from tqdm.auto import tqdm
+from more_itertools import windowed

 from haystack.nodes.preprocessor.base import BasePreProcessor
 from haystack.errors import HaystackError
@@ -25,6 +20,16 @@ from haystack.schema import Document
 logger = logging.getLogger(__name__)


+try:
+    import nltk
+except ImportError as exc:
+    logger.debug(
+        "nltk could not be imported. "
+        "Run 'pip install farm-haystack[preprocessing]' or 'pip install nltk' to fix this issue."
+    )
+    nltk = None
+
+
 iso639_to_nltk = {
     "ru": "russian",
     "sl": "slovene",
@@ -105,10 +110,12 @@ class PreProcessor(BasePreProcessor):
         super().__init__()

         try:
-            nltk.data.find("tokenizers/punkt")
+            if nltk:
+                nltk.data.find("tokenizers/punkt")
         except LookupError:
             try:
-                nltk.download("punkt")
+                if nltk:
+                    nltk.download("punkt")
             except FileExistsError as error:
                 logger.debug("NLTK punkt tokenizer seems to be already downloaded. Error message: %s", error)
                 pass
@@ -805,7 +812,12 @@ class PreProcessor(BasePreProcessor):
         sentences = sentence_tokenizer.tokenize(text)
         return sentences

-    def _load_sentence_tokenizer(self, language_name: Optional[str]) -> nltk.tokenize.punkt.PunktSentenceTokenizer:
+    def _load_sentence_tokenizer(self, language_name: Optional[str]) -> "nltk.tokenize.punkt.PunktSentenceTokenizer":
+        if not nltk:
+            raise ImportError(
+                "nltk could not be imported. "
+                "Run 'pip install farm-haystack[preprocessing]' or 'pip install nltk' to fix this issue."
+            )
         # Try to load a custom model from 'tokenizer_model_path'
         if self.tokenizer_model_folder is not None:
             tokenizer_model_path = Path(self.tokenizer_model_folder).absolute() / f"{self.language}.pickle"
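Note: the new "if nltk:" checks sit inside the existing try blocks rather than replacing them because, with nltk stubbed to None, attribute access would raise AttributeError, which the surrounding "except LookupError" / "except FileExistsError" handlers do not catch. A standalone sketch of that reasoning, not repository code:

# Standalone sketch of the guard semantics above.
nltk = None  # simulate the failed optional import

try:
    if nltk:
        nltk.data.find("tokenizers/punkt")  # skipped entirely when nltk is None
except LookupError:
    pass  # never reached without nltk: no lookup was attempted

# Without the guard, nltk.data.find would raise AttributeError here, and
# "except LookupError" would let it propagate out of __init__.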
@@ -50,7 +50,6 @@ dependencies = [
     "pydantic",
     "transformers[torch]==4.25.1",
     "protobuf<=3.20.2",  # same version they use in transformers[sentencepiece]
-    "nltk",
     "pandas",
     "rank_bm25",
     "scikit-learn>=1.0.0",  # TF-IDF, SklearnQueryClassifier and metrics
@@ -67,16 +66,11 @@ dependencies = [
     "huggingface-hub>=0.5.0",
     "tenacity",  # retry decorator
     "sseclient-py",  # server side events for OpenAI streaming
+    "more_itertools",  # utilities

     # Web Retriever
     "boilerpy3",

-    # Preprocessing
-    "more_itertools",  # for windowing
-    "python-docx",
-    "langdetect",  # for PDF conversions
-    "tika",  # Apache Tika (text & metadata extractor)
-
     # See haystack/nodes/retriever/_embedding_encoder.py, _SentenceTransformersEmbeddingEncoder
     "sentence-transformers>=2.2.0",
@@ -157,6 +151,12 @@ crawler = [
     "webdriver-manager",
 ]
 preprocessing = [
+    "nltk",
+    "langdetect",  # for language classification
+]
+file-conversion = [
+    "python-docx",
+    "tika",  # Apache Tika (text & metadata extractor)
     "beautifulsoup4",
     "markdown",
     "python-frontmatter",
@@ -224,11 +224,11 @@ formatting = [
 ]

 all = [
-    "farm-haystack[docstores,audio,crawler,preprocessing,pdf,ocr,ray,dev,onnx,beir,metrics]",
+    "farm-haystack[docstores,audio,crawler,preprocessing,file-conversion,pdf,ocr,ray,dev,onnx,beir,metrics]",
 ]
 all-gpu = [
     # beir is incompatible with faiss-gpu: https://github.com/beir-cellar/beir/issues/71
-    "farm-haystack[docstores-gpu,audio,crawler,preprocessing,pdf,ocr,ray,dev,onnx-gpu,metrics]",
+    "farm-haystack[docstores-gpu,audio,crawler,preprocessing,file-conversion,pdf,ocr,ray,dev,onnx-gpu,metrics]",
 ]

 [project.scripts]
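Note: after this change the optional dependencies are split across the preprocessing and file-conversion extras shown above. A sketch of verifying an environment at runtime with only standard-library calls; note that importable module names differ from pip package names (python-docx imports as docx, beautifulsoup4 as bs4, python-frontmatter as frontmatter):

# Quick runtime check (standard library only) for the packages behind the new extras.
from importlib.util import find_spec

extras = {
    "preprocessing": ["nltk", "langdetect"],
    "file-conversion": ["docx", "tika", "bs4", "markdown", "frontmatter"],
}
for extra, modules in extras.items():
    missing = [m for m in modules if find_spec(m) is None]
    print(f"{extra}: {'ok' if not missing else 'missing ' + ', '.join(missing)}")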