refactor!: extract preprocessing and file conversion deps (#4605)

* isolate file-conversion deps

* pylint

* add to all extra

* chain was missing

* move langdetect into preprocessing and fix tika

* add file-conversion extra
ZanSara committed 2023-04-14 11:34:16 +02:00 (committed by GitHub)
parent 16091f6ad2
commit d8ac30fa47
7 changed files with 110 additions and 40 deletions
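Every Python module touched below applies the same optional-import pattern: the heavy dependency is imported inside a try/except at module load, a debug-level hint is logged when it is missing, and an ImportError with the same install hint is raised only when a node that needs the dependency is instantiated. A minimal sketch of the pattern as applied here (`somelib` and `some-extra` are placeholders, not real dependency or extra names):

import logging

logger = logging.getLogger(__name__)

try:
    import somelib  # placeholder for nltk, langdetect, docx, tika, ...
except ImportError:
    logger.debug(
        "somelib could not be imported. "
        "Run 'pip install farm-haystack[some-extra]' or 'pip install somelib' to fix this issue."
    )
    somelib = None


class SomeNode:
    def __init__(self):
        # Importing the module stays cheap; only constructing the node fails
        # when the optional dependency is absent.
        if not somelib:
            raise ImportError(
                "somelib could not be imported. "
                "Run 'pip install farm-haystack[some-extra]' or 'pip install somelib' to fix this issue."
            )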

.github/workflows/tests.yml

@@ -240,7 +240,7 @@ jobs:
           python-version: ${{ env.PYTHON_VERSION }}
 
       - name: Install Haystack
-        run: pip install .[dev,docstores]
+        run: pip install .[dev,preprocessing]
 
       - name: Run tests
         run: |
@@ -318,7 +318,7 @@ jobs:
           python-version: ${{ env.PYTHON_VERSION }}
 
       - name: Install Haystack
-        run: pip install .[dev,sql]
+        run: pip install .[dev,sql,preprocessing]
 
       - name: Run tests
         run: |
@@ -404,7 +404,7 @@ jobs:
           python-version: ${{ env.PYTHON_VERSION }}
 
       - name: Install Haystack
-        run: pip install .[dev,docstores]
+        run: pip install .[dev,opensearch,preprocessing]
 
       - name: Run tests
         run: |
@@ -482,7 +482,7 @@ jobs:
           python-version: ${{ env.PYTHON_VERSION }}
 
       - name: Install Haystack
-        run: pip install .[dev]
+        run: pip install .[dev,preprocessing]
 
       - name: Run tests
         run: |
@@ -560,7 +560,7 @@ jobs:
           python-version: ${{ env.PYTHON_VERSION }}
 
       - name: Install Haystack
-        run: pip install .[faiss,dev]
+        run: pip install .[faiss,dev,preprocessing]
 
       - name: Run tests
         run: |
@@ -648,7 +648,7 @@ jobs:
           python-version: ${{ env.PYTHON_VERSION }}
 
       - name: Install Haystack
-        run: pip install .[dev,docstores]
+        run: pip install .[dev,weaviate,preprocessing]
 
       - name: Run tests
         run: |
@@ -726,7 +726,7 @@ jobs:
           python-version: ${{ env.PYTHON_VERSION }}
 
       - name: Install Haystack
-        run: pip install .[dev,pinecone]
+        run: pip install .[dev,pinecone,preprocessing]
 
       - name: Run tests
         env:
@@ -805,7 +805,7 @@ jobs:
           python-version: ${{ env.PYTHON_VERSION }}
 
       - name: Install Haystack
-        run: pip install .[dev]
+        run: pip install .[dev,preprocessing]
 
       - name: Run tests
         run: |
@@ -865,7 +865,6 @@ jobs:
         }
 
   integration-tests-promptnode:
     name: Integration / PromptNode / ${{ matrix.os }}
     needs:
@@ -883,7 +882,7 @@ jobs:
           python-version: ${{ env.PYTHON_VERSION }}
 
       - name: Install Haystack
-        run: pip install .[dev]
+        run: pip install .[dev,preprocessing]
 
      - name: Run tests
         run: |
@@ -961,7 +960,7 @@ jobs:
           python-version: ${{ env.PYTHON_VERSION }}
 
      - name: Install Haystack
-        run: pip install .[dev]
+        run: pip install .[dev,preprocessing]
 
      - name: Run tests
         run: |

docker/docker-bake.hcl

@@ -45,7 +45,7 @@ target "base-cpu" {
     build_image = "python:3.10-slim"
     base_image = "python:3.10-slim"
     haystack_version = "${HAYSTACK_VERSION}"
-    haystack_extras = notequal("",HAYSTACK_EXTRAS) ? "${HAYSTACK_EXTRAS}" : "[docstores,crawler,preprocessing,ocr,onnx,metrics,beir]"
+    haystack_extras = notequal("",HAYSTACK_EXTRAS) ? "${HAYSTACK_EXTRAS}" : "[docstores,crawler,preprocessing,file-conversion,ocr,onnx,metrics,beir]"
   }
   platforms = ["linux/amd64", "linux/arm64"]
 }
@@ -59,7 +59,7 @@ target "base-gpu" {
     build_image = "pytorch/pytorch:1.13.1-cuda11.6-cudnn8-runtime"
     base_image = "pytorch/pytorch:1.13.1-cuda11.6-cudnn8-runtime"
     haystack_version = "${HAYSTACK_VERSION}"
-    haystack_extras = notequal("",HAYSTACK_EXTRAS) ? "${HAYSTACK_EXTRAS}" : "[docstores-gpu,crawler,preprocessing,ocr,onnx-gpu,metrics]"
+    haystack_extras = notequal("",HAYSTACK_EXTRAS) ? "${HAYSTACK_EXTRAS}" : "[docstores-gpu,crawler,preprocessing,file-conversion,ocr,onnx-gpu,metrics]"
   }
   platforms = ["linux/amd64", "linux/arm64"]
 }

haystack/nodes/doc_language_classifier/langdetect.py

@@ -1,7 +1,6 @@
 import logging
 from typing import List, Optional
 
-from langdetect import LangDetectException, detect
 from haystack.nodes.base import Document
 from haystack.nodes.doc_language_classifier.base import BaseDocumentLanguageClassifier
@@ -9,6 +8,16 @@ from haystack.nodes.doc_language_classifier.base import BaseDocumentLanguageClassifier
 logger = logging.getLogger(__name__)
 
+try:
+    import langdetect
+except ImportError as exc:
+    logger.debug(
+        "langdetect could not be imported. "
+        "Run 'pip install farm-haystack[preprocessing]' or 'pip install langdetect' to fix this issue."
+    )
+    langdetect = None
+
 
 class LangdetectDocumentLanguageClassifier(BaseDocumentLanguageClassifier):
     """
     A node based on the lightweight and fast [langdetect library](https://github.com/Mimino666/langdetect) for classifying the language of documents.
@@ -47,8 +56,14 @@ class LangdetectDocumentLanguageClassifier(BaseDocumentLanguageClassifier):
     def __init__(self, route_by_language: bool = True, languages_to_route: Optional[List[str]] = None):
         """
         :param route_by_language: Sends Documents to a different output edge depending on their language.
-        :param languages_to_route: A list of languages in ISO code, each corresponding to a different output edge (see [langdetect documentation](https://github.com/Mimino666/langdetect#languages)).
+        :param languages_to_route: A list of languages in ISO code, each corresponding to a different output edge (see
+            [langdetect documentation](https://github.com/Mimino666/langdetect#languages)).
         """
+        if not langdetect:
+            raise ImportError(
+                "langdetect could not be imported. "
+                "Run 'pip install farm-haystack[preprocessing]' or 'pip install langdetect' to fix this issue."
+            )
         super().__init__(route_by_language=route_by_language, languages_to_route=languages_to_route)
 
     def predict(self, documents: List[Document], batch_size: Optional[int] = None) -> List[Document]:
@@ -69,8 +84,8 @@ class LangdetectDocumentLanguageClassifier(BaseDocumentLanguageClassifier):
         documents_with_language = []
         for document in documents:
             try:
-                language = detect(document.content)
-            except LangDetectException:
+                language = langdetect.detect(document.content)
+            except langdetect.LangDetectException:
                 logger.warning("Langdetect cannot detect the language of document: %s", document)
                 language = None
             document.meta["language"] = language
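With the preprocessing extra installed, the classifier behaves exactly as before. A minimal usage sketch (assuming the class is re-exported from haystack.nodes as elsewhere in the package; the document content is made up):

from haystack.schema import Document
from haystack.nodes import LangdetectDocumentLanguageClassifier

classifier = LangdetectDocumentLanguageClassifier(route_by_language=False)
docs = classifier.predict(documents=[Document(content="Ceci n'est pas une pipe.")])
print(docs[0].meta["language"])  # "fr", as detected by langdetect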

haystack/nodes/file_converter/docx.py

@@ -2,7 +2,6 @@ from typing import Dict, Optional, List
 import logging
 from pathlib import Path
 
-import docx
 from haystack.nodes.file_converter.base import BaseConverter
 from haystack.schema import Document
@@ -11,7 +10,36 @@ from haystack.schema import Document
 logger = logging.getLogger(__name__)
 
+try:
+    import docx
+except ImportError as exc:
+    logger.debug(
+        "docx could not be imported. "
+        "Run 'pip install farm-haystack[file-conversion]' or 'pip install python-docx' to fix this issue."
+    )
+    docx = None
+
 
 class DocxToTextConverter(BaseConverter):
+    def __init__(
+        self,
+        remove_numeric_tables: bool = False,
+        valid_languages: Optional[List[str]] = None,
+        id_hash_keys: Optional[List[str]] = None,
+        progress_bar: bool = True,
+    ):
+        if not docx:
+            raise ImportError(
+                "docx could not be imported. "
+                "Run 'pip install farm-haystack[file-conversion]' or 'pip install python-docx' to fix this issue."
+            )
+        super().__init__(
+            remove_numeric_tables=remove_numeric_tables,
+            valid_languages=valid_languages,
+            id_hash_keys=id_hash_keys,
+            progress_bar=progress_bar,
+        )
+
     def convert(
         self,
         file_path: Path,

haystack/nodes/file_converter/tika.py

@@ -7,13 +7,24 @@ import subprocess
 from html.parser import HTMLParser
 
 import requests
-from tika import parser as tikaparser
 
 from haystack.nodes.file_converter.base import BaseConverter
 from haystack.schema import Document
 
 logger = logging.getLogger(__name__)
 
+try:
+    from tika import parser as tika_parser
+except ImportError as exc:
+    logger.debug(
+        "tika could not be imported. "
+        "Run 'pip install farm-haystack[file-conversion]' or 'pip install tika' to fix this issue."
+    )
+    tika_parser = None
+
 
 TIKA_CONTAINER_NAME = "tika"
@@ -96,6 +107,11 @@ class TikaConverter(BaseConverter):
             as a float, or a :ref:`(connect timeout, read timeout) <timeouts>` tuple.
             Defaults to 10 seconds.
         """
+        if not tika_parser:
+            raise ImportError(
+                "tika could not be imported. "
+                "Run 'pip install farm-haystack[file-conversion]' or 'pip install tika' to fix this issue."
+            )
         super().__init__(
             remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages, id_hash_keys=id_hash_keys
         )
@@ -146,7 +162,7 @@ class TikaConverter(BaseConverter):
         if id_hash_keys is None:
             id_hash_keys = self.id_hash_keys
 
-        parsed = tikaparser.from_file(file_path.as_posix(), self.tika_url, xmlContent=True)
+        parsed = tika_parser.from_file(file_path.as_posix(), self.tika_url, xmlContent=True)
         parser = TikaXHTMLParser()
         parser.feed(parsed["content"])
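Without tika installed, the module now imports cleanly and the failure is deferred to construction time, with an actionable message. A sketch of the new failure mode (module path assumed from the imports above):

from haystack.nodes.file_converter.tika import TikaConverter

try:
    converter = TikaConverter()
except ImportError as err:
    # "tika could not be imported. Run 'pip install farm-haystack[file-conversion]' ..."
    print(err)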

haystack/nodes/preprocessor/preprocessor.py

@@ -1,21 +1,16 @@
+from typing import List, Optional, Generator, Set, Union, Tuple, Dict, Literal
+
 import logging
 import re
 from copy import deepcopy
 from functools import partial, reduce
 from itertools import chain
-from typing import List, Optional, Generator, Set, Union, Tuple, Dict
-
-try:
-    from typing import Literal
-except ImportError:
-    from typing_extensions import Literal  # type: ignore
-
 import warnings
 from pathlib import Path
 from pickle import UnpicklingError
 
-import nltk
-from more_itertools import windowed
 from tqdm.auto import tqdm
+from more_itertools import windowed
 
 from haystack.nodes.preprocessor.base import BasePreProcessor
 from haystack.errors import HaystackError
@@ -25,6 +20,16 @@ from haystack.schema import Document
 logger = logging.getLogger(__name__)
 
+try:
+    import nltk
+except ImportError as exc:
+    logger.debug(
+        "nltk could not be imported. "
+        "Run 'pip install farm-haystack[preprocessing]' or 'pip install nltk' to fix this issue."
+    )
+    nltk = None
+
 
 iso639_to_nltk = {
     "ru": "russian",
     "sl": "slovene",
@@ -105,10 +110,12 @@ class PreProcessor(BasePreProcessor):
         super().__init__()
 
         try:
-            nltk.data.find("tokenizers/punkt")
+            if nltk:
+                nltk.data.find("tokenizers/punkt")
         except LookupError:
             try:
-                nltk.download("punkt")
+                if nltk:
+                    nltk.download("punkt")
             except FileExistsError as error:
                 logger.debug("NLTK punkt tokenizer seems to be already downloaded. Error message: %s", error)
                 pass
@@ -805,7 +812,12 @@ class PreProcessor(BasePreProcessor):
         sentences = sentence_tokenizer.tokenize(text)
         return sentences
 
-    def _load_sentence_tokenizer(self, language_name: Optional[str]) -> nltk.tokenize.punkt.PunktSentenceTokenizer:
+    def _load_sentence_tokenizer(self, language_name: Optional[str]) -> "nltk.tokenize.punkt.PunktSentenceTokenizer":
+        if not nltk:
+            raise ImportError(
+                "nltk could not be imported. "
+                "Run 'pip install farm-haystack[preprocessing]' or 'pip install nltk' to fix this issue."
+            )
         # Try to load a custom model from 'tokenizer_model_path'
         if self.tokenizer_model_folder is not None:
             tokenizer_model_path = Path(self.tokenizer_model_folder).absolute() / f"{self.language}.pickle"
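The effect for PreProcessor is the same: importing and constructing it no longer requires nltk, and the ImportError with the install hint fires only when a sentence tokenizer is actually loaded. A sketch, assuming nltk is absent:

from haystack.nodes import PreProcessor

preprocessor = PreProcessor()  # construction succeeds; the nltk lookups are now guarded
# Any operation that reaches _load_sentence_tokenizer() raises:
#   ImportError: nltk could not be imported. Run 'pip install
#   farm-haystack[preprocessing]' or 'pip install nltk' to fix this issue.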

pyproject.toml

@@ -50,7 +50,6 @@ dependencies = [
     "pydantic",
     "transformers[torch]==4.25.1",
     "protobuf<=3.20.2",  # same version they use in transformers[sentencepiece]
-    "nltk",
     "pandas",
     "rank_bm25",
     "scikit-learn>=1.0.0",  # TF-IDF, SklearnQueryClassifier and metrics
@@ -67,16 +66,11 @@ dependencies = [
     "huggingface-hub>=0.5.0",
     "tenacity",  # retry decorator
     "sseclient-py",  # server side events for OpenAI streaming
+    "more_itertools",  # utilities
 
     # Web Retriever
     "boilerpy3",
 
-    # Preprocessing
-    "more_itertools",  # for windowing
-    "python-docx",
-    "langdetect",  # for PDF conversions
-    "tika",  # Apache Tika (text & metadata extractor)
-
     # See haystack/nodes/retriever/_embedding_encoder.py, _SentenceTransformersEmbeddingEncoder
     "sentence-transformers>=2.2.0",
@@ -157,6 +151,12 @@ crawler = [
     "webdriver-manager",
 ]
 preprocessing = [
+    "nltk",
+    "langdetect",  # for language classification
+]
+file-conversion = [
+    "python-docx",
+    "tika",  # Apache Tika (text & metadata extractor)
     "beautifulsoup4",
     "markdown",
     "python-frontmatter",
@@ -224,11 +224,11 @@ formatting = [
 ]
 all = [
-    "farm-haystack[docstores,audio,crawler,preprocessing,pdf,ocr,ray,dev,onnx,beir,metrics]",
+    "farm-haystack[docstores,audio,crawler,preprocessing,file-conversion,pdf,ocr,ray,dev,onnx,beir,metrics]",
 ]
 all-gpu = [
     # beir is incompatible with faiss-gpu: https://github.com/beir-cellar/beir/issues/71
-    "farm-haystack[docstores-gpu,audio,crawler,preprocessing,pdf,ocr,ray,dev,onnx-gpu,metrics]",
+    "farm-haystack[docstores-gpu,audio,crawler,preprocessing,file-conversion,pdf,ocr,ray,dev,onnx-gpu,metrics]",
 ]
 
 [project.scripts]