diff --git a/.github/workflows/imports.yml b/.github/workflows/imports.yml
index 20df21ff5..3c102de3e 100644
--- a/.github/workflows/imports.yml
+++ b/.github/workflows/imports.yml
@@ -37,5 +37,5 @@ jobs:
       - name: Install Haystack with no extras
         run: pip install .
 
-      - name: Try to import
+      - name: Import Haystack
        run: python -c 'import haystack'
diff --git a/haystack/__init__.py b/haystack/__init__.py
index 5da725e7e..b527cee87 100644
--- a/haystack/__init__.py
+++ b/haystack/__init__.py
@@ -1,21 +1,94 @@
-# pylint: disable=wrong-import-position,wrong-import-order
+# pylint: disable=wrong-import-position
+# Logging is not configured here on purpose, see https://github.com/deepset-ai/haystack/issues/2485
 
-from typing import Union
-from types import ModuleType
-
-try:
-    from importlib import metadata
-except (ModuleNotFoundError, ImportError):
-    # Python <= 3.7
-    import importlib_metadata as metadata  # type: ignore
+import sys
+from importlib import metadata
 
 __version__: str = str(metadata.version("farm-haystack"))
 
+from generalimport import generalimport, MissingOptionalDependency, FakeModule
 
-# Logging is not configured here on purpose, see https://github.com/deepset-ai/haystack/issues/2485
-import logging
+generalimport(
+    # "pydantic",  # Required for all dataclasses
+    # "tenacity",  # Probably needed because it's a decorator, to be evaluated
+    # "pandas",
+    "aiorwlock",
+    "azure",
+    "beautifulsoup4",
+    "beir",
+    "boilerpy3",
+    "canals",
+    "dill",
+    "docx",
+    "elasticsearch",
+    "events",
+    "faiss",
+    "fitz",
+    "frontmatter",
+    "huggingface_hub",
+    "jsonschema",
+    "langdetect",
+    "magic",
+    "markdown",
+    "mlflow",
+    "mmh3",
+    "more_itertools",
+    "networkx",
+    "nltk",
+    "numpy",
+    "onnxruntime",
+    "onnxruntime_tools",
+    "opensearchpy",
+    "pdf2image",
+    "PIL",
+    "pinecone",
+    "posthog",
+    "protobuf",
+    "psycopg2",
+    "pymilvus",
+    "pytesseract",
+    "quantulum3",
+    "rank_bm25",
+    "rapidfuzz",
+    "ray",
+    "rdflib",
+    "requests",
+    "scipy",
+    "selenium",
+    "sentence_transformers",
+    "seqeval",
+    "sklearn",
+    "SPARQLWrapper",
+    "sqlalchemy",
+    "sseclient",
+    "tenacity",
+    "tika",
+    "tiktoken",
+    "tokenizers",
+    "torch",
+    "tqdm",
+    "transformers",
+    "weaviate",
+    "webdriver_manager",
+    "whisper",
+    "yaml",
+)
+
+
+# TODO: remove this function once this PR is merged and released by generalimport:
+# https://github.com/ManderaGeneral/generalimport/pull/25
+def is_imported(module_name: str) -> bool:
+    """
+    Returns True if the module was actually imported, False, if generalimport mocked it.
+    """
+    module = sys.modules.get(module_name)
+    try:
+        return bool(module) and not isinstance(module, FakeModule)
+    except MissingOptionalDependency:
+        # isinstance() raises MissingOptionalDependency: fake module
+        pass
+    return False
 
-import pandas as pd
 
 from haystack.schema import Document, Answer, Label, MultiLabel, Span, EvaluationResult, TableCell
 from haystack.nodes.base import BaseComponent
@@ -23,5 +96,6 @@
 from haystack.pipelines.base import Pipeline
 from haystack.environment import set_pytorch_secure_model_loading
 
-pd.options.display.max_colwidth = 80
+# Enables torch's secure model loading through setting an env var.
+# Does not use torch.
 set_pytorch_secure_model_loading()
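How the `generalimport` hook registered above behaves: any listed module that is not actually installed is replaced at import time with a `FakeModule`, and the error is deferred until the module is first *used*. A minimal, self-contained sketch of that behavior (the module name `notinstalled_demo` is hypothetical; only the `generalimport` package itself is assumed installed):

```python
from generalimport import generalimport, MissingOptionalDependency

generalimport("notinstalled_demo")  # register the (hypothetical) optional module

import notinstalled_demo  # succeeds even though no such package exists

try:
    notinstalled_demo.do_something()  # the first real use is what raises
except MissingOptionalDependency as err:
    print(f"deferred import error: {err}")
```

This is why the bare `import haystack` in the workflow above can pass in an environment with no extras: the import graph never touches a fake module's attributes, only binds names.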
diff --git a/haystack/document_stores/faiss.py b/haystack/document_stores/faiss.py
index bc4d4f8e2..a787a8cc9 100644
--- a/haystack/document_stores/faiss.py
+++ b/haystack/document_stores/faiss.py
@@ -46,7 +46,7 @@ class FAISSDocumentStore(SQLDocumentStore):
         vector_dim: Optional[int] = None,
         embedding_dim: int = 768,
         faiss_index_factory_str: str = "Flat",
-        faiss_index: Optional[faiss.swigfaiss.Index] = None,
+        faiss_index: Optional["faiss.swigfaiss.Index"] = None,
         return_embedding: bool = False,
         index: str = "document",
         similarity: str = "dot_product",
diff --git a/haystack/document_stores/pinecone.py b/haystack/document_stores/pinecone.py
index 059b83e10..e600b617c 100644
--- a/haystack/document_stores/pinecone.py
+++ b/haystack/document_stores/pinecone.py
@@ -58,7 +58,7 @@ class PineconeDocumentStore(BaseDocumentStore):
         self,
         api_key: str,
         environment: str = "us-west1-gcp",
-        pinecone_index: Optional[pinecone.Index] = None,
+        pinecone_index: Optional["pinecone.Index"] = None,
         embedding_dim: int = 768,
         return_embedding: bool = False,
         index: str = "document",
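The quoting in these two annotations is what lets the modules load when `faiss` or `pinecone` is absent or mocked: an unquoted default annotation like `faiss.swigfaiss.Index` is evaluated while the `def` statement runs, whereas a string annotation is stored verbatim and only resolved on demand. A standalone sketch of the difference (no Haystack or faiss required; `Store` is an illustrative class, not Haystack code):

```python
from typing import Optional, get_type_hints


class Store:
    # The string annotation is NOT evaluated at definition time,
    # so this class can be defined without faiss being importable.
    def __init__(self, index: Optional["faiss.swigfaiss.Index"] = None):
        self.index = index


store = Store()  # works even though `faiss` was never imported

try:
    get_type_hints(Store.__init__)  # resolving the hint is what fails
except NameError as err:
    print(err)  # name 'faiss' is not defined
```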
diff --git a/haystack/document_stores/sql.py b/haystack/document_stores/sql.py
index 2de165bae..f55eb02e8 100644
--- a/haystack/document_stores/sql.py
+++ b/haystack/document_stores/sql.py
@@ -27,103 +27,115 @@ try:
     from sqlalchemy.ext.declarative import declarative_base
     from sqlalchemy.orm import relationship, sessionmaker, aliased
     from sqlalchemy.sql import case, null
+
 except (ImportError, ModuleNotFoundError) as ie:
     from haystack.utils.import_utils import _optional_component_not_installed
 
     _optional_component_not_installed(__name__, "sql", ie)
 
+
+from haystack import is_imported
 from haystack.schema import Document, Label, Answer
 from haystack.document_stores.base import BaseDocumentStore, FilterType
 from haystack.document_stores.filter_utils import LogicalFilterClause
 
+if not is_imported("sqlalchemy"):
+    Base = object
+    ArrayType = object
+    ORMBase = object
+    DocumentORM = object
+    MetaDocumentORM = object
+    LabelORM = object
+    MetaLabelORM = object
+
+else:
+    Base = declarative_base()  # type: Any
+
+    class ArrayType(TypeDecorator):
+        impl = String
+        cache_ok = True
+
+        def process_bind_param(self, value, dialect):
+            return json.dumps(value)
+
+        def process_result_value(self, value, dialect):
+            if value is not None:
+                return json.loads(value)
+            return value
+
+    class ORMBase(Base):
+        __abstract__ = True
+
+        id = Column(String(100), default=lambda: str(uuid4()), primary_key=True)
+        created_at = Column(DateTime, server_default=func.now())
+        updated_at = Column(DateTime, server_default=func.now(), server_onupdate=func.now())
+
+    class DocumentORM(ORMBase):
+        __tablename__ = "document"
+
+        content = Column(JSON, nullable=False)
+        content_type = Column(Text, nullable=True)
+        # primary key in combination with id to allow the same doc in different indices
+        index = Column(String(100), nullable=False, primary_key=True)
+        vector_id = Column(String(100), nullable=True)
+        # speeds up queries for get_documents_by_vector_ids() by having a single query that returns joined metadata
+        meta = relationship("MetaDocumentORM", back_populates="documents", lazy="joined")
+
+        __table_args__ = (UniqueConstraint("index", "vector_id", name="index_vector_id_uc"),)
+
+    class MetaDocumentORM(ORMBase):
+        __tablename__ = "meta_document"
+
+        name = Column(String(100), index=True)
+        value = Column(ArrayType(1000), index=True)
+        documents = relationship("DocumentORM", back_populates="meta")
+
+        document_id = Column(String(100), nullable=False, index=True)
+        document_index = Column(String(100), nullable=False, index=True)
+        __table_args__ = (  # type: ignore
+            ForeignKeyConstraint(
+                [document_id, document_index],
+                [DocumentORM.id, DocumentORM.index],
+                ondelete="CASCADE",
+                onupdate="CASCADE",
+            ),
+            {},
+        )
+
+    class LabelORM(ORMBase):
+        __tablename__ = "label"
+
+        index = Column(String(100), nullable=False, primary_key=True)
+        query = Column(Text, nullable=False)
+        answer = Column(JSON, nullable=True)
+        document = Column(JSON, nullable=False)
+        no_answer = Column(Boolean, nullable=False)
+        origin = Column(String(100), nullable=False)
+        is_correct_answer = Column(Boolean, nullable=False)
+        is_correct_document = Column(Boolean, nullable=False)
+        pipeline_id = Column(String(500), nullable=True)
+
+        meta = relationship("MetaLabelORM", back_populates="labels", lazy="joined")
+
+    class MetaLabelORM(ORMBase):
+        __tablename__ = "meta_label"
+
+        name = Column(String(100), index=True)
+        value = Column(String(1000), index=True)
+        labels = relationship("LabelORM", back_populates="meta")
+
+        label_id = Column(String(100), nullable=False, index=True)
+        label_index = Column(String(100), nullable=False, index=True)
+        __table_args__ = (  # type: ignore
+            ForeignKeyConstraint(
+                [label_id, label_index], [LabelORM.id, LabelORM.index], ondelete="CASCADE", onupdate="CASCADE"
+            ),
+            {},
+        )
+
+
 logger = logging.getLogger(__name__)
 
-Base = declarative_base()  # type: Any
-
-
-class ArrayType(TypeDecorator):
-    impl = String
-    cache_ok = True
-
-    def process_bind_param(self, value, dialect):
-        return json.dumps(value)
-
-    def process_result_value(self, value, dialect):
-        if value is not None:
-            return json.loads(value)
-        return value
-
-
-class ORMBase(Base):
-    __abstract__ = True
-
-    id = Column(String(100), default=lambda: str(uuid4()), primary_key=True)
-    created_at = Column(DateTime, server_default=func.now())
-    updated_at = Column(DateTime, server_default=func.now(), server_onupdate=func.now())
-
-
-class DocumentORM(ORMBase):
-    __tablename__ = "document"
-
-    content = Column(JSON, nullable=False)
-    content_type = Column(Text, nullable=True)
-    # primary key in combination with id to allow the same doc in different indices
-    index = Column(String(100), nullable=False, primary_key=True)
-    vector_id = Column(String(100), nullable=True)
-    # speeds up queries for get_documents_by_vector_ids() by having a single query that returns joined metadata
-    meta = relationship("MetaDocumentORM", back_populates="documents", lazy="joined")
-
-    __table_args__ = (UniqueConstraint("index", "vector_id", name="index_vector_id_uc"),)
-
-
-class MetaDocumentORM(ORMBase):
-    __tablename__ = "meta_document"
-
-    name = Column(String(100), index=True)
-    value = Column(ArrayType(1000), index=True)
-    documents = relationship("DocumentORM", back_populates="meta")
-
-    document_id = Column(String(100), nullable=False, index=True)
-    document_index = Column(String(100), nullable=False, index=True)
-    __table_args__ = (  # type: ignore
-        ForeignKeyConstraint(
-            [document_id, document_index], [DocumentORM.id, DocumentORM.index], ondelete="CASCADE", onupdate="CASCADE"
-        ),
-        {},
-    )
-
-
-class LabelORM(ORMBase):
-    __tablename__ = "label"
-
-    index = Column(String(100), nullable=False, primary_key=True)
-    query = Column(Text, nullable=False)
-    answer = Column(JSON, nullable=True)
-    document = Column(JSON, nullable=False)
-    no_answer = Column(Boolean, nullable=False)
-    origin = Column(String(100), nullable=False)
-    is_correct_answer = Column(Boolean, nullable=False)
-    is_correct_document = Column(Boolean, nullable=False)
-    pipeline_id = Column(String(500), nullable=True)
-
-    meta = relationship("MetaLabelORM", back_populates="labels", lazy="joined")
-
-
-class MetaLabelORM(ORMBase):
-    __tablename__ = "meta_label"
-
-    name = Column(String(100), index=True)
-    value = Column(String(1000), index=True)
-    labels = relationship("LabelORM", back_populates="meta")
-
-    label_id = Column(String(100), nullable=False, index=True)
-    label_index = Column(String(100), nullable=False, index=True)
-    __table_args__ = (  # type: ignore
-        ForeignKeyConstraint(
-            [label_id, label_index], [LabelORM.id, LabelORM.index], ondelete="CASCADE", onupdate="CASCADE"
-        ),
-        {},
-    )
 
 
 class SQLDocumentStore(BaseDocumentStore):
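The pattern above — binding every ORM name to `object` when SQLAlchemy is missing, and only building real table definitions in the `else:` branch — keeps module import and `isinstance()` checks from failing when the dependency is absent. A condensed sketch of the same guard, using `importlib.util.find_spec` as a stand-in for Haystack's `is_imported` (runs with or without SQLAlchemy installed):

```python
import importlib.util


def dependency_available(name: str) -> bool:
    # Stand-in for haystack.is_imported(): True when the package is installed.
    return importlib.util.find_spec(name) is not None


if not dependency_available("sqlalchemy"):
    # Placeholders: the names exist, so annotations and isinstance() still work.
    Base = object
    DocumentORM = object
else:
    from sqlalchemy import Column, String
    from sqlalchemy.orm import declarative_base

    Base = declarative_base()

    class DocumentORM(Base):  # real mapped class, only defined when usable
        __tablename__ = "document"
        id = Column(String(100), primary_key=True)
```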
diff --git a/haystack/modeling/model/biadaptive_model.py b/haystack/modeling/model/biadaptive_model.py
index 3d12376fe..1def4ff4a 100644
--- a/haystack/modeling/model/biadaptive_model.py
+++ b/haystack/modeling/model/biadaptive_model.py
@@ -41,7 +41,7 @@ class BiAdaptiveModel(nn.Module):
         language_model2: LanguageModel,
         prediction_heads: List[PredictionHead],
         embeds_dropout_prob: float = 0.1,
-        device: torch.device = torch.device("cuda"),
+        device: Optional[torch.device] = None,
         lm1_output_types: Optional[Union[str, List[str]]] = None,
         lm2_output_types: Optional[Union[str, List[str]]] = None,
         loss_aggregation_fn: Optional[Callable] = None,
@@ -74,6 +74,9 @@
         Note: The loss at this stage is per sample, i.e one tensor of shape (batchsize) per prediction head.
         """
+        if not device:
+            device = torch.device("cuda")
+
         if lm1_output_types is None:
             lm1_output_types = ["per_sequence"]
         if lm2_output_types is None:
diff --git a/haystack/modeling/model/feature_extraction.py b/haystack/modeling/model/feature_extraction.py
index 0cd4024a6..526a48970 100644
--- a/haystack/modeling/model/feature_extraction.py
+++ b/haystack/modeling/model/feature_extraction.py
@@ -29,6 +29,7 @@ from transformers import PreTrainedTokenizer, RobertaTokenizer, AutoConfig, Auto
 from transformers.models.auto.feature_extraction_auto import FEATURE_EXTRACTOR_MAPPING_NAMES
 from transformers.models.auto.tokenization_auto import TOKENIZER_MAPPING_NAMES
 
+from haystack import is_imported
 from haystack.errors import ModelingError
 from haystack.modeling.data_handler.samples import SampleBasket
 
@@ -40,12 +41,16 @@
 logger = logging.getLogger(__name__)
 
 SPECIAL_TOKENIZER_CHARS = r"^(##|Ġ|▁)"
 
+if not is_imported("transformers"):
+    TOKENIZER_MAPPING_NAMES = {}
+    FEATURE_EXTRACTOR_MAPPING_NAMES = {}
+
+
 FEATURE_EXTRACTORS = {
     **{key: AutoTokenizer for key in TOKENIZER_MAPPING_NAMES.keys()},
     **{key: AutoFeatureExtractor for key in FEATURE_EXTRACTOR_MAPPING_NAMES.keys()},
 }
 
-
 DEFAULT_EXTRACTION_PARAMS = {
     AutoTokenizer: {
         "max_length": 256,
diff --git a/haystack/modeling/model/triadaptive_model.py b/haystack/modeling/model/triadaptive_model.py
index 6c649b3c5..20f939257 100644
--- a/haystack/modeling/model/triadaptive_model.py
+++ b/haystack/modeling/model/triadaptive_model.py
@@ -43,7 +43,7 @@ class TriAdaptiveModel(nn.Module):
         language_model3: LanguageModel,
         prediction_heads: List[PredictionHead],
         embeds_dropout_prob: float = 0.1,
-        device: torch.device = torch.device("cuda"),
+        device: Optional[torch.device] = None,
         lm1_output_types: Optional[Union[str, List[str]]] = None,
         lm2_output_types: Optional[Union[str, List[str]]] = None,
         lm3_output_types: Optional[Union[str, List[str]]] = None,
@@ -83,6 +83,9 @@
         Note: The loss at this stage is per sample, i.e one tensor of shape (batchsize) per prediction head.
         """
+        if not device:
+            device = torch.device("cuda")
+
         if lm1_output_types is None:
             lm1_output_types = ["per_sequence"]
         if lm2_output_types is None:
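The `device` changes in both adaptive models fix the same latent problem: a default like `torch.device("cuda")` is evaluated once, when the `def` statement executes at import time, so merely importing the module calls into torch — fatal when torch is one of the modules faked by `generalimport`. Moving the default into the body defers the call until the constructor actually runs. A torch-free sketch of the failure and the fix (`_FakeTorch` is an illustrative stand-in for a fake module):

```python
class _FakeTorch:
    """Stand-in for a generalimport FakeModule: raises on any real use."""

    def device(self, name):
        raise ImportError("torch is not installed")


torch = _FakeTorch()

try:
    # The default expression runs as part of the `def` statement itself,
    # before anyone ever calls the function:
    def broken(device=torch.device("cuda")):
        return device

except ImportError as err:
    print(f"raised at definition time: {err}")


def fixed(device=None):
    # Only touches torch if the caller did not pass a device:
    if not device:
        device = torch.device("cuda")
    return device
```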
diff --git a/haystack/nodes/_json_schema.py b/haystack/nodes/_json_schema.py
index 97cd2a6bd..9a98615de 100644
--- a/haystack/nodes/_json_schema.py
+++ b/haystack/nodes/_json_schema.py
@@ -55,6 +55,10 @@ def field_singleton_schema(
     known_models: TypeModelSet,
 ) -> Tuple[Dict[str, Any], Dict[str, Any], Set[str]]:
     try:
+        # Typing with optional dependencies is really tricky. Let's just use Any for now. To be fixed.
+        if isinstance(field.type_, ForwardRef):
+            logger.debug(field.type_)
+            field.type_ = Any
         return _field_singleton_schema(
             field,
             by_alias=by_alias,
@@ -211,7 +215,10 @@ def create_schema_for_node_class(node_class: Type[BaseComponent]) -> Tuple[Dict[
 
     # Create the model with Pydantic and extract the schema
     model = create_model(f"{node_name}ComponentParams", __config__=Config, **param_fields_kwargs)
-    model.update_forward_refs(**model.__dict__)
+    try:
+        model.update_forward_refs(**model.__dict__)
+    except NameError as exc:
+        logger.debug("%s", str(exc))
     params_schema = model.schema()
 
     # Pydantic v1 patch to generate JSON schemas including Optional fields
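Context for these `_json_schema.py` changes: with optional dependencies, quoted annotations such as `Optional["faiss.swigfaiss.Index"]` reach Pydantic as unresolved `ForwardRef`s, and both schema generation and `update_forward_refs()` can fail on them; the patch downgrades such fields to `Any` and demotes the `NameError` to a debug log. A sketch of the failure mode under Pydantic v1 (`some_pkg` is a deliberately unresolvable, hypothetical name; exact behavior may vary by Pydantic v1 version):

```python
from typing import Optional

from pydantic import BaseModel  # assumes pydantic v1


class Params(BaseModel):
    # Same shape as the quoted faiss/pinecone annotations above:
    index: Optional["some_pkg.Index"] = None  # unresolvable forward reference

    class Config:
        arbitrary_types_allowed = True


try:
    Params.update_forward_refs()  # cannot resolve the name `some_pkg`
except NameError as err:
    print(err)  # e.g. name 'some_pkg' is not defined
```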
diff --git a/haystack/schema.py b/haystack/schema.py
index ce192a849..6b422ef73 100644
--- a/haystack/schema.py
+++ b/haystack/schema.py
@@ -20,7 +20,9 @@ from dataclasses import asdict
 
 import mmh3
 import numpy as np
+from numpy import ndarray
 import pandas as pd
+from pandas import DataFrame
 from pydantic import BaseConfig, Field
 from pydantic.json import pydantic_encoder
@@ -29,12 +31,19 @@
 # See #1598 for the reasons behind this choice & performance considerations
 from pydantic.dataclasses import dataclass
 
+from haystack import is_imported
+
 logger = logging.getLogger(__name__)
 
+if not is_imported("pandas"):
+    DataFrame = object
+
+
 BaseConfig.arbitrary_types_allowed = True
 
+
 #: Types of content_types supported
 ContentTypes = Literal["text", "table", "image", "audio"]
 FilterType = Dict[str, Union[Dict[str, Any], List[Any], str, int, float, bool]]
@@ -43,12 +52,12 @@
 @dataclass
 class Document:
     id: str
-    content: Union[str, pd.DataFrame]
+    content: Union[str, DataFrame]
     content_type: ContentTypes = Field(default="text")
     meta: Dict[str, Any] = Field(default={})
     id_hash_keys: List[str] = Field(default=["content"])
     score: Optional[float] = None
-    embedding: Optional[np.ndarray] = None
+    embedding: Optional[ndarray] = None
 
     # We use a custom init here as we want some custom logic. The annotations above are however still needed in order
     # to use some dataclass magic like "asdict()". See https://www.python.org/dev/peps/pep-0557/#custom-init-method
@@ -56,12 +65,12 @@ class Document:
     # don't need to passed by the user in init and are rather initialized automatically in the init
     def __init__(
         self,
-        content: Union[str, pd.DataFrame],
+        content: Union[str, DataFrame],
         content_type: ContentTypes = "text",
         id: Optional[str] = None,
         score: Optional[float] = None,
         meta: Optional[Dict[str, Any]] = None,
-        embedding: Optional[np.ndarray] = None,
+        embedding: Optional[ndarray] = None,
         id_hash_keys: Optional[List[str]] = None,
     ):
         """
@@ -184,7 +193,7 @@
                 continue
             if k == "content":
                 # Convert pd.DataFrame to list of rows for serialization
-                if self.content_type == "table" and isinstance(self.content, pd.DataFrame):
+                if self.content_type == "table" and isinstance(self.content, DataFrame):
                     v = dataframe_to_list(self.content)
             k = k if k not in inv_field_map else inv_field_map[k]
             _doc[k] = v
@@ -230,7 +239,7 @@
                 k = field_map[k]
             _new_doc[k] = v
 
-        # Convert list of rows to pd.DataFrame
+        # Convert list of rows to DataFrame
         if _new_doc.get("content_type", None) == "table" and isinstance(_new_doc["content"], list):
             _new_doc["content"] = dataframe_from_list(_new_doc["content"])
 
@@ -358,7 +367,7 @@ class Answer:
     answer: str
     type: Literal["generative", "extractive", "other"] = "extractive"
     score: Optional[float] = None
-    context: Optional[Union[str, pd.DataFrame]] = None
+    context: Optional[Union[str, DataFrame]] = None
     offsets_in_document: Optional[Union[List[Span], List[TableCell]]] = None
     offsets_in_context: Optional[Union[List[Span], List[TableCell]]] = None
     document_ids: Optional[List[str]] = None
@@ -832,7 +841,7 @@ def dataframe_from_list(list_df: List[List]) -> pd.DataFrame:
 
 
 class EvaluationResult:
-    def __init__(self, node_results: Optional[Dict[str, pd.DataFrame]] = None) -> None:
+    def __init__(self, node_results: Optional[Dict[str, DataFrame]] = None) -> None:
         """
         A convenience class to store, pass, and interact with results of a pipeline evaluation run (for example `pipeline.eval()`).
         Detailed results are stored as one dataframe per node. This class makes them more accessible and provides
@@ -902,7 +911,7 @@
 
         :param node_results: The evaluation Dataframes per pipeline node.
""" - self.node_results: Dict[str, pd.DataFrame] = {} if node_results is None else node_results + self.node_results: Dict[str, DataFrame] = {} if node_results is None else node_results def __getitem__(self, key: str): return self.node_results.__getitem__(key) @@ -910,7 +919,7 @@ class EvaluationResult: def __delitem__(self, key: str): self.node_results.__delitem__(key) - def __setitem__(self, key: str, value: pd.DataFrame): + def __setitem__(self, key: str, value: DataFrame): self.node_results.__setitem__(key, value) def __contains__(self, key: str): @@ -919,7 +928,7 @@ class EvaluationResult: def __len__(self): return self.node_results.__len__() - def append(self, key: str, value: pd.DataFrame): + def append(self, key: str, value: DataFrame): if value is not None and len(value) > 0: if key in self.node_results: self.node_results[key] = pd.concat([self.node_results[key], value]) @@ -1210,7 +1219,7 @@ class EvaluationResult: def _calculate_node_metrics( self, - df: pd.DataFrame, + df: DataFrame, simulated_top_k_reader: int = -1, simulated_top_k_retriever: int = -1, document_scope: Literal[ @@ -1244,7 +1253,7 @@ class EvaluationResult: return {**answer_metrics, **document_metrics} - def _filter_eval_mode(self, df: pd.DataFrame, eval_mode: str) -> pd.DataFrame: + def _filter_eval_mode(self, df: DataFrame, eval_mode: str) -> DataFrame: if "eval_mode" in df.columns: df = df[df["eval_mode"] == eval_mode] else: @@ -1253,7 +1262,7 @@ class EvaluationResult: def _calculate_answer_metrics( self, - df: pd.DataFrame, + df: DataFrame, simulated_top_k_reader: int = -1, simulated_top_k_retriever: int = -1, answer_scope: Literal["any", "context", "document_id", "document_id_and_context"] = "any", @@ -1275,11 +1284,11 @@ class EvaluationResult: def _build_answer_metrics_df( self, - answers: pd.DataFrame, + answers: DataFrame, simulated_top_k_reader: int = -1, simulated_top_k_retriever: int = -1, answer_scope: Literal["any", "context", "document_id", "document_id_and_context"] = "any", - ) -> pd.DataFrame: + ) -> DataFrame: """ Builds a dataframe containing answer metrics (columns) per multilabel (index). Answer metrics are: @@ -1335,7 +1344,7 @@ class EvaluationResult: } df_records.append(query_metrics) - metrics_df = pd.DataFrame.from_records(df_records, index=multilabel_ids) + metrics_df = DataFrame.from_records(df_records, index=multilabel_ids) return metrics_df def _get_documents_df(self): @@ -1350,7 +1359,7 @@ class EvaluationResult: def _calculate_document_metrics( self, - df: pd.DataFrame, + df: DataFrame, simulated_top_k_retriever: int = -1, document_relevance_criterion: Literal[ "document_id", @@ -1378,7 +1387,7 @@ class EvaluationResult: def _build_document_metrics_df( self, - documents: pd.DataFrame, + documents: DataFrame, simulated_top_k_retriever: int = -1, document_relevance_criterion: Literal[ "document_id", @@ -1391,7 +1400,7 @@ class EvaluationResult: "document_id_and_context_and_answer", "document_id_or_answer", ] = "document_id_or_answer", - ) -> pd.DataFrame: + ) -> DataFrame: """ Builds a dataframe containing document metrics (columns) per pair of query and gold document ids (index). 
diff --git a/haystack/utils/import_utils.py b/haystack/utils/import_utils.py
index fce3dc22c..915fe3446 100644
--- a/haystack/utils/import_utils.py
+++ b/haystack/utils/import_utils.py
@@ -9,6 +9,7 @@ from pathlib import Path
 from typing import Optional, Dict, Union, Tuple, List
 
 import requests
+
 from haystack.errors import DatasetsError
 from haystack.schema import Document
 
diff --git a/haystack/utils/openai_utils.py b/haystack/utils/openai_utils.py
index eccb5676c..b3d0d552c 100644
--- a/haystack/utils/openai_utils.py
+++ b/haystack/utils/openai_utils.py
@@ -6,7 +6,7 @@ import sys
 import json
 from typing import Dict, Union, Tuple, Optional, List
 import requests
-from tenacity import retry, retry_if_exception_type, wait_exponential, stop_after_attempt
+import tenacity
 from transformers import GPT2TokenizerFast
 
 from haystack.errors import OpenAIError, OpenAIRateLimitError, OpenAIUnauthorizedError
@@ -127,10 +127,10 @@ def _openai_text_completion_tokenization_details(model_name: str):
     return tokenizer_name, max_tokens_limit
 
 
-@retry(
-    retry=retry_if_exception_type(OpenAIRateLimitError),
-    wait=wait_exponential(multiplier=OPENAI_BACKOFF),
-    stop=stop_after_attempt(OPENAI_MAX_RETRIES),
+@tenacity.retry(
+    retry=tenacity.retry_if_exception_type(OpenAIRateLimitError),
+    wait=tenacity.wait_exponential(multiplier=OPENAI_BACKOFF),
+    stop=tenacity.stop_after_attempt(OPENAI_MAX_RETRIES),
 )
 def openai_request(
     url: str,
diff --git a/haystack/utils/squad_data.py b/haystack/utils/squad_data.py
index 80c0e67dc..80ee9ac45 100644
--- a/haystack/utils/squad_data.py
+++ b/haystack/utils/squad_data.py
@@ -7,6 +7,7 @@ import pandas as pd
 from tqdm.auto import tqdm
 import mmh3
 
+from haystack import is_imported
 from haystack.schema import Document, Label, Answer
 from haystack.modeling.data_handler.processor import _read_squad_file
 
@@ -14,7 +15,8 @@
 
 logger = logging.getLogger(__name__)
 
-tqdm.pandas()
+if is_imported("pandas") and is_imported("tqdm"):
+    tqdm.pandas()
 
 COLUMN_NAMES = ["title", "context", "question", "id", "answer_text", "answer_start", "is_impossible"]
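Two details in this batch. In `openai_utils.py`, the namespaced `import tenacity` replaces the `from tenacity import ...` form, so every attribute lookup on the module now happens through a single name at the decoration site. In `squad_data.py`, `tqdm.pandas()` monkey-patches pandas at import time and therefore needs the `is_imported` guard; when both libraries are real, it registers `progress_apply` as shown in this sketch (assumes pandas and tqdm are installed):

```python
import pandas as pd
from tqdm.auto import tqdm

tqdm.pandas()  # registers DataFrame.progress_apply / Series.progress_apply

df = pd.DataFrame({"n": range(5)})
squares = df["n"].progress_apply(lambda x: x * x)  # .apply() with a progress bar
print(squares.tolist())  # [0, 1, 4, 9, 16]
```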
"scikit-learn>=1.0.0", # TF-IDF, SklearnQueryClassifier and metrics + "generalimport", # Optional imports # Utils "dill", # pickle extension for (de-)serialization