Mirror of https://github.com/deepset-ai/haystack.git (synced 2025-09-15 19:23:25 +00:00)

feat: introduce generalimport (#4662)
* introduce generalimport
* pylint
* fix optional deps typing for schema
* leftover
* typo
* typing with faiss
* make Base generation optional too
* handle sqlalchemy
* (almost) all imports are optional
* TO REMOVE: hijacking CI for tests
* some deps are actually needed
* get feature branch in CI
* get feature branch in CI
* fix array_equal
* pylint
* pandas also required
* improve imports.yml
* fix SquadData
* fix SquadData again
* generalimport imports list
* Update haystack/utils/openai_utils.py (Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com>)
* Update haystack/utils/openai_utils.py (Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com>)
* review feedback
* remove todos
* reference main release
* pylint
* circular import
* review feedback
* move is_imported in init
* pylint

---------

Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com>
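The practical effect of the change, sketched below as hypothetical usage: Haystack installed with no extras should import cleanly, and a missing optional dependency should only surface as an error when it is actually used (the exception comes from generalimport, see the diff of Haystack's __init__ further down).

# Rough sketch of the intended behavior with no extras installed; not output
# from this PR's CI run. Exact error type and message come from generalimport.
import haystack                        # succeeds even if faiss & co. are absent

from haystack import is_imported
print(is_imported("faiss"))            # False when generalimport had to mock faiss

try:
    from haystack.document_stores import FAISSDocumentStore  # no longer fails here...
    store = FAISSDocumentStore()                              # ...but when the store is built
except Exception as exc:
    print(type(exc).__name__, exc)                            # e.g. MissingOptionalDependency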
This commit is contained in:
parent
5b2ef2afd6
commit
28260c5c3f
.github/workflows/imports.yml (vendored)
@@ -37,5 +37,5 @@ jobs:
      - name: Install Haystack with no extras
        run: pip install .

-     - name: Try to import
+     - name: Import Haystack
        run: python -c 'import haystack'
@@ -1,21 +1,94 @@
-# pylint: disable=wrong-import-position,wrong-import-order
+# pylint: disable=wrong-import-position
-# Logging is not configured here on purpose, see https://github.com/deepset-ai/haystack/issues/2485

from typing import Union
from types import ModuleType

try:
+    import sys
    from importlib import metadata
except (ModuleNotFoundError, ImportError):
    # Python <= 3.7
    import importlib_metadata as metadata  # type: ignore

__version__: str = str(metadata.version("farm-haystack"))

+from generalimport import generalimport, MissingOptionalDependency, FakeModule

+# Logging is not configured here on purpose, see https://github.com/deepset-ai/haystack/issues/2485
import logging
+generalimport(
+    # "pydantic",  # Required for all dataclasses
+    # "tenacity",  # Probably needed because it's a decorator, to be evaluated
+    # "pandas",
+    "aiorwlock",
+    "azure",
+    "beautifulsoup4",
+    "beir",
+    "boilerpy3",
+    "canals",
+    "dill",
+    "docx",
+    "elasticsearch",
+    "events",
+    "faiss",
+    "fitz",
+    "frontmatter",
+    "huggingface_hub",
+    "jsonschema",
+    "langdetect",
+    "magic",
+    "markdown",
+    "mlflow",
+    "mmh3",
+    "more_itertools",
+    "networkx",
+    "nltk",
+    "numpy",
+    "onnxruntime",
+    "onnxruntime_tools",
+    "opensearchpy",
+    "pdf2image",
+    "PIL",
+    "pinecone",
+    "posthog",
+    "protobuf",
+    "psycopg2",
+    "pymilvus",
+    "pytesseract",
+    "quantulum3",
+    "rank_bm25",
+    "rapidfuzz",
+    "ray",
+    "rdflib",
+    "requests",
+    "scipy",
+    "selenium",
+    "sentence_transformers",
+    "seqeval",
+    "sklearn",
+    "SPARQLWrapper",
+    "sqlalchemy",
+    "sseclient",
+    "tenacity",
+    "tika",
+    "tiktoken",
+    "tokenizers",
+    "torch",
+    "tqdm",
+    "transformers",
+    "weaviate",
+    "webdriver_manager",
+    "whisper",
+    "yaml",
+)


+# TODO: remove this function once this PR is merged and released by generalimport:
+# https://github.com/ManderaGeneral/generalimport/pull/25
+def is_imported(module_name: str) -> bool:
+    """
+    Returns True if the module was actually imported, False, if generalimport mocked it.
+    """
+    module = sys.modules.get(module_name)
+    try:
+        return bool(module) and not isinstance(module, FakeModule)
+    except MissingOptionalDependency:
+        # isinstance() raises MissingOptionalDependency: fake module
+        pass
+    return False

import pandas as pd

from haystack.schema import Document, Answer, Label, MultiLabel, Span, EvaluationResult, TableCell
from haystack.nodes.base import BaseComponent
@@ -23,5 +96,6 @@ from haystack.pipelines.base import Pipeline
from haystack.environment import set_pytorch_secure_model_loading


pd.options.display.max_colwidth = 80
# Enables torch's secure model loading through setting an env var.
# Does not use torch.
set_pytorch_secure_model_loading()
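A small, hedged usage sketch for the new helper: per its docstring, is_imported() reports whether a module was really imported, not merely whether it is installed, so it is mainly useful for guarding module-level fallbacks like the ones in the diffs further down.

from haystack import is_imported

# Check which optional backends ended up as real modules in this process;
# the names are taken from the generalimport(...) list above.
for name in ("torch", "faiss", "elasticsearch"):
    print(name, "imported" if is_imported(name) else "mocked or not imported")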
@@ -46,7 +46,7 @@ class FAISSDocumentStore(SQLDocumentStore):
        vector_dim: Optional[int] = None,
        embedding_dim: int = 768,
        faiss_index_factory_str: str = "Flat",
-       faiss_index: Optional[faiss.swigfaiss.Index] = None,
+       faiss_index: Optional["faiss.swigfaiss.Index"] = None,
        return_embedding: bool = False,
        index: str = "document",
        similarity: str = "dot_product",
@@ -58,7 +58,7 @@ class PineconeDocumentStore(BaseDocumentStore):
        self,
        api_key: str,
        environment: str = "us-west1-gcp",
-       pinecone_index: Optional[pinecone.Index] = None,
+       pinecone_index: Optional["pinecone.Index"] = None,
        embedding_dim: int = 768,
        return_embedding: bool = False,
        index: str = "document",
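The only change in these two hunks is quoting the type annotations. The motivation is, presumably, the usual one for string annotations: an unquoted annotation such as faiss.swigfaiss.Index is evaluated when the def statement runs, i.e. at import time, whereas the quoted form is stored as a plain string and only resolved if something explicitly asks for the type hints. A minimal sketch with "optmod" as a hypothetical optional package:

from typing import Optional

class Store:
    # "optmod.Index" is never evaluated at class-definition time, so this file
    # can be imported even though optmod is not installed at all.
    def __init__(self, index: Optional["optmod.Index"] = None):
        self.index = index

store = Store()  # fine; only typing.get_type_hints(Store.__init__) would complain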
@@ -27,19 +27,30 @@ try:
    from sqlalchemy.ext.declarative import declarative_base
    from sqlalchemy.orm import relationship, sessionmaker, aliased
    from sqlalchemy.sql import case, null

except (ImportError, ModuleNotFoundError) as ie:
    from haystack.utils.import_utils import _optional_component_not_installed

    _optional_component_not_installed(__name__, "sql", ie)


+from haystack import is_imported
from haystack.schema import Document, Label, Answer
from haystack.document_stores.base import BaseDocumentStore, FilterType
from haystack.document_stores.filter_utils import LogicalFilterClause


logger = logging.getLogger(__name__)
-Base = declarative_base()  # type: Any
+if not is_imported("sqlalchemy"):
+    Base = object
+    ArrayType = object
+    ORMBase = object
+    DocumentORM = object
+    MetaDocumentORM = object
+    LabelORM = object
+    MetaLabelORM = object

+else:
+    Base = declarative_base()  # type: Any

class ArrayType(TypeDecorator):
    impl = String
@@ -53,7 +64,6 @@ class ArrayType(TypeDecorator):
            return json.loads(value)
        return value


class ORMBase(Base):
    __abstract__ = True

@@ -61,7 +71,6 @@ class ORMBase(Base):
    created_at = Column(DateTime, server_default=func.now())
    updated_at = Column(DateTime, server_default=func.now(), server_onupdate=func.now())


class DocumentORM(ORMBase):
    __tablename__ = "document"

@@ -75,7 +84,6 @@ class DocumentORM(ORMBase):

    __table_args__ = (UniqueConstraint("index", "vector_id", name="index_vector_id_uc"),)


class MetaDocumentORM(ORMBase):
    __tablename__ = "meta_document"

@@ -87,12 +95,14 @@ class MetaDocumentORM(ORMBase):
    document_index = Column(String(100), nullable=False, index=True)
    __table_args__ = (  # type: ignore
        ForeignKeyConstraint(
-           [document_id, document_index], [DocumentORM.id, DocumentORM.index], ondelete="CASCADE", onupdate="CASCADE"
+           [document_id, document_index],
+           [DocumentORM.id, DocumentORM.index],
+           ondelete="CASCADE",
+           onupdate="CASCADE",
        ),
        {},
    )


class LabelORM(ORMBase):
    __tablename__ = "label"

@@ -108,7 +118,6 @@ class LabelORM(ORMBase):

    meta = relationship("MetaLabelORM", back_populates="labels", lazy="joined")


class MetaLabelORM(ORMBase):
    __tablename__ = "meta_label"

@@ -126,6 +135,9 @@ class MetaLabelORM(ORMBase):
    )


logger = logging.getLogger(__name__)


class SQLDocumentStore(BaseDocumentStore):
    def __init__(
        self,
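The if/else above keeps the module importable when sqlalchemy is only available as a generalimport mock, by pointing the declarative base (and the ORM names) at plain object. A minimal sketch of the idiom, with sqlalchemy_available standing in for is_imported("sqlalchemy"):

sqlalchemy_available = False  # pretend the optional dependency is missing

if not sqlalchemy_available:
    Base = object  # placeholder base; never used for real ORM work
else:
    from sqlalchemy.ext.declarative import declarative_base
    Base = declarative_base()

class ORMBase(Base):  # the class statement stays valid either way
    __abstract__ = True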
@@ -41,7 +41,7 @@ class BiAdaptiveModel(nn.Module):
        language_model2: LanguageModel,
        prediction_heads: List[PredictionHead],
        embeds_dropout_prob: float = 0.1,
-       device: torch.device = torch.device("cuda"),
+       device: Optional[torch.device] = None,
        lm1_output_types: Optional[Union[str, List[str]]] = None,
        lm2_output_types: Optional[Union[str, List[str]]] = None,
        loss_aggregation_fn: Optional[Callable] = None,
@@ -74,6 +74,9 @@ class BiAdaptiveModel(nn.Module):
        Note: The loss at this stage is per sample, i.e one tensor of
        shape (batchsize) per prediction head.
        """
+       if not device:
+           device = torch.device("cuda")

        if lm1_output_types is None:
            lm1_output_types = ["per_sequence"]
        if lm2_output_types is None:
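The device change (here and in TriAdaptiveModel below) leans on a plain Python fact: a default argument expression is evaluated once, when the def statement runs at import time, while the function body only runs when it is called. Moving torch.device("cuda") into the body therefore defers it until a model is actually constructed, presumably also keeping it away from a mocked torch at import time. A small sketch, assuming torch is importable when the function is finally called:

from typing import Optional
import torch  # assumed available at call time for this sketch

def resolve_device(device: Optional[torch.device] = None) -> torch.device:
    # Evaluated per call rather than once at import, and CUDA is only the
    # default when the caller passed nothing.
    if not device:
        device = torch.device("cuda")
    return device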
@@ -29,6 +29,7 @@ from transformers import PreTrainedTokenizer, RobertaTokenizer, AutoConfig, Auto
from transformers.models.auto.feature_extraction_auto import FEATURE_EXTRACTOR_MAPPING_NAMES
from transformers.models.auto.tokenization_auto import TOKENIZER_MAPPING_NAMES

+from haystack import is_imported
from haystack.errors import ModelingError
from haystack.modeling.data_handler.samples import SampleBasket

@@ -40,12 +41,16 @@ logger = logging.getLogger(__name__)
SPECIAL_TOKENIZER_CHARS = r"^(##|Ġ|▁)"


+if not is_imported("transformers"):
+    TOKENIZER_MAPPING_NAMES = {}
+    FEATURE_EXTRACTOR_MAPPING_NAMES = {}


FEATURE_EXTRACTORS = {
    **{key: AutoTokenizer for key in TOKENIZER_MAPPING_NAMES.keys()},
    **{key: AutoFeatureExtractor for key in FEATURE_EXTRACTOR_MAPPING_NAMES.keys()},
}


DEFAULT_EXTRACTION_PARAMS = {
    AutoTokenizer: {
        "max_length": 256,
@@ -43,7 +43,7 @@ class TriAdaptiveModel(nn.Module):
        language_model3: LanguageModel,
        prediction_heads: List[PredictionHead],
        embeds_dropout_prob: float = 0.1,
-       device: torch.device = torch.device("cuda"),
+       device: Optional[torch.device] = None,
        lm1_output_types: Optional[Union[str, List[str]]] = None,
        lm2_output_types: Optional[Union[str, List[str]]] = None,
        lm3_output_types: Optional[Union[str, List[str]]] = None,
@@ -83,6 +83,9 @@ class TriAdaptiveModel(nn.Module):
        Note: The loss at this stage is per sample, i.e one tensor of
        shape (batchsize) per prediction head.
        """
+       if not device:
+           device = torch.device("cuda")

        if lm1_output_types is None:
            lm1_output_types = ["per_sequence"]
        if lm2_output_types is None:
@@ -55,6 +55,10 @@ def field_singleton_schema(
    known_models: TypeModelSet,
) -> Tuple[Dict[str, Any], Dict[str, Any], Set[str]]:
    try:
+       # Typing with optional dependencies is really tricky. Let's just use Any for now. To be fixed.
+       if isinstance(field.type_, ForwardRef):
+           logger.debug(field.type_)
+           field.type_ = Any
        return _field_singleton_schema(
            field,
            by_alias=by_alias,
@@ -211,7 +215,10 @@ def create_schema_for_node_class(node_class: Type[BaseComponent]) -> Tuple[Dict[

    # Create the model with Pydantic and extract the schema
    model = create_model(f"{node_name}ComponentParams", __config__=Config, **param_fields_kwargs)
+   try:
        model.update_forward_refs(**model.__dict__)
+   except NameError as exc:
+       logger.debug("%s", str(exc))
    params_schema = model.schema()

    # Pydantic v1 patch to generate JSON schemas including Optional fields
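The two fallbacks above deal with annotations that point at optional packages: a ForwardRef that cannot be resolved is downgraded to Any before the JSON schema is generated, and a NameError from update_forward_refs() is logged instead of raised. A hedged sketch of the first fallback with pydantic v1 (the model and field names are illustrative only, not Haystack's):

from typing import Any, ForwardRef
from pydantic import create_model

annotation: Any = ForwardRef("faiss.swigfaiss.Index")  # refers to an optional package
if isinstance(annotation, ForwardRef):
    annotation = Any  # the same downgrade as in the diff

ExampleParams = create_model("ExampleParams", faiss_index=(annotation, None))
print(ExampleParams.schema())  # succeeds; the field is simply typed as Any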
@@ -20,7 +20,9 @@ from dataclasses import asdict

import mmh3
import numpy as np
+from numpy import ndarray
import pandas as pd
+from pandas import DataFrame

from pydantic import BaseConfig, Field
from pydantic.json import pydantic_encoder
@@ -29,12 +31,19 @@ from pydantic.json import pydantic_encoder
# See #1598 for the reasons behind this choice & performance considerations
from pydantic.dataclasses import dataclass

+from haystack import is_imported


logger = logging.getLogger(__name__)


+if not is_imported("pandas"):
+    DataFrame = object


BaseConfig.arbitrary_types_allowed = True


#: Types of content_types supported
ContentTypes = Literal["text", "table", "image", "audio"]
FilterType = Dict[str, Union[Dict[str, Any], List[Any], str, int, float, bool]]
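Importing DataFrame and ndarray by name (instead of writing pd.DataFrame / np.ndarray in the annotations) gives the module a single symbol it can rebind to object when pandas was mocked, so the dataclasses below stay definable and the isinstance() checks stay valid. A hedged sketch of the idiom, with pandas_available standing in for is_imported("pandas"):

pandas_available = False  # pretend pandas is only a mocked optional dependency

if pandas_available:
    from pandas import DataFrame
else:
    DataFrame = object

def is_table(content, content_type: str) -> bool:
    # Mirrors the check in Document.to_dict(): with DataFrame == object the
    # isinstance() test is vacuously true, so content_type carries the logic.
    return content_type == "table" and isinstance(content, DataFrame)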
@@ -43,12 +52,12 @@ FilterType = Dict[str, Union[Dict[str, Any], List[Any], str, int, float, bool]]
@dataclass
class Document:
    id: str
-   content: Union[str, pd.DataFrame]
+   content: Union[str, DataFrame]
    content_type: ContentTypes = Field(default="text")
    meta: Dict[str, Any] = Field(default={})
    id_hash_keys: List[str] = Field(default=["content"])
    score: Optional[float] = None
-   embedding: Optional[np.ndarray] = None
+   embedding: Optional[ndarray] = None

    # We use a custom init here as we want some custom logic. The annotations above are however still needed in order
    # to use some dataclass magic like "asdict()". See https://www.python.org/dev/peps/pep-0557/#custom-init-method
@@ -56,12 +65,12 @@ class Document:
    # don't need to passed by the user in init and are rather initialized automatically in the init
    def __init__(
        self,
-       content: Union[str, pd.DataFrame],
+       content: Union[str, DataFrame],
        content_type: ContentTypes = "text",
        id: Optional[str] = None,
        score: Optional[float] = None,
        meta: Optional[Dict[str, Any]] = None,
-       embedding: Optional[np.ndarray] = None,
+       embedding: Optional[ndarray] = None,
        id_hash_keys: Optional[List[str]] = None,
    ):
        """
@@ -184,7 +193,7 @@ class Document:
                continue
            if k == "content":
                # Convert pd.DataFrame to list of rows for serialization
-               if self.content_type == "table" and isinstance(self.content, pd.DataFrame):
+               if self.content_type == "table" and isinstance(self.content, DataFrame):
                    v = dataframe_to_list(self.content)
            k = k if k not in inv_field_map else inv_field_map[k]
            _doc[k] = v
@@ -230,7 +239,7 @@ class Document:
                k = field_map[k]
            _new_doc[k] = v

-       # Convert list of rows to pd.DataFrame
+       # Convert list of rows to DataFrame
        if _new_doc.get("content_type", None) == "table" and isinstance(_new_doc["content"], list):
            _new_doc["content"] = dataframe_from_list(_new_doc["content"])
@@ -358,7 +367,7 @@ class Answer:
    answer: str
    type: Literal["generative", "extractive", "other"] = "extractive"
    score: Optional[float] = None
-   context: Optional[Union[str, pd.DataFrame]] = None
+   context: Optional[Union[str, DataFrame]] = None
    offsets_in_document: Optional[Union[List[Span], List[TableCell]]] = None
    offsets_in_context: Optional[Union[List[Span], List[TableCell]]] = None
    document_ids: Optional[List[str]] = None
@@ -832,7 +841,7 @@ def dataframe_from_list(list_df: List[List]) -> pd.DataFrame:


class EvaluationResult:
-   def __init__(self, node_results: Optional[Dict[str, pd.DataFrame]] = None) -> None:
+   def __init__(self, node_results: Optional[Dict[str, DataFrame]] = None) -> None:
        """
        A convenience class to store, pass, and interact with results of a pipeline evaluation run (for example `pipeline.eval()`).
        Detailed results are stored as one dataframe per node. This class makes them more accessible and provides
@@ -902,7 +911,7 @@ class EvaluationResult:

        :param node_results: The evaluation Dataframes per pipeline node.
        """
-       self.node_results: Dict[str, pd.DataFrame] = {} if node_results is None else node_results
+       self.node_results: Dict[str, DataFrame] = {} if node_results is None else node_results

    def __getitem__(self, key: str):
        return self.node_results.__getitem__(key)
@@ -910,7 +919,7 @@ class EvaluationResult:
    def __delitem__(self, key: str):
        self.node_results.__delitem__(key)

-   def __setitem__(self, key: str, value: pd.DataFrame):
+   def __setitem__(self, key: str, value: DataFrame):
        self.node_results.__setitem__(key, value)

    def __contains__(self, key: str):
@@ -919,7 +928,7 @@ class EvaluationResult:
    def __len__(self):
        return self.node_results.__len__()

-   def append(self, key: str, value: pd.DataFrame):
+   def append(self, key: str, value: DataFrame):
        if value is not None and len(value) > 0:
            if key in self.node_results:
                self.node_results[key] = pd.concat([self.node_results[key], value])
@@ -1210,7 +1219,7 @@ class EvaluationResult:

    def _calculate_node_metrics(
        self,
-       df: pd.DataFrame,
+       df: DataFrame,
        simulated_top_k_reader: int = -1,
        simulated_top_k_retriever: int = -1,
        document_scope: Literal[
@@ -1244,7 +1253,7 @@ class EvaluationResult:

        return {**answer_metrics, **document_metrics}

-   def _filter_eval_mode(self, df: pd.DataFrame, eval_mode: str) -> pd.DataFrame:
+   def _filter_eval_mode(self, df: DataFrame, eval_mode: str) -> DataFrame:
        if "eval_mode" in df.columns:
            df = df[df["eval_mode"] == eval_mode]
        else:
@@ -1253,7 +1262,7 @@ class EvaluationResult:

    def _calculate_answer_metrics(
        self,
-       df: pd.DataFrame,
+       df: DataFrame,
        simulated_top_k_reader: int = -1,
        simulated_top_k_retriever: int = -1,
        answer_scope: Literal["any", "context", "document_id", "document_id_and_context"] = "any",
@@ -1275,11 +1284,11 @@ class EvaluationResult:

    def _build_answer_metrics_df(
        self,
-       answers: pd.DataFrame,
+       answers: DataFrame,
        simulated_top_k_reader: int = -1,
        simulated_top_k_retriever: int = -1,
        answer_scope: Literal["any", "context", "document_id", "document_id_and_context"] = "any",
-   ) -> pd.DataFrame:
+   ) -> DataFrame:
        """
        Builds a dataframe containing answer metrics (columns) per multilabel (index).
        Answer metrics are:
@@ -1335,7 +1344,7 @@ class EvaluationResult:
            }
            df_records.append(query_metrics)

-       metrics_df = pd.DataFrame.from_records(df_records, index=multilabel_ids)
+       metrics_df = DataFrame.from_records(df_records, index=multilabel_ids)
        return metrics_df

    def _get_documents_df(self):
@@ -1350,7 +1359,7 @@ class EvaluationResult:

    def _calculate_document_metrics(
        self,
-       df: pd.DataFrame,
+       df: DataFrame,
        simulated_top_k_retriever: int = -1,
        document_relevance_criterion: Literal[
            "document_id",
@@ -1378,7 +1387,7 @@ class EvaluationResult:

    def _build_document_metrics_df(
        self,
-       documents: pd.DataFrame,
+       documents: DataFrame,
        simulated_top_k_retriever: int = -1,
        document_relevance_criterion: Literal[
            "document_id",
@@ -1391,7 +1400,7 @@ class EvaluationResult:
            "document_id_and_context_and_answer",
            "document_id_or_answer",
        ] = "document_id_or_answer",
-   ) -> pd.DataFrame:
+   ) -> DataFrame:
        """
        Builds a dataframe containing document metrics (columns) per pair of query and gold document ids (index).
        Document metrics are:
@@ -1539,7 +1548,7 @@ class EvaluationResult:
            }
        )

-       metrics_df = pd.DataFrame.from_records(metrics, index=documents["multilabel_id"].unique())
+       metrics_df = DataFrame.from_records(metrics, index=documents["multilabel_id"].unique())
        return metrics_df

    def save(self, out_dir: Union[str, Path], **to_csv_kwargs):
@@ -1548,8 +1557,8 @@ class EvaluationResult:
        The result of each node is saved in a separate csv with file name {node_name}.csv to the out_dir folder.

        :param out_dir: Path to the target folder the csvs will be saved.
-       :param to_csv_kwargs: kwargs to be passed to pd.DataFrame.to_csv(). See https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_csv.html.
-           This method uses different default values than pd.DataFrame.to_csv() for the following parameters:
+       :param to_csv_kwargs: kwargs to be passed to DataFrame.to_csv(). See https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_csv.html.
+           This method uses different default values than DataFrame.to_csv() for the following parameters:
            index=False, quoting=csv.QUOTE_NONNUMERIC (to avoid problems with \r chars)
        """
        out_dir = out_dir if isinstance(out_dir, Path) else Path(out_dir)
@@ -9,6 +9,7 @@ from pathlib import Path
from typing import Optional, Dict, Union, Tuple, List

import requests

from haystack.errors import DatasetsError
from haystack.schema import Document
@@ -6,7 +6,7 @@ import sys
import json
from typing import Dict, Union, Tuple, Optional, List
import requests
-from tenacity import retry, retry_if_exception_type, wait_exponential, stop_after_attempt
+import tenacity
from transformers import GPT2TokenizerFast

from haystack.errors import OpenAIError, OpenAIRateLimitError, OpenAIUnauthorizedError
@@ -127,10 +127,10 @@ def _openai_text_completion_tokenization_details(model_name: str):
    return tokenizer_name, max_tokens_limit


-@retry(
-   retry=retry_if_exception_type(OpenAIRateLimitError),
-   wait=wait_exponential(multiplier=OPENAI_BACKOFF),
-   stop=stop_after_attempt(OPENAI_MAX_RETRIES),
+@tenacity.retry(
+   retry=tenacity.retry_if_exception_type(OpenAIRateLimitError),
+   wait=tenacity.wait_exponential(multiplier=OPENAI_BACKOFF),
+   stop=tenacity.stop_after_attempt(OPENAI_MAX_RETRIES),
)
def openai_request(
    url: str,
@@ -7,6 +7,7 @@ import pandas as pd
from tqdm.auto import tqdm
import mmh3

+from haystack import is_imported
from haystack.schema import Document, Label, Answer
from haystack.modeling.data_handler.processor import _read_squad_file

@@ -14,6 +15,7 @@ from haystack.modeling.data_handler.processor import _read_squad_file
logger = logging.getLogger(__name__)


+if is_imported("pandas") and is_imported("tqdm"):
    tqdm.pandas()
@@ -53,6 +53,7 @@ dependencies = [
  "pandas",
  "rank_bm25",
  "scikit-learn>=1.0.0", # TF-IDF, SklearnQueryClassifier and metrics
+ "generalimport", # Optional imports

  # Utils
  "dill", # pickle extension for (de-)serialization