mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-08-31 20:03:38 +00:00
Speed up Document dataclass import (#6767)
This commit is contained in:
parent
1c76aa07bb
commit
8079501925
@ -1,11 +1,11 @@
|
||||
import io
|
||||
import hashlib
|
||||
import io
|
||||
import logging
|
||||
from dataclasses import asdict, dataclass, field, fields
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
import numpy
|
||||
import pandas
|
||||
from numpy import ndarray
|
||||
from pandas import DataFrame, read_json
|
||||
|
||||
from haystack.dataclasses.byte_stream import ByteStream
|
||||
|
||||
@ -24,7 +24,7 @@ class _BackwardCompatible(type):
|
||||
"""
|
||||
# Move `content` to new fields depending on the type
|
||||
content = kwargs.get("content")
|
||||
if isinstance(content, pandas.DataFrame):
|
||||
if isinstance(content, DataFrame):
|
||||
kwargs["dataframe"] = content
|
||||
del kwargs["content"]
|
||||
|
||||
@ -33,7 +33,7 @@ class _BackwardCompatible(type):
|
||||
del kwargs["content_type"]
|
||||
|
||||
# Embedding were stored as NumPy arrays in 1.x, so we convert it to the new type
|
||||
if isinstance(embedding := kwargs.get("embedding"), numpy.ndarray):
|
||||
if isinstance(embedding := kwargs.get("embedding"), ndarray):
|
||||
kwargs["embedding"] = embedding.tolist()
|
||||
|
||||
# id_hash_keys is not used anymore
|
||||
@ -61,7 +61,7 @@ class Document(metaclass=_BackwardCompatible):
|
||||
|
||||
id: str = field(default="")
|
||||
content: Optional[str] = field(default=None)
|
||||
dataframe: Optional[pandas.DataFrame] = field(default=None)
|
||||
dataframe: Optional[DataFrame] = field(default=None)
|
||||
blob: Optional[ByteStream] = field(default=None)
|
||||
meta: Dict[str, Any] = field(default_factory=dict)
|
||||
score: Optional[float] = field(default=None)
|
||||
@ -141,7 +141,7 @@ class Document(metaclass=_BackwardCompatible):
|
||||
`dataframe` and `blob` fields are converted to their original types.
|
||||
"""
|
||||
if (dataframe := data.get("dataframe")) is not None:
|
||||
data["dataframe"] = pandas.read_json(io.StringIO(dataframe))
|
||||
data["dataframe"] = read_json(io.StringIO(dataframe))
|
||||
if blob := data.get("blob"):
|
||||
data["blob"] = ByteStream(data=bytes(blob["data"]), mime_type=blob["mime_type"])
|
||||
# Store metadata for a moment while we try un-flattening allegedly flatten metadata.
|
||||
|
7
releasenotes/notes/speedup-import-b542f7a8323ef376.yaml
Normal file
7
releasenotes/notes/speedup-import-b542f7a8323ef376.yaml
Normal file
@ -0,0 +1,7 @@
|
||||
---
|
||||
prelude: >
|
||||
enhancements:
|
||||
- |
|
||||
Speed up import of Document dataclass.
|
||||
Importing Document was slowed down because we were importing the whole `pandas` and `numpy` packages.
|
||||
This has now been changed to import only the necessary classes and functions.
|
Loading…
x
Reference in New Issue
Block a user