mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-09-01 20:33:48 +00:00
Speed up Document dataclass import (#6767)
This commit is contained in:
parent
1c76aa07bb
commit
8079501925
@ -1,11 +1,11 @@
|
|||||||
import io
|
|
||||||
import hashlib
|
import hashlib
|
||||||
|
import io
|
||||||
import logging
|
import logging
|
||||||
from dataclasses import asdict, dataclass, field, fields
|
from dataclasses import asdict, dataclass, field, fields
|
||||||
from typing import Any, Dict, List, Optional
|
from typing import Any, Dict, List, Optional
|
||||||
|
|
||||||
import numpy
|
from numpy import ndarray
|
||||||
import pandas
|
from pandas import DataFrame, read_json
|
||||||
|
|
||||||
from haystack.dataclasses.byte_stream import ByteStream
|
from haystack.dataclasses.byte_stream import ByteStream
|
||||||
|
|
||||||
@ -24,7 +24,7 @@ class _BackwardCompatible(type):
|
|||||||
"""
|
"""
|
||||||
# Move `content` to new fields depending on the type
|
# Move `content` to new fields depending on the type
|
||||||
content = kwargs.get("content")
|
content = kwargs.get("content")
|
||||||
if isinstance(content, pandas.DataFrame):
|
if isinstance(content, DataFrame):
|
||||||
kwargs["dataframe"] = content
|
kwargs["dataframe"] = content
|
||||||
del kwargs["content"]
|
del kwargs["content"]
|
||||||
|
|
||||||
@ -33,7 +33,7 @@ class _BackwardCompatible(type):
|
|||||||
del kwargs["content_type"]
|
del kwargs["content_type"]
|
||||||
|
|
||||||
# Embeddings were stored as NumPy arrays in 1.x, so we convert them to the new type
|
# Embeddings were stored as NumPy arrays in 1.x, so we convert them to the new type
|
||||||
if isinstance(embedding := kwargs.get("embedding"), numpy.ndarray):
|
if isinstance(embedding := kwargs.get("embedding"), ndarray):
|
||||||
kwargs["embedding"] = embedding.tolist()
|
kwargs["embedding"] = embedding.tolist()
|
||||||
|
|
||||||
# id_hash_keys is not used anymore
|
# id_hash_keys is not used anymore
|
||||||
@ -61,7 +61,7 @@ class Document(metaclass=_BackwardCompatible):
|
|||||||
|
|
||||||
id: str = field(default="")
|
id: str = field(default="")
|
||||||
content: Optional[str] = field(default=None)
|
content: Optional[str] = field(default=None)
|
||||||
dataframe: Optional[pandas.DataFrame] = field(default=None)
|
dataframe: Optional[DataFrame] = field(default=None)
|
||||||
blob: Optional[ByteStream] = field(default=None)
|
blob: Optional[ByteStream] = field(default=None)
|
||||||
meta: Dict[str, Any] = field(default_factory=dict)
|
meta: Dict[str, Any] = field(default_factory=dict)
|
||||||
score: Optional[float] = field(default=None)
|
score: Optional[float] = field(default=None)
|
||||||
@ -141,7 +141,7 @@ class Document(metaclass=_BackwardCompatible):
|
|||||||
`dataframe` and `blob` fields are converted to their original types.
|
`dataframe` and `blob` fields are converted to their original types.
|
||||||
"""
|
"""
|
||||||
if (dataframe := data.get("dataframe")) is not None:
|
if (dataframe := data.get("dataframe")) is not None:
|
||||||
data["dataframe"] = pandas.read_json(io.StringIO(dataframe))
|
data["dataframe"] = read_json(io.StringIO(dataframe))
|
||||||
if blob := data.get("blob"):
|
if blob := data.get("blob"):
|
||||||
data["blob"] = ByteStream(data=bytes(blob["data"]), mime_type=blob["mime_type"])
|
data["blob"] = ByteStream(data=bytes(blob["data"]), mime_type=blob["mime_type"])
|
||||||
# Store metadata for a moment while we try un-flattening allegedly flattened metadata.
|
# Store metadata for a moment while we try un-flattening allegedly flattened metadata.
|
||||||
|
7
releasenotes/notes/speedup-import-b542f7a8323ef376.yaml
Normal file
7
releasenotes/notes/speedup-import-b542f7a8323ef376.yaml
Normal file
@ -0,0 +1,7 @@
|
|||||||
|
---
|
||||||
|
prelude: >
|
||||||
|
enhancements:
|
||||||
|
- |
|
||||||
|
Speed up import of Document dataclass.
|
||||||
|
Importing Document was slow because we were importing the whole `pandas` and `numpy` packages.
|
||||||
|
This has now been changed to import only the necessary classes and functions.
|
Loading…
x
Reference in New Issue
Block a user