Mirror of https://github.com/deepset-ai/haystack.git (synced 2025-10-12 16:38:40 +00:00)
refactor: improve support for dataclasses (#3142)
* refactor: improve support for dataclasses
* refactor: refactor class init
* refactor: remove unused import
* refactor: testing 3.7 diffs
* refactor: checking where meta is Optional
* refactor: reverting some changes on 3.7
* refactor: remove unused imports
* build: manual pre-commit run
* doc: run doc pre-commit manually
* refactor: post-initialization hack for 3.7-3.10 compatibility. TODO: investigate another method to improve 3.7 compatibility.
* doc: force pre-commit
* refactor: refactored for both Python 3.7 and 3.9
* docs: manually run pre-commit hooks
* docs: run api docs manually
* docs: fix wrong comment
* refactor: change non-type-checked test code
* docs: update primitives
* docs: api documentation
* docs: api documentation
* refactor: minor test refactoring
* refactor: remove unused enumeration in test
* refactor: remove unneeded dir in gitignore
* refactor: exclude all private fields and change meta def
* refactor: add pydantic comment
* refactor: fix for mypy on Python 3.7
* refactor: revert custom init
* docs: update docs to new pydoc-markdown style
* Update test/nodes/test_generator.py

Co-authored-by: Sara Zan <sarazanzo94@gmail.com>
This commit is contained in:
parent 1a6cbca9b6
commit 621e1af74c
@@ -366,7 +366,7 @@ Defaults to "UTF-8" in order to support special characters (e.g. German Umlauts,

 ```python
 def convert(file_path: Path,
-            meta: Optional[Dict[str, str]] = None,
+            meta: Optional[Dict[str, Any]] = None,
             remove_numeric_tables: Optional[bool] = None,
             valid_languages: Optional[List[str]] = None,
             encoding: Optional[str] = None,
@@ -440,7 +440,7 @@ In this case the id will be generated by using the content and the defined metad

 ```python
 def convert(file_path: Path,
-            meta: Optional[Dict[str, str]] = None,
+            meta: Optional[Dict[str, Any]] = None,
             remove_numeric_tables: Optional[bool] = None,
             valid_languages: Optional[List[str]] = None,
             encoding: Optional[str] = None,
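The two hunks above widen the converters' `meta` parameter from `Dict[str, str]` to `Dict[str, Any]`, so metadata values no longer have to be strings. A minimal sketch of a call the new annotation accepts (the converter choice, file path, and metadata values are illustrative, not taken from this diff):

```python
from pathlib import Path

from haystack.nodes import TextConverter  # any converter with this `convert()` signature

converter = TextConverter()
docs = converter.convert(
    file_path=Path("sample.txt"),  # hypothetical input file
    # Non-string values such as ints and lists now satisfy Dict[str, Any]:
    meta={"source": "crawler", "page": 3, "tags": ["news", "energy"]},
)
```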
@@ -20,7 +20,7 @@ def __init__(content: Union[str, pd.DataFrame],
              content_type: Literal["text", "table", "image", "audio"] = "text",
              id: Optional[str] = None,
              score: Optional[float] = None,
-             meta: Dict[str, Any] = None,
+             meta: Optional[Dict[str, Any]] = None,
              embedding: Optional[np.ndarray] = None,
              id_hash_keys: Optional[List[str]] = None)
 ```
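The `meta` fix in this hunk is a typing correction rather than a behavior change: a parameter whose default is `None` must be annotated `Optional[...]` to pass strict type checking. In isolation:

```python
from typing import Any, Dict, Optional

# Before: `meta: Dict[str, Any] = None` - mypy flags the None default.
# After: Optional[...] declares the None default explicitly.
def with_meta(meta: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
    return dict(meta or {})  # normalize None to an empty dict
```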
@@ -29,13 +29,10 @@ One of the core data classes in Haystack. It's used to represent documents / pas

 Documents are stored in DocumentStores, are returned by Retrievers, are the input for Readers and are used in
 many other places that manipulate or interact with document-level data.
-
 Note: There can be multiple Documents originating from one file (e.g. PDF), if you split the text
 into smaller passages. We'll have one Document per passage in this case.
-
 Each document has a unique ID. This can be supplied by the user or generated automatically.
 It's particularly helpful for handling of duplicates and referencing documents in other objects (e.g. Labels)
-
 There's an easy option to convert from/to dicts via `from_dict()` and `to_dict`.

 **Arguments**:
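The docstring mentions `from_dict()`/`to_dict()`; a short round-trip sketch (content and meta values are made up for illustration):

```python
from haystack.schema import Document

doc = Document(content="Haystack is an open source framework.", meta={"source": "readme"})
as_dict = doc.to_dict()            # plain dict, e.g. for JSON serialization
restored = Document.from_dict(as_dict)
assert restored.id == doc.id       # the generated ID survives the round trip
```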
@@ -59,7 +59,7 @@ See the up-to-date list of available models on
 **Example**

 ```python
-| docs = [Document(text="PG&E stated it scheduled the blackouts in response to forecasts for high winds amid dry conditions."
+| docs = [Document(content="PG&E stated it scheduled the blackouts in response to forecasts for high winds amid dry conditions."
 | "The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were scheduled to be affected by"
 | "the shutoffs which were expected to last through at least midday tomorrow.")]

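The fixed example reflects the current `Document` constructor: the old `text=` keyword is gone and `content=` is the supported argument. Written directly:

```python
from haystack.schema import Document

# `text=` no longer exists on Document; `content=` is the keyword to use.
doc = Document(content="PG&E stated it scheduled the blackouts in response to forecasts for high winds amid dry conditions.")
```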
@@ -618,13 +618,15 @@
         "Document": {
             "title": "Document",
             "required": [
-                "content",
-                "content_type",
                 "id",
-                "meta"
+                "content"
             ],
             "type": "object",
             "properties": {
+                "id": {
+                    "title": "Id",
+                    "type": "string"
+                },
                 "content": {
                     "title": "Content",
                     "anyOf": [
@@ -644,15 +646,13 @@
                 "image",
                 "audio"
             ],
-            "type": "string"
-        },
-        "id": {
-            "title": "Id",
-            "type": "string"
+            "type": "string",
+            "default": "text"
         },
         "meta": {
             "title": "Meta",
-            "type": "object"
+            "type": "object",
+            "default": {}
         },
         "score": {
             "title": "Score",
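Both OpenAPI hunks above follow mechanically from the new dataclass defaults: in pydantic v1, a field with a default value drops out of `required` and gains a `default` entry in the generated schema. A minimal sketch with an illustrative model (not the real Haystack class):

```python
from typing import Any, Dict

from pydantic import BaseModel, Field

class Doc(BaseModel):
    id: str                                   # no default -> listed in "required"
    content: str                              # no default -> listed in "required"
    content_type: str = "text"                # default -> optional, '"default": "text"'
    meta: Dict[str, Any] = Field(default={})  # default -> optional, '"default": {}'

schema = Doc.schema()
print(schema["required"])                    # ['id', 'content']
print(schema["properties"]["content_type"])  # includes 'default': 'text'
```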
@@ -1,4 +1,4 @@
-from typing import List, Optional, Dict
+from typing import List, Optional, Dict, Any

 import os
 import logging
@@ -74,7 +74,7 @@ class PDFToTextConverter(BaseConverter):
     def convert(
         self,
         file_path: Path,
-        meta: Optional[Dict[str, str]] = None,
+        meta: Optional[Dict[str, Any]] = None,
         remove_numeric_tables: Optional[bool] = None,
         valid_languages: Optional[List[str]] = None,
         encoding: Optional[str] = None,
@@ -212,7 +212,7 @@ class PDFToTextOCRConverter(BaseConverter):
     def convert(
         self,
         file_path: Path,
-        meta: Optional[Dict[str, str]] = None,
+        meta: Optional[Dict[str, Any]] = None,
         remove_numeric_tables: Optional[bool] = None,
         valid_languages: Optional[List[str]] = None,
         encoding: Optional[str] = None,
@@ -28,7 +28,7 @@ class TransformersSummarizer(BaseSummarizer):
     **Example**

     ```python
-    | docs = [Document(text="PG&E stated it scheduled the blackouts in response to forecasts for high winds amid dry conditions."
+    | docs = [Document(content="PG&E stated it scheduled the blackouts in response to forecasts for high winds amid dry conditions."
     | "The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were scheduled to be affected by"
     | "the shutoffs which were expected to last through at least midday tomorrow.")]

@@ -2,11 +2,10 @@ from __future__ import annotations
 import csv
 import hashlib

-import typing
 from typing import Any, Optional, Dict, List, Union

 try:
-    from typing import Literal
+    from typing import Literal  # type: ignore
 except ImportError:
     from typing_extensions import Literal  # type: ignore
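The added `# type: ignore` silences mypy on the redefinition that this 3.7-compatibility pattern produces: `typing.Literal` only exists on Python 3.8+, so older interpreters fall back to the `typing_extensions` backport. The pattern in isolation, with an illustrative alias:

```python
try:
    from typing import Literal  # type: ignore  # Python 3.8+
except ImportError:
    from typing_extensions import Literal  # type: ignore  # Python 3.7 backport

# Hypothetical alias showing how the imported Literal is then used:
ContentType = Literal["text", "table", "image", "audio"]
```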
@@ -16,21 +15,18 @@ import logging
 import time
 import json
 import ast
-from dataclasses import asdict
+from dataclasses import asdict, InitVar

 import mmh3
 import numpy as np
 import pandas as pd

-from pydantic import BaseConfig
+from pydantic import BaseConfig, Field
 from pydantic.json import pydantic_encoder

-if not typing.TYPE_CHECKING:
-    # We are using Pydantic dataclasses instead of vanilla Python's
-    # See #1598 for the reasons behind this choice & performance considerations
-    from pydantic.dataclasses import dataclass
-else:
-    from dataclasses import dataclass  # type: ignore # pylint: disable=ungrouped-imports
+# We are using Pydantic dataclasses instead of vanilla Python's
+# See #1598 for the reasons behind this choice & performance considerations
+from pydantic.dataclasses import dataclass


 logger = logging.getLogger(__name__)
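With the `typing.TYPE_CHECKING` switch removed, the pydantic dataclass is used on every code path, and the new `InitVar` import marks constructor-only arguments that never become fields. A generic sketch of that mechanism (names are illustrative, and Haystack pairs it with a custom `__init__` rather than `__post_init__`):

```python
from dataclasses import InitVar, asdict

from pydantic.dataclasses import dataclass

@dataclass
class Record:
    name: str
    salt: InitVar[str] = ""  # accepted by __init__, never stored as a field

    def __post_init__(self, salt: str):
        # The init-only value is visible here, e.g. to derive other state.
        print(f"got init-only salt: {salt!r}")

r = Record(name="a", salt="xyz")
print(asdict(r))  # {'name': 'a'} - salt is absent from serialization
```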
@@ -41,12 +37,13 @@ BaseConfig.arbitrary_types_allowed = True

 @dataclass
 class Document:
-    content: Union[str, pd.DataFrame]
-    content_type: Literal["text", "table", "image", "audio"]
     id: str
-    meta: Dict[str, Any]
+    content: Union[str, pd.DataFrame]
+    content_type: Literal["text", "table", "image", "audio"] = Field(default="text")
+    meta: Dict[str, Any] = Field(default={})
     score: Optional[float] = None
     embedding: Optional[np.ndarray] = None
+    id_hash_keys: InitVar[Optional[List[str]]] = None

     # We use a custom init here as we want some custom logic. The annotations above are however still needed in order
     # to use some dataclass magic like "asdict()". See https://www.python.org/dev/peps/pep-0557/#custom-init-method
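A note on `Field(default={})`: vanilla dataclasses reject mutable defaults outright, while pydantic copies the default for each new instance, so the usual shared-dict pitfall should not apply. A small sketch assuming pydantic v1 semantics:

```python
from typing import Any, Dict

from pydantic import Field
from pydantic.dataclasses import dataclass

@dataclass
class WithMeta:
    meta: Dict[str, Any] = Field(default={})

a, b = WithMeta(), WithMeta()
a.meta["k"] = 1
print(b.meta)  # {} - instances do not share the default dict
```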
@@ -58,7 +55,7 @@ class Document:
         content_type: Literal["text", "table", "image", "audio"] = "text",
         id: Optional[str] = None,
         score: Optional[float] = None,
-        meta: Dict[str, Any] = None,
+        meta: Optional[Dict[str, Any]] = None,
         embedding: Optional[np.ndarray] = None,
         id_hash_keys: Optional[List[str]] = None,
     ):
@@ -66,15 +63,11 @@ class Document:
         One of the core data classes in Haystack. It's used to represent documents / passages in a standardized way within Haystack.
         Documents are stored in DocumentStores, are returned by Retrievers, are the input for Readers and are used in
         many other places that manipulate or interact with document-level data.
-
         Note: There can be multiple Documents originating from one file (e.g. PDF), if you split the text
         into smaller passages. We'll have one Document per passage in this case.
-
         Each document has a unique ID. This can be supplied by the user or generated automatically.
         It's particularly helpful for handling of duplicates and referencing documents in other objects (e.g. Labels)
-
         There's an easy option to convert from/to dicts via `from_dict()` and `to_dict`.
-
         :param content: Content of the document. For most cases, this will be text, but it can be a table or image.
         :param content_type: One of "text", "table" or "image". Haystack components can use this to adjust their
                              handling of Documents and check compatibility.
@@ -154,6 +147,9 @@ class Document:
         inv_field_map = {v: k for k, v in field_map.items()}
         _doc: Dict[str, str] = {}
         for k, v in self.__dict__.items():
+            # Exclude internal fields (Pydantic, ...) fields from the conversion process
+            if k.startswith("__"):
+                continue
             if k == "content":
                 # Convert pd.DataFrame to list of rows for serialization
                 if self.content_type == "table" and isinstance(self.content, pd.DataFrame):
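The new guard exists because pydantic dataclasses can place internal bookkeeping entries in the instance `__dict__` (for example an `__initialised__` flag in pydantic v1; that name is an assumption about pydantic internals, not something this diff states). Filtering dunder keys keeps such entries out of serialized documents:

```python
# Illustrative only: a raw __dict__ with a pydantic-internal entry.
raw = {"id": "123", "content": "hello", "__initialised__": True}
clean = {k: v for k, v in raw.items() if not k.startswith("__")}
print(clean)  # {'id': '123', 'content': 'hello'}
```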
@@ -184,6 +180,9 @@ class Document:
             _doc["meta"] = {}
         # copy additional fields into "meta"
         for k, v in _doc.items():
+            # Exclude internal fields (Pydantic, ...) fields from the conversion process
+            if k.startswith("__"):
+                continue
             if k not in init_args and k not in field_map:
                 _doc["meta"][k] = v
         # remove additional fields from top level
@@ -615,6 +614,8 @@ class MultiLabel:
     contexts: List[str]
     offsets_in_contexts: List[Dict]
     offsets_in_documents: List[Dict]
+    drop_negative_labels: InitVar[bool] = False
+    drop_no_answer: InitVar[bool] = False

     def __init__(self, labels: List[Label], drop_negative_labels=False, drop_no_answers=False, **kwargs):
         """
@@ -676,6 +677,7 @@ class MultiLabel:
         # as separate no_answer labels, and thus with document.id but without answer.document_id.
         # If we do not exclude them from document_ids this would be problematic for retriever evaluation as they do not contain the answer.
         # Hence, we exclude them here as well.
+
         self.document_ids = [l.document.id for l in self.labels if not l.no_answer]
         self.contexts = [l.document.content for l in self.labels if not l.no_answer]

@@ -50,7 +50,7 @@ dependencies = [
   "importlib-metadata; python_version < '3.8'",
   "torch>1.9,<1.13",
   "requests",
-  "pydantic==1.9.2",
+  "pydantic",
   "transformers==4.21.2",
   "nltk",
   "pandas",
@@ -2,7 +2,6 @@ import os
 import sys
 from typing import List

-import numpy as np
 import pytest

 from haystack.schema import Document
@@ -64,8 +63,8 @@ def test_generator_pipeline(document_store, retriever, rag_generator, docs_with_
 def test_lfqa_pipeline(document_store, retriever, lfqa_generator, docs_with_true_emb):
     # reuse existing DOCS but regenerate embeddings with retribert
     docs: List[Document] = []
-    for idx, d in enumerate(docs_with_true_emb):
-        docs.append(Document(d.content, str(idx)))
+    for d in docs_with_true_emb:
+        docs.append(Document(content=d.content))
     document_store.write_documents(docs)
     document_store.update_embeddings(retriever)
     query = "Tell me about Berlin?"
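This test (and the two below) now leans on automatic ID generation instead of passing a positional index: with the default `id_hash_keys`, the ID is a deterministic hash of the content, so identical content yields the same ID:

```python
from haystack.schema import Document

d1 = Document(content="same text")
d2 = Document(content="same text")
assert d1.id == d2.id  # deterministic content hash, no manual ids needed
```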
@@ -84,8 +83,8 @@ def test_lfqa_pipeline(document_store, retriever, lfqa_generator, docs_with_true
 def test_lfqa_pipeline_unknown_converter(document_store, retriever, docs_with_true_emb):
     # reuse existing DOCS but regenerate embeddings with retribert
     docs: List[Document] = []
-    for idx, d in enumerate(docs_with_true_emb):
-        docs.append(Document(d.content, str(idx)))
+    for d in docs_with_true_emb:
+        docs.append(Document(content=d.content))
     document_store.write_documents(docs)
     document_store.update_embeddings(retriever)
     seq2seq = Seq2SeqGenerator(model_name_or_path="patrickvonplaten/t5-tiny-random")
@ -106,8 +105,8 @@ def test_lfqa_pipeline_unknown_converter(document_store, retriever, docs_with_tr
|
|||||||
def test_lfqa_pipeline_invalid_converter(document_store, retriever, docs_with_true_emb):
|
def test_lfqa_pipeline_invalid_converter(document_store, retriever, docs_with_true_emb):
|
||||||
# reuse existing DOCS but regenerate embeddings with retribert
|
# reuse existing DOCS but regenerate embeddings with retribert
|
||||||
docs: List[Document] = []
|
docs: List[Document] = []
|
||||||
for idx, d in enumerate(docs_with_true_emb):
|
for d in docs_with_true_emb:
|
||||||
docs.append(Document(d.content, str(idx)))
|
docs.append(Document(content=d.content))
|
||||||
document_store.write_documents(docs)
|
document_store.write_documents(docs)
|
||||||
document_store.update_embeddings(retriever)
|
document_store.update_embeddings(retriever)
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user