refactor: improve support for dataclasses (#3142)

* refactor: improve support for dataclasses

* refactor: refactor class init

* refactor: remove unused import

* refactor: testing 3.7 diffs

* refactor: checking where meta is Optional

* refactor: reverting some changes on 3.7

* refactor: remove unused imports

* build: manual pre-commit run

* doc: run doc pre-commit manually

* refactor: post initialization hack for 3.7-3.10 compat.

TODO: investigate another method to improve 3.7 compatibility.
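
For context, the "post initialization hack" is the dataclass `InitVar`/`__post_init__` pattern: init-only arguments are accepted by the generated `__init__` and forwarded to `__post_init__`, but never stored as fields. A minimal sketch of the pattern, with illustrative names rather than the exact Haystack code:

```python
# Minimal sketch of the InitVar/__post_init__ pattern (illustrative names,
# not the actual Haystack code).
from dataclasses import dataclass, InitVar
from typing import List, Optional


@dataclass
class Record:
    content: str
    id: str = ""
    # Accepted by __init__ and passed to __post_init__, but not stored
    # as a field (and therefore absent from asdict() output).
    id_hash_keys: InitVar[Optional[List[str]]] = None

    def __post_init__(self, id_hash_keys: Optional[List[str]]):
        # Derive an id from the selected attributes when none was supplied.
        if not self.id:
            keys = id_hash_keys or ["content"]
            self.id = str(hash(tuple(getattr(self, k) for k in keys)))
```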

* doc: force pre-commit

* refactor: refactored for both Python 3.7 and 3.9

* docs: manually run pre-commit hooks

* docs: run api docs manually

* docs: fix wrong comment

* refactor: change no type-checked test code

* docs: update primitives

* docs: api documentation

* docs: api documentation

* refactor: minor test refactoring

* refactor: remove unused enumeration in test

* refactor: remove unneeded dir in gitignore

* refactor: exclude all private fields and change meta def

* refactor: add pydantic comment

* refactor: fix for mypy on Python 3.7

* refactor: revert custom init

* docs: update docs to new pydoc-markdown style

* Update test/nodes/test_generator.py

Co-authored-by: Sara Zan <sarazanzo94@gmail.com>
Daniel Bichuetti 2022-09-09 06:31:37 -03:00 committed by GitHub
parent 1a6cbca9b6
commit 621e1af74c
10 changed files with 53 additions and 55 deletions

View File

@@ -366,7 +366,7 @@ Defaults to "UTF-8" in order to support special characters (e.g. German Umlauts,

 ```python
 def convert(file_path: Path,
-            meta: Optional[Dict[str, str]] = None,
+            meta: Optional[Dict[str, Any]] = None,
             remove_numeric_tables: Optional[bool] = None,
             valid_languages: Optional[List[str]] = None,
             encoding: Optional[str] = None,
@@ -440,7 +440,7 @@ In this case the id will be generated by using the content and the defined metadata.

 ```python
 def convert(file_path: Path,
-            meta: Optional[Dict[str, str]] = None,
+            meta: Optional[Dict[str, Any]] = None,
             remove_numeric_tables: Optional[bool] = None,
             valid_languages: Optional[List[str]] = None,
             encoding: Optional[str] = None,
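
The substance of this change is the widening of `meta` from `Dict[str, str]` to `Dict[str, Any]`, so metadata values other than strings now type-check. A small illustration (the annotation comes from the diff; the keys and values are invented):

```python
from typing import Any, Dict, Optional

# Under Dict[str, str] every metadata value had to be a string;
# Dict[str, Any] admits richer values. The example data is invented.
meta: Optional[Dict[str, Any]] = {
    "name": "report.pdf",       # str: allowed before and after
    "page_count": 12,           # int: only valid under Dict[str, Any]
    "tags": ["finance", "q3"],  # list: likewise
}
```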

View File

@@ -20,7 +20,7 @@ def __init__(content: Union[str, pd.DataFrame],
              content_type: Literal["text", "table", "image", "audio"] = "text",
              id: Optional[str] = None,
              score: Optional[float] = None,
-             meta: Dict[str, Any] = None,
+             meta: Optional[Dict[str, Any]] = None,
              embedding: Optional[np.ndarray] = None,
              id_hash_keys: Optional[List[str]] = None)
 ```
@@ -29,13 +29,10 @@ One of the core data classes in Haystack. It's used to represent documents / passages in a standardized way within Haystack.
 Documents are stored in DocumentStores, are returned by Retrievers, are the input for Readers and are used in
 many other places that manipulate or interact with document-level data.
-
 Note: There can be multiple Documents originating from one file (e.g. PDF), if you split the text
 into smaller passages. We'll have one Document per passage in this case.
-
 Each document has a unique ID. This can be supplied by the user or generated automatically.
 It's particularly helpful for handling of duplicates and referencing documents in other objects (e.g. Labels)
-
 There's an easy option to convert from/to dicts via `from_dict()` and `to_dict`.

 **Arguments**:

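
The `meta` change in this signature is a typing-correctness fix rather than a behavior change: a parameter that defaults to `None` needs an explicit `Optional[...]`, or a type checker running with `no_implicit_optional` will flag it. A minimal sketch:

```python
from typing import Any, Dict, Optional

def old_style(meta: Dict[str, Any] = None):  # flagged under no_implicit_optional
    ...

def new_style(meta: Optional[Dict[str, Any]] = None):  # explicit, accepted
    ...
```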

View File

@@ -59,7 +59,7 @@ See the up-to-date list of available models on
 **Example**

 ```python
-| docs = [Document(text="PG&E stated it scheduled the blackouts in response to forecasts for high winds amid dry conditions."
+| docs = [Document(content="PG&E stated it scheduled the blackouts in response to forecasts for high winds amid dry conditions."
 | "The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were scheduled to be affected by"
 | "the shutoffs which were expected to last through at least midday tomorrow.")]
 |

View File

@@ -618,13 +618,15 @@
             "Document": {
                 "title": "Document",
                 "required": [
-                    "content",
-                    "content_type",
                     "id",
-                    "meta"
+                    "content"
                 ],
                 "type": "object",
                 "properties": {
+                    "id": {
+                        "title": "Id",
+                        "type": "string"
+                    },
                     "content": {
                         "title": "Content",
                         "anyOf": [
@@ -644,15 +646,13 @@
                             "image",
                             "audio"
                         ],
-                        "type": "string"
-                    },
-                    "id": {
-                        "title": "Id",
-                        "type": "string"
+                        "type": "string",
+                        "default": "text"
                     },
                     "meta": {
                         "title": "Meta",
-                        "type": "object"
+                        "type": "object",
+                        "default": {}
                     },
                     "score": {
                         "title": "Score",

View File

@@ -618,13 +618,15 @@
             "Document": {
                 "title": "Document",
                 "required": [
-                    "content",
-                    "content_type",
                     "id",
-                    "meta"
+                    "content"
                 ],
                 "type": "object",
                 "properties": {
+                    "id": {
+                        "title": "Id",
+                        "type": "string"
+                    },
                     "content": {
                         "title": "Content",
                         "anyOf": [
@@ -644,15 +646,13 @@
                             "image",
                             "audio"
                         ],
-                        "type": "string"
-                    },
-                    "id": {
-                        "title": "Id",
-                        "type": "string"
+                        "type": "string",
+                        "default": "text"
                     },
                     "meta": {
                         "title": "Meta",
-                        "type": "object"
+                        "type": "object",
+                        "default": {}
                     },
                     "score": {
                         "title": "Score",

View File

@@ -1,4 +1,4 @@
-from typing import List, Optional, Dict
+from typing import List, Optional, Dict, Any

 import os
 import logging
@@ -74,7 +74,7 @@ class PDFToTextConverter(BaseConverter):
     def convert(
         self,
         file_path: Path,
-        meta: Optional[Dict[str, str]] = None,
+        meta: Optional[Dict[str, Any]] = None,
         remove_numeric_tables: Optional[bool] = None,
         valid_languages: Optional[List[str]] = None,
         encoding: Optional[str] = None,
@@ -212,7 +212,7 @@ class PDFToTextOCRConverter(BaseConverter):
     def convert(
         self,
         file_path: Path,
-        meta: Optional[Dict[str, str]] = None,
+        meta: Optional[Dict[str, Any]] = None,
         remove_numeric_tables: Optional[bool] = None,
         valid_languages: Optional[List[str]] = None,
         encoding: Optional[str] = None,
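
In practice the widened annotation lets callers attach structured metadata at conversion time. A hypothetical usage sketch (the constructor arguments and file name are assumptions; only the `meta` typing comes from the diff):

```python
from pathlib import Path

from haystack.nodes import PDFToTextConverter  # Haystack 1.x import path

converter = PDFToTextConverter(remove_numeric_tables=True)
docs = converter.convert(
    file_path=Path("annual_report.pdf"),  # hypothetical file
    # Non-string values now satisfy the meta annotation:
    meta={"source": "filings", "year": 2022, "reviewed": False},
)
```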

View File

@@ -28,7 +28,7 @@ class TransformersSummarizer(BaseSummarizer):
 **Example**

 ```python
-| docs = [Document(text="PG&E stated it scheduled the blackouts in response to forecasts for high winds amid dry conditions."
+| docs = [Document(content="PG&E stated it scheduled the blackouts in response to forecasts for high winds amid dry conditions."
 | "The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were scheduled to be affected by"
 | "the shutoffs which were expected to last through at least midday tomorrow.")]
 |

View File

@@ -2,11 +2,10 @@ from __future__ import annotations

 import csv
 import hashlib
-import typing
 from typing import Any, Optional, Dict, List, Union

 try:
-    from typing import Literal
+    from typing import Literal  # type: ignore
 except ImportError:
     from typing_extensions import Literal  # type: ignore

@@ -16,21 +15,18 @@ import logging
 import time
 import json
 import ast
-from dataclasses import asdict
+from dataclasses import asdict, InitVar

 import mmh3
 import numpy as np
 import pandas as pd
-from pydantic import BaseConfig
+from pydantic import BaseConfig, Field
 from pydantic.json import pydantic_encoder

-if not typing.TYPE_CHECKING:
-    # We are using Pydantic dataclasses instead of vanilla Python's
-    # See #1598 for the reasons behind this choice & performance considerations
-    from pydantic.dataclasses import dataclass
-else:
-    from dataclasses import dataclass  # type: ignore  # pylint: disable=ungrouped-imports
+# We are using Pydantic dataclasses instead of vanilla Python's
+# See #1598 for the reasons behind this choice & performance considerations
+from pydantic.dataclasses import dataclass

 logger = logging.getLogger(__name__)

@@ -41,12 +37,13 @@ BaseConfig.arbitrary_types_allowed = True

 @dataclass
 class Document:
-    content: Union[str, pd.DataFrame]
-    content_type: Literal["text", "table", "image", "audio"]
     id: str
-    meta: Dict[str, Any]
+    content: Union[str, pd.DataFrame]
+    content_type: Literal["text", "table", "image", "audio"] = Field(default="text")
+    meta: Dict[str, Any] = Field(default={})
     score: Optional[float] = None
     embedding: Optional[np.ndarray] = None
+    id_hash_keys: InitVar[Optional[List[str]]] = None

     # We use a custom init here as we want some custom logic. The annotations above are however still needed in order
     # to use some dataclass magic like "asdict()". See https://www.python.org/dev/peps/pep-0557/#custom-init-method
@@ -58,7 +55,7 @@ class Document:
         content_type: Literal["text", "table", "image", "audio"] = "text",
         id: Optional[str] = None,
         score: Optional[float] = None,
-        meta: Dict[str, Any] = None,
+        meta: Optional[Dict[str, Any]] = None,
         embedding: Optional[np.ndarray] = None,
         id_hash_keys: Optional[List[str]] = None,
     ):
@@ -66,15 +63,11 @@ class Document:
         One of the core data classes in Haystack. It's used to represent documents / passages in a standardized way within Haystack.
         Documents are stored in DocumentStores, are returned by Retrievers, are the input for Readers and are used in
         many other places that manipulate or interact with document-level data.
-
         Note: There can be multiple Documents originating from one file (e.g. PDF), if you split the text
         into smaller passages. We'll have one Document per passage in this case.
-
         Each document has a unique ID. This can be supplied by the user or generated automatically.
         It's particularly helpful for handling of duplicates and referencing documents in other objects (e.g. Labels)
-
         There's an easy option to convert from/to dicts via `from_dict()` and `to_dict`.
-
         :param content: Content of the document. For most cases, this will be text, but it can be a table or image.
         :param content_type: One of "text", "table" or "image". Haystack components can use this to adjust their
                              handling of Documents and check compatibility.
@@ -154,6 +147,9 @@ class Document:
         inv_field_map = {v: k for k, v in field_map.items()}
         _doc: Dict[str, str] = {}
         for k, v in self.__dict__.items():
+            # Exclude internal fields (Pydantic, ...) fields from the conversion process
+            if k.startswith("__"):
+                continue
             if k == "content":
                 # Convert pd.DataFrame to list of rows for serialization
                 if self.content_type == "table" and isinstance(self.content, pd.DataFrame):
@@ -184,6 +180,9 @@ class Document:
             _doc["meta"] = {}
         # copy additional fields into "meta"
         for k, v in _doc.items():
+            # Exclude internal fields (Pydantic, ...) fields from the conversion process
+            if k.startswith("__"):
+                continue
             if k not in init_args and k not in field_map:
                 _doc["meta"][k] = v
         # remove additional fields from top level
@@ -615,6 +614,8 @@ class MultiLabel:
     contexts: List[str]
     offsets_in_contexts: List[Dict]
    offsets_in_documents: List[Dict]
+    drop_negative_labels: InitVar[bool] = False
+    drop_no_answer: InitVar[bool] = False

     def __init__(self, labels: List[Label], drop_negative_labels=False, drop_no_answers=False, **kwargs):
         """
@@ -676,6 +677,7 @@ class MultiLabel:
         # as separate no_answer labels, and thus with document.id but without answer.document_id.
         # If we do not exclude them from document_ids this would be problematic for retriever evaluation as they do not contain the answer.
         # Hence, we exclude them here as well.
+
         self.document_ids = [l.document.id for l in self.labels if not l.no_answer]
         self.contexts = [l.document.content for l in self.labels if not l.no_answer]
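
Net effect of the schema changes: the `Document` fields are reordered so the required `id` and `content` come first and defaulted fields follow, and `id_hash_keys` becomes an init-only variable instead of a stored field. A sketch of the resulting behavior, assuming the Haystack 1.x `Document` semantics shown in this diff:

```python
from haystack.schema import Document

d1 = Document(content="The quick brown fox.")
d2 = Document(content="The quick brown fox.", meta={"source": "web"})
# With the default id_hash_keys, only the content is hashed into the id,
# so documents with identical content share an id regardless of meta:
assert d1.id == d2.id

d3 = Document(content="The quick brown fox.", meta={"source": "web"},
              id_hash_keys=["content", "meta"])
# Opting meta into id_hash_keys changes the generated id:
assert d3.id != d2.id
```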

View File

@@ -50,7 +50,7 @@ dependencies = [
     "importlib-metadata; python_version < '3.8'",
     "torch>1.9,<1.13",
     "requests",
-    "pydantic==1.9.2",
+    "pydantic",
     "transformers==4.21.2",
     "nltk",
     "pandas",

View File

@@ -2,7 +2,6 @@ import os
 import sys
 from typing import List

-import numpy as np
 import pytest

 from haystack.schema import Document
@@ -64,8 +63,8 @@ def test_generator_pipeline(document_store, retriever, rag_generator, docs_with_true_emb):
 def test_lfqa_pipeline(document_store, retriever, lfqa_generator, docs_with_true_emb):
     # reuse existing DOCS but regenerate embeddings with retribert
     docs: List[Document] = []
-    for idx, d in enumerate(docs_with_true_emb):
-        docs.append(Document(d.content, str(idx)))
+    for d in docs_with_true_emb:
+        docs.append(Document(content=d.content))
     document_store.write_documents(docs)
     document_store.update_embeddings(retriever)
     query = "Tell me about Berlin?"
@@ -84,8 +83,8 @@ def test_lfqa_pipeline(document_store, retriever, lfqa_generator, docs_with_true_emb):
 def test_lfqa_pipeline_unknown_converter(document_store, retriever, docs_with_true_emb):
     # reuse existing DOCS but regenerate embeddings with retribert
     docs: List[Document] = []
-    for idx, d in enumerate(docs_with_true_emb):
-        docs.append(Document(d.content, str(idx)))
+    for d in docs_with_true_emb:
+        docs.append(Document(content=d.content))
     document_store.write_documents(docs)
     document_store.update_embeddings(retriever)
     seq2seq = Seq2SeqGenerator(model_name_or_path="patrickvonplaten/t5-tiny-random")
@@ -106,8 +105,8 @@ def test_lfqa_pipeline_unknown_converter(document_store, retriever, docs_with_true_emb):
 def test_lfqa_pipeline_invalid_converter(document_store, retriever, docs_with_true_emb):
     # reuse existing DOCS but regenerate embeddings with retribert
     docs: List[Document] = []
-    for idx, d in enumerate(docs_with_true_emb):
-        docs.append(Document(d.content, str(idx)))
+    for d in docs_with_true_emb:
+        docs.append(Document(content=d.content))
     document_store.write_documents(docs)
     document_store.update_embeddings(retriever)