feat: less precision in json floats (#1718)

Closes #1340.
### Summary
- add functionality to limit precision when serializing to JSON
### Testing
```
elements = partition(raw_doc.<extension>)
output_json = elements_to_json(elements)
print(output_json)
```
This commit is contained in:
Christine Straub 2023-10-13 11:06:36 -07:00 committed by GitHub
parent ad1b93dbaa
commit ef391e1a3e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 37 additions and 2 deletions

View File

@ -1,3 +1,13 @@
## 0.10.23-dev0
### Enhancements
* **Add functionality to limit precision when serializing to JSON** Precision for `points` is limited to 1 decimal place if `coordinates["system"] == "PixelSpace"`, and to 2 decimal places otherwise. Precision for `detection_class_prob` is limited to 5 decimal places.
### Features
### Fixes
## 0.10.22 ## 0.10.22
### Enhancements ### Enhancements

View File

@ -1 +1 @@
__version__ = "0.10.22" # pragma: no cover __version__ = "0.10.23-dev0" # pragma: no cover

View File

@ -1,8 +1,10 @@
import csv import csv
import io import io
import json import json
from copy import deepcopy
from typing import Any, Dict, List, Optional from typing import Any, Dict, List, Optional
from unstructured.documents.coordinates import PixelSpace
from unstructured.documents.elements import ( from unstructured.documents.elements import (
TYPE_TO_TEXT_ELEMENT_MAP, TYPE_TO_TEXT_ELEMENT_MAP,
CheckBox, CheckBox,
@ -77,6 +79,27 @@ def convert_to_dict(elements: List[Element]) -> List[Dict[str, Any]]:
return convert_to_isd(elements) return convert_to_isd(elements)
def _fix_metadata_field_precision(elements: List[Element]) -> List[Element]:
    """Return deep copies of `elements` with float metadata rounded for serialization.

    Each element is deep-copied and the rounding is applied to the copy, so the
    elements passed in are not modified.

    Rounding rules:
      * `metadata.coordinates.points` -- every (x, y) pair is rounded to 1 decimal
        place when `metadata.coordinates.system` is a `PixelSpace` instance, and
        to 2 decimal places otherwise. The result is stored as a tuple of tuples.
      * `metadata.detection_class_prob` -- rounded to 5 decimal places.

    Either field is skipped when its value is falsy.

    Args:
        elements: The elements whose metadata should be rounded.

    Returns:
        A new list holding the rounded copies, in the same order as `elements`.
    """
    rounded_elements = []
    for original in elements:
        # Work on a copy so the caller's elements keep their full precision.
        copied = deepcopy(original)
        metadata = copied.metadata

        coordinates = metadata.coordinates
        if coordinates:
            ndigits = 1 if isinstance(coordinates.system, PixelSpace) else 2
            coordinates.points = tuple(
                (round(x, ndigits), round(y, ndigits)) for x, y in coordinates.points
            )

        probability = metadata.detection_class_prob
        if probability:
            metadata.detection_class_prob = round(probability, 5)

        rounded_elements.append(copied)
    return rounded_elements
def elements_to_json( def elements_to_json(
elements: List[Element], elements: List[Element],
filename: Optional[str] = None, filename: Optional[str] = None,
@ -87,7 +110,9 @@ def elements_to_json(
Saves a list of elements to a JSON file if filename is specified. Saves a list of elements to a JSON file if filename is specified.
Otherwise, return the list of elements as a string. Otherwise, return the list of elements as a string.
""" """
element_dict = convert_to_dict(elements)
pre_processed_elements = _fix_metadata_field_precision(elements)
element_dict = convert_to_dict(pre_processed_elements)
if filename is not None: if filename is not None:
with open(filename, "w", encoding=encoding) as f: with open(filename, "w", encoding=encoding) as f:
json.dump(element_dict, f, indent=indent) json.dump(element_dict, f, indent=indent)