# olmocr/pdelfin/datatypes.py
#
# Residue from the repository web viewer, kept only as a comment:
# 34 lines, 877 B, Python; blame timestamp 2024-09-17 15:16:58 +00:00.
import datetime
# blame timestamp: 2024-09-18 22:52:42 +00:00
import hashlib
import json
# blame timestamp: 2024-09-17 15:16:58 +00:00
from dataclasses import dataclass
@dataclass(frozen=True)
class PdfOutput:
    """Text extracted from one PDF, together with its page counts.

    The dataclass is frozen, so fields cannot be reassigned after
    construction.
    """

    path: str  # emitted in the metadata as "Source-File"
    text: str  # document text; its SHA-1 hex digest becomes the document id
    total_pdf_pages: int  # emitted in the metadata as "pdf-total-pages"
    processed_pdf_pages: int  # emitted in the metadata as "pdf-pages"

    def mk_dolma_doc(self, **kwargs) -> str:
        """Serialize this output as a JSON document string.

        Args:
            **kwargs: Extra metadata entries. They are merged after the
                built-in keys, so a colliding key ("Source-File",
                "pdf-pages", "pdf-total-pages") replaces the built-in
                value. Values must be JSON-serializable.

        Returns:
            A JSON object string with the keys "id", "text", "source",
            "added", "created" and "metadata".
        """
        metadata = {
            "Source-File": self.path,
            "pdf-pages": self.processed_pdf_pages,
            "pdf-total-pages": self.total_pdf_pages,
            # Kwargs are added as extra metadata
            **kwargs,
        }

        # The id is derived from the text alone: identical text always
        # yields the same id, regardless of path or metadata.
        id_ = hashlib.sha1(self.text.encode()).hexdigest()

        # Read the clock once so "added" and "created" can never disagree
        # when the call happens to straddle midnight.
        today = datetime.datetime.now().strftime("%Y-%m-%d")

        dolma_doc = {
            "id": id_,
            "text": self.text,
            "source": "s2pdf",
            "added": today,
            "created": today,
            "metadata": metadata,
        }
        return json.dumps(dolma_doc)