mirror of
https://github.com/allenai/olmocr.git
synced 2025-09-14 02:59:26 +00:00
34 lines
877 B
Python
34 lines
877 B
Python
import datetime
|
|
import hashlib
|
|
import json
|
|
from dataclasses import dataclass
|
|
|
|
|
|
@dataclass(frozen=True)
class PdfOutput:
    """Immutable record of the text produced for a single PDF.

    Instances are frozen, so fields cannot be reassigned after construction.
    Use :meth:`mk_dolma_doc` to serialize the record as a JSON document.
    """

    # Path of the source file; emitted as the "Source-File" metadata entry.
    path: str
    # Text content; emitted as the document "text" and hashed to form its "id".
    text: str
    # Emitted as the "pdf-total-pages" metadata entry.
    total_pdf_pages: int
    # Emitted as the "pdf-pages" metadata entry.
    processed_pdf_pages: int

    def mk_dolma_doc(self, **kwargs) -> str:
        """Serialize this record as a JSON document string.

        The document contains the keys ``id`` (SHA-1 hex digest of the
        UTF-8 encoded text), ``text``, ``source`` (always ``"s2pdf"``),
        ``added`` and ``created`` (the current local date formatted as
        ``YYYY-MM-DD``), and ``metadata``.

        Args:
            **kwargs: Extra entries merged into ``metadata``. They are
                unpacked after the built-in entries, so a keyword that
                matches a built-in key (e.g. ``"pdf-pages"`` passed via
                ``**{...}``) replaces it. Values must be JSON-serializable.

        Returns:
            The document encoded with ``json.dumps``.

        Raises:
            TypeError: If a metadata value cannot be serialized by ``json``.
        """
        metadata = {
            "Source-File": self.path,
            "pdf-pages": self.processed_pdf_pages,
            "pdf-total-pages": self.total_pdf_pages,
            # Kwargs are added as extra metadata
            **kwargs,
        }
        id_ = hashlib.sha1(self.text.encode()).hexdigest()

        # Read the clock exactly once so "added" and "created" can never
        # disagree when the call happens to straddle midnight.
        today = datetime.datetime.now().strftime("%Y-%m-%d")

        dolma_doc = {
            "id": id_,
            "text": self.text,
            "source": "s2pdf",
            "added": today,
            "created": today,
            "metadata": metadata,
        }

        return json.dumps(dolma_doc)
|