mirror of
https://github.com/allenai/olmocr.git
synced 2025-11-07 05:39:49 +00:00
Update all docs at once
This commit is contained in:
parent
062abff25c
commit
a3e7654190
1
.gitignore
vendored
1
.gitignore
vendored
@ -6,6 +6,7 @@ dolma_previews/*
|
|||||||
s2_previews/*
|
s2_previews/*
|
||||||
gnarly_previews/*
|
gnarly_previews/*
|
||||||
s2orc_previews/*
|
s2orc_previews/*
|
||||||
|
s2orc_previews_3200/*
|
||||||
/*.html
|
/*.html
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -206,12 +206,18 @@ class DatabaseManager:
|
|||||||
except sqlite3.IntegrityError:
|
except sqlite3.IntegrityError:
|
||||||
print(f"PDF with s3_path '{s3_path}' already exists.")
|
print(f"PDF with s3_path '{s3_path}' already exists.")
|
||||||
|
|
||||||
def update_pdf_status(self, s3_path: str, new_status: str) -> None:
|
def update_pdf_statuses(self, status_updates: dict[str, str]) -> None:
|
||||||
self.cursor.execute("""
|
"""
|
||||||
|
Update the status of multiple PDFs in the database.
|
||||||
|
|
||||||
|
:param status_updates: A dictionary where each key is an s3_path (str) and
|
||||||
|
each value is the new status (str) for that PDF.
|
||||||
|
"""
|
||||||
|
self.cursor.executemany("""
|
||||||
UPDATE pdfs
|
UPDATE pdfs
|
||||||
SET status = ?
|
SET status = ?
|
||||||
WHERE s3_path = ?
|
WHERE s3_path = ?
|
||||||
""", (new_status, s3_path))
|
""", [(new_status, s3_path) for s3_path, new_status in status_updates.items()])
|
||||||
self.conn.commit()
|
self.conn.commit()
|
||||||
|
|
||||||
def get_pdf(self, s3_path: str) -> Optional[PDFRecord]:
|
def get_pdf(self, s3_path: str) -> Optional[PDFRecord]:
|
||||||
@ -569,9 +575,7 @@ def build_dolma_doc(s3_workspace: str, pdf: DatabaseManager.PDFRecord) -> Option
|
|||||||
|
|
||||||
def mark_pdfs_done(s3_workspace: str, dolma_docs: list[dict]):
|
def mark_pdfs_done(s3_workspace: str, dolma_docs: list[dict]):
|
||||||
db = DatabaseManager(s3_workspace, skip_init=True)
|
db = DatabaseManager(s3_workspace, skip_init=True)
|
||||||
|
db.update_pdf_statuses({doc["metadata"]["Source-File"]: "completed" for doc in dolma_docs})
|
||||||
for doc in dolma_docs:
|
|
||||||
db.update_pdf_status(doc["metadata"]["Source-File"], "completed")
|
|
||||||
|
|
||||||
def get_current_round(s3_workspace: str) -> int:
|
def get_current_round(s3_workspace: str) -> int:
|
||||||
path = s3_workspace[5:]
|
path = s3_workspace[5:]
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user