Update all docs at once

This commit is contained in:
Jake Poznanski 2024-10-28 15:06:29 +00:00
parent 062abff25c
commit a3e7654190
2 changed files with 11 additions and 6 deletions

1
.gitignore vendored
View File

@@ -6,6 +6,7 @@ dolma_previews/*
s2_previews/* s2_previews/*
gnarly_previews/* gnarly_previews/*
s2orc_previews/* s2orc_previews/*
s2orc_previews_3200/*
/*.html /*.html

View File

@@ -206,12 +206,18 @@ class DatabaseManager:
except sqlite3.IntegrityError: except sqlite3.IntegrityError:
print(f"PDF with s3_path '{s3_path}' already exists.") print(f"PDF with s3_path '{s3_path}' already exists.")
def update_pdf_status(self, s3_path: str, new_status: str) -> None: def update_pdf_statuses(self, status_updates: dict[str, str]) -> None:
self.cursor.execute(""" """
Update the status of multiple PDFs in the database.
:param status_updates: A dictionary where each key is an s3_path (str) and
each value is the new status (str) for that PDF.
"""
self.cursor.executemany("""
UPDATE pdfs UPDATE pdfs
SET status = ? SET status = ?
WHERE s3_path = ? WHERE s3_path = ?
""", (new_status, s3_path)) """, [(new_status, s3_path) for s3_path, new_status in status_updates.items()])
self.conn.commit() self.conn.commit()
def get_pdf(self, s3_path: str) -> Optional[PDFRecord]: def get_pdf(self, s3_path: str) -> Optional[PDFRecord]:
@@ -569,9 +575,7 @@ def build_dolma_doc(s3_workspace: str, pdf: DatabaseManager.PDFRecord) -> Option
def mark_pdfs_done(s3_workspace: str, dolma_docs: list[dict]): def mark_pdfs_done(s3_workspace: str, dolma_docs: list[dict]):
db = DatabaseManager(s3_workspace, skip_init=True) db = DatabaseManager(s3_workspace, skip_init=True)
db.update_pdf_statuses({doc["metadata"]["Source-File"]: "completed" for doc in dolma_docs})
for doc in dolma_docs:
db.update_pdf_status(doc["metadata"]["Source-File"], "completed")
def get_current_round(s3_workspace: str) -> int: def get_current_round(s3_workspace: str) -> int:
path = s3_workspace[5:] path = s3_workspace[5:]