mirror of
https://github.com/allenai/olmocr.git
synced 2025-10-11 16:22:29 +00:00
Formatting, fixes to annotation tool
This commit is contained in:
parent
a74800f528
commit
a13a50143a
@ -1,6 +1,7 @@
|
|||||||
def build_basic_prompt() -> str:
    """Return the fixed instruction prompt asking for a Markdown transcription.

    The prompt asks for plain-text Markdown output, LaTeX delimiters for
    inline and display math, and Markdown tables.

    Returns:
        The prompt text as a single-line string.
    """
    # The math-delimiter segments are raw strings: "\(" and "\[" are not valid
    # escape sequences, so a normal literal triggers an invalid-escape warning
    # on current Python versions. The resulting text is unchanged.
    return (
        "Please provide a natural, plain text representation of the document, "
        "formatted in Markdown. "
        r"For mathematical expressions, use LaTeX notation with \( and \) for "
        r"inline equations and \[ and \] for display equations. "
        "Convert any tables into Markdown format."
    )
|
||||||
|
|
||||||
|
|
||||||
def claude_response_format_schema() -> dict:
|
def claude_response_format_schema() -> dict:
|
||||||
return (
|
return (
|
||||||
{
|
{
|
||||||
|
@ -1,4 +1,3 @@
|
|||||||
|
|
||||||
import httpx
|
import httpx
|
||||||
|
|
||||||
from olmocr.data.renderpdf import render_pdf_to_base64png
|
from olmocr.data.renderpdf import render_pdf_to_base64png
|
||||||
|
@ -582,7 +582,7 @@ def create_html_output(random_pages, pdf_s3_client, output_path, workspace_path,
|
|||||||
<ul>
|
<ul>
|
||||||
<li><strong>Yes</strong> - If the document appears to be a publication, research paper, public information, etc.</li>
|
<li><strong>Yes</strong> - If the document appears to be a publication, research paper, public information, etc.</li>
|
||||||
<li><strong>No</strong> - If the document appears to be private, personal, or not intended for public release</li>
|
<li><strong>No</strong> - If the document appears to be private, personal, or not intended for public release</li>
|
||||||
<li><strong>I cannot read it</strong> - If you are unable to read the page (e.g., foreign language, poor quality)</li>
|
<li><strong>Cannot Read</strong> - If you are unable to read the page (e.g., foreign language, poor quality)</li>
|
||||||
<li><strong>Report Content</strong> - If the content is inappropriate or disturbing</li>
|
<li><strong>Report Content</strong> - If the content is inappropriate or disturbing</li>
|
||||||
</ul>
|
</ul>
|
||||||
|
|
||||||
@ -669,9 +669,7 @@ def create_html_output(random_pages, pdf_s3_client, output_path, workspace_path,
|
|||||||
html_content += f"""
|
html_content += f"""
|
||||||
<div class="page-container" data-index="{i}">
|
<div class="page-container" data-index="{i}">
|
||||||
<div class="page-info">
|
<div class="page-info">
|
||||||
<h2 title="{pdf_path}">{original_url}</h2>
|
<p>{f'<a href="{presigned_url}#page={page_num}" target="_blank">View Cached PDF (page {page_num})</a>' if presigned_url else pdf_path}</p>
|
||||||
<p>Page {page_num}</p>
|
|
||||||
<p>{f'<a href="{presigned_url}" target="_blank">View Cached PDF</a>' if presigned_url else pdf_path}</p>
|
|
||||||
<p>
|
<p>
|
||||||
Status: <span class="annotation-status status-pending" id="status-{i}">Pending</span>
|
Status: <span class="annotation-status status-pending" id="status-{i}">Pending</span>
|
||||||
</p>
|
</p>
|
||||||
@ -685,7 +683,7 @@ def create_html_output(random_pages, pdf_s3_client, output_path, workspace_path,
|
|||||||
<span class="btn-group">
|
<span class="btn-group">
|
||||||
<button type="button" class="toggle-button primary-option" data-value="yes-public" onclick="togglePrimaryOption(this, {i})">Yes</button>
|
<button type="button" class="toggle-button primary-option" data-value="yes-public" onclick="togglePrimaryOption(this, {i})">Yes</button>
|
||||||
<button type="button" class="toggle-button primary-option" data-value="no-public" onclick="togglePrimaryOption(this, {i})">No</button>
|
<button type="button" class="toggle-button primary-option" data-value="no-public" onclick="togglePrimaryOption(this, {i})">No</button>
|
||||||
<button type="button" class="toggle-button primary-option" data-value="cannot-read" onclick="togglePrimaryOption(this, {i})">I cannot read it</button>
|
<button type="button" class="toggle-button primary-option" data-value="cannot-read" onclick="togglePrimaryOption(this, {i})">Cannot Read</button>
|
||||||
<button type="button" class="toggle-button primary-option" data-value="report-content" onclick="togglePrimaryOption(this, {i})">Report Content</button>
|
<button type="button" class="toggle-button primary-option" data-value="report-content" onclick="togglePrimaryOption(this, {i})">Report Content</button>
|
||||||
</span>
|
</span>
|
||||||
</div>
|
</div>
|
||||||
@ -700,7 +698,7 @@ def create_html_output(random_pages, pdf_s3_client, output_path, workspace_path,
|
|||||||
<label><input type="checkbox" class="pii-checkbox" data-value="other" onchange="toggleOtherTextarea(this)"> Other</label>
|
<label><input type="checkbox" class="pii-checkbox" data-value="other" onchange="toggleOtherTextarea(this)"> Other</label>
|
||||||
</div>
|
</div>
|
||||||
<textarea id="other-pii-public-{i}" placeholder="Describe other PII found in the document" style="display: none;" onchange="saveFeedback(this)" onkeydown="handleTextareaKeydown(event, this)"></textarea>
|
<textarea id="other-pii-public-{i}" placeholder="Describe other PII found in the document" style="display: none;" onchange="saveFeedback(this)" onkeydown="handleTextareaKeydown(event, this)"></textarea>
|
||||||
<button type="button" class="continue-button" onclick="goToNextDocument()">Continue</button>
|
<button type="button" class="continue-button" onclick="saveThenNext(this)">Continue</button>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<div class="question-container" id="private-pii-options-{i}" style="display: none; margin-top: 1rem;">
|
<div class="question-container" id="private-pii-options-{i}" style="display: none; margin-top: 1rem;">
|
||||||
@ -717,7 +715,7 @@ def create_html_output(random_pages, pdf_s3_client, output_path, workspace_path,
|
|||||||
<label><input type="checkbox" class="pii-checkbox" data-value="other" onchange="toggleOtherTextarea(this)"> Other</label>
|
<label><input type="checkbox" class="pii-checkbox" data-value="other" onchange="toggleOtherTextarea(this)"> Other</label>
|
||||||
</div>
|
</div>
|
||||||
<textarea id="other-pii-private-{i}" placeholder="Describe other PII found in the document" style="display: none;" onchange="saveFeedback(this)" onkeydown="handleTextareaKeydown(event, this)"></textarea>
|
<textarea id="other-pii-private-{i}" placeholder="Describe other PII found in the document" style="display: none;" onchange="saveFeedback(this)" onkeydown="handleTextareaKeydown(event, this)"></textarea>
|
||||||
<button type="button" class="continue-button" onclick="goToNextDocument()">Continue</button>
|
<button type="button" class="continue-button" onclick="saveThenNext(this)">Continue</button>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
@ -733,9 +731,7 @@ def create_html_output(random_pages, pdf_s3_client, output_path, workspace_path,
|
|||||||
html_content += f"""
|
html_content += f"""
|
||||||
<div class="page-container" data-index="{i}">
|
<div class="page-container" data-index="{i}">
|
||||||
<div class="page-info">
|
<div class="page-info">
|
||||||
<h2 title="{pdf_path}">original_url</h2>
|
<p>{f'<a href="{presigned_url}#page={page_num}" target="_blank">View Cached PDF (page {page_num})</a>' if presigned_url else pdf_path}</p>
|
||||||
<p>Page {page_num}</p>
|
|
||||||
<p>{f'<a href="{presigned_url}" target="_blank">View Cached PDF</a>' if presigned_url else pdf_path}</p>
|
|
||||||
<p>
|
<p>
|
||||||
Status: <span class="annotation-status status-pending" id="status-{i}">Pending</span>
|
Status: <span class="annotation-status status-pending" id="status-{i}">Pending</span>
|
||||||
</p>
|
</p>
|
||||||
@ -747,7 +743,7 @@ def create_html_output(random_pages, pdf_s3_client, output_path, workspace_path,
|
|||||||
<span class="btn-group">
|
<span class="btn-group">
|
||||||
<button type="button" class="toggle-button primary-option" data-value="yes-public" onclick="togglePrimaryOption(this, {i})">Yes</button>
|
<button type="button" class="toggle-button primary-option" data-value="yes-public" onclick="togglePrimaryOption(this, {i})">Yes</button>
|
||||||
<button type="button" class="toggle-button primary-option" data-value="no-public" onclick="togglePrimaryOption(this, {i})">No</button>
|
<button type="button" class="toggle-button primary-option" data-value="no-public" onclick="togglePrimaryOption(this, {i})">No</button>
|
||||||
<button type="button" class="toggle-button primary-option" data-value="cannot-read" onclick="togglePrimaryOption(this, {i})">I cannot read it</button>
|
<button type="button" class="toggle-button primary-option" data-value="cannot-read" onclick="togglePrimaryOption(this, {i})">Cannot Read</button>
|
||||||
<button type="button" class="toggle-button primary-option" data-value="report-content" onclick="togglePrimaryOption(this, {i})">Report Content</button>
|
<button type="button" class="toggle-button primary-option" data-value="report-content" onclick="togglePrimaryOption(this, {i})">Report Content</button>
|
||||||
</span>
|
</span>
|
||||||
</div>
|
</div>
|
||||||
@ -762,7 +758,7 @@ def create_html_output(random_pages, pdf_s3_client, output_path, workspace_path,
|
|||||||
<label><input type="checkbox" class="pii-checkbox" data-value="other" onchange="toggleOtherTextarea(this)"> Other</label>
|
<label><input type="checkbox" class="pii-checkbox" data-value="other" onchange="toggleOtherTextarea(this)"> Other</label>
|
||||||
</div>
|
</div>
|
||||||
<textarea id="other-pii-public-{i}" placeholder="Describe other PII found in the document" style="display: none;" onchange="saveFeedback(this)" onkeydown="handleTextareaKeydown(event, this)"></textarea>
|
<textarea id="other-pii-public-{i}" placeholder="Describe other PII found in the document" style="display: none;" onchange="saveFeedback(this)" onkeydown="handleTextareaKeydown(event, this)"></textarea>
|
||||||
<button type="button" class="continue-button" onclick="goToNextDocument()">Continue</button>
|
<button type="button" class="continue-button" onclick="saveThenNext(this)">Continue</button>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<div class="question-container" id="private-pii-options-{i}" style="display: none; margin-top: 1rem;">
|
<div class="question-container" id="private-pii-options-{i}" style="display: none; margin-top: 1rem;">
|
||||||
@ -779,7 +775,7 @@ def create_html_output(random_pages, pdf_s3_client, output_path, workspace_path,
|
|||||||
<label><input type="checkbox" class="pii-checkbox" data-value="other" onchange="toggleOtherTextarea(this)"> Other</label>
|
<label><input type="checkbox" class="pii-checkbox" data-value="other" onchange="toggleOtherTextarea(this)"> Other</label>
|
||||||
</div>
|
</div>
|
||||||
<textarea id="other-pii-private-{i}" placeholder="Describe other PII found in the document" style="display: none;" onchange="saveFeedback(this)" onkeydown="handleTextareaKeydown(event, this)"></textarea>
|
<textarea id="other-pii-private-{i}" placeholder="Describe other PII found in the document" style="display: none;" onchange="saveFeedback(this)" onkeydown="handleTextareaKeydown(event, this)"></textarea>
|
||||||
<button type="button" class="continue-button" onclick="goToNextDocument()">Continue</button>
|
<button type="button" class="continue-button" onclick="saveThenNext(this)">Continue</button>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
@ -911,8 +907,9 @@ def create_html_output(random_pages, pdf_s3_client, output_path, workspace_path,
|
|||||||
// If Enter key is pressed and not with Shift key, move to next document
|
// If Enter key is pressed and not with Shift key, move to next document
|
||||||
if (event.key === 'Enter' && !event.shiftKey) {
|
if (event.key === 'Enter' && !event.shiftKey) {
|
||||||
event.preventDefault();
|
event.preventDefault();
|
||||||
saveFeedback(textarea);
|
saveFeedback(textarea).then(() => {
|
||||||
goToNextDocument();
|
goToNextDocument();
|
||||||
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -955,6 +952,13 @@ def create_html_output(random_pages, pdf_s3_client, output_path, workspace_path,
|
|||||||
await putDatastore(datastore);
|
await putDatastore(datastore);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Click handler for the "Continue" buttons: save the answers of the annotation
// interface that contains `btn`, then advance to the next document once the
// promise returned by saveFeedback has resolved.
function saveThenNext(btn) {
    const pendingSave = saveFeedback(btn.closest('.annotation-interface'));
    pendingSave.then(() => {
        goToNextDocument();
    });
}
|
||||||
|
|
||||||
function togglePrimaryOption(btn, index) {
|
function togglePrimaryOption(btn, index) {
|
||||||
const interfaceDiv = btn.closest('.annotation-interface');
|
const interfaceDiv = btn.closest('.annotation-interface');
|
||||||
// Remove active class from all primary option buttons in this group
|
// Remove active class from all primary option buttons in this group
|
||||||
@ -969,6 +973,9 @@ def create_html_output(random_pages, pdf_s3_client, output_path, workspace_path,
|
|||||||
document.querySelector(`#public-pii-options-${index}`).style.display = 'none';
|
document.querySelector(`#public-pii-options-${index}`).style.display = 'none';
|
||||||
document.querySelector(`#private-pii-options-${index}`).style.display = 'none';
|
document.querySelector(`#private-pii-options-${index}`).style.display = 'none';
|
||||||
|
|
||||||
|
// Immediately save the primary option selection
|
||||||
|
saveFeedback(interfaceDiv);
|
||||||
|
|
||||||
const option = btn.getAttribute('data-value');
|
const option = btn.getAttribute('data-value');
|
||||||
|
|
||||||
// Show the appropriate secondary options based on the selected primary option
|
// Show the appropriate secondary options based on the selected primary option
|
||||||
@ -978,7 +985,6 @@ def create_html_output(random_pages, pdf_s3_client, output_path, workspace_path,
|
|||||||
document.querySelector(`#private-pii-options-${index}`).style.display = 'block';
|
document.querySelector(`#private-pii-options-${index}`).style.display = 'block';
|
||||||
} else {
|
} else {
|
||||||
// For "cannot-read" or "report-content", just save and move to next
|
// For "cannot-read" or "report-content", just save and move to next
|
||||||
saveFeedback(interfaceDiv);
|
|
||||||
goToNextDocument();
|
goToNextDocument();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -1000,7 +1006,7 @@ def create_html_output(random_pages, pdf_s3_client, output_path, workspace_path,
|
|||||||
|
|
||||||
// Save the annotation state of the interface that contains `input`.
// The result of saveFeedback is returned rather than dropped: other handlers
// in this script chain .then() on saveFeedback, so returning it lets callers
// wait for the save to finish before doing anything else.
function saveCheckboxes(input) {
    const interfaceDiv = input.closest('.annotation-interface');
    return saveFeedback(interfaceDiv);
}
|
||||||
|
|
||||||
// Function to deobfuscate the Prolific code
|
// Function to deobfuscate the Prolific code
|
||||||
@ -1223,80 +1229,62 @@ def process_annotations(annotations_by_link: List[Tuple[Dict[str, Any], str]]) -
|
|||||||
|
|
||||||
primary_option = annotation["primaryOption"]
|
primary_option = annotation["primaryOption"]
|
||||||
pdf_path = annotation.get("pdfPath", "Unknown")
|
pdf_path = annotation.get("pdfPath", "Unknown")
|
||||||
|
|
||||||
# Build a result item based on the new annotation structure
|
# Build a result item based on the new annotation structure
|
||||||
if primary_option == "yes-public":
|
if primary_option == "yes-public":
|
||||||
# Public document with potential PII
|
# Public document with potential PII
|
||||||
public_pii_options = annotation.get("publicPiiOptions", [])
|
public_pii_options = annotation.get("publicPiiOptions", [])
|
||||||
other_desc = annotation.get("otherPublicDesc", "")
|
other_desc = annotation.get("otherPublicDesc", "")
|
||||||
|
|
||||||
if not public_pii_options:
|
if not public_pii_options:
|
||||||
# No PII selected in a public document
|
# No PII selected in a public document
|
||||||
results["public_document"].append({
|
results["public_document"].append(
|
||||||
"page_id": page_id,
|
{"page_id": page_id, "link": link, "pdf_path": pdf_path, "pii_types": [], "has_pii": False, "description": ""}
|
||||||
"link": link,
|
)
|
||||||
"pdf_path": pdf_path,
|
|
||||||
"pii_types": [],
|
|
||||||
"has_pii": False,
|
|
||||||
"description": ""
|
|
||||||
})
|
|
||||||
else:
|
else:
|
||||||
# PII found in a public document
|
# PII found in a public document
|
||||||
results["public_document"].append({
|
results["public_document"].append(
|
||||||
"page_id": page_id,
|
{
|
||||||
"link": link,
|
"page_id": page_id,
|
||||||
"pdf_path": pdf_path,
|
"link": link,
|
||||||
"pii_types": public_pii_options,
|
"pdf_path": pdf_path,
|
||||||
"has_pii": True,
|
"pii_types": public_pii_options,
|
||||||
"description": other_desc if "other" in public_pii_options else ""
|
"has_pii": True,
|
||||||
})
|
"description": other_desc if "other" in public_pii_options else "",
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
elif primary_option == "no-public":
|
elif primary_option == "no-public":
|
||||||
# Private document with potential PII
|
# Private document with potential PII
|
||||||
private_pii_options = annotation.get("privatePiiOptions", [])
|
private_pii_options = annotation.get("privatePiiOptions", [])
|
||||||
other_desc = annotation.get("otherPrivateDesc", "")
|
other_desc = annotation.get("otherPrivateDesc", "")
|
||||||
|
|
||||||
if not private_pii_options:
|
if not private_pii_options:
|
||||||
# No PII selected in a private document
|
# No PII selected in a private document
|
||||||
results["private_document"].append({
|
results["private_document"].append(
|
||||||
"page_id": page_id,
|
{"page_id": page_id, "link": link, "pdf_path": pdf_path, "pii_types": [], "has_pii": False, "description": ""}
|
||||||
"link": link,
|
)
|
||||||
"pdf_path": pdf_path,
|
|
||||||
"pii_types": [],
|
|
||||||
"has_pii": False,
|
|
||||||
"description": ""
|
|
||||||
})
|
|
||||||
else:
|
else:
|
||||||
# PII found in a private document
|
# PII found in a private document
|
||||||
results["private_document"].append({
|
results["private_document"].append(
|
||||||
"page_id": page_id,
|
{
|
||||||
"link": link,
|
"page_id": page_id,
|
||||||
"pdf_path": pdf_path,
|
"link": link,
|
||||||
"pii_types": private_pii_options,
|
"pdf_path": pdf_path,
|
||||||
"has_pii": True,
|
"pii_types": private_pii_options,
|
||||||
"description": other_desc if "other" in private_pii_options else ""
|
"has_pii": True,
|
||||||
})
|
"description": other_desc if "other" in private_pii_options else "",
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
elif primary_option == "cannot-read":
|
elif primary_option == "cannot-read":
|
||||||
results["cannot_read"].append({
|
results["cannot_read"].append({"page_id": page_id, "link": link, "pdf_path": pdf_path})
|
||||||
"page_id": page_id,
|
|
||||||
"link": link,
|
|
||||||
"pdf_path": pdf_path
|
|
||||||
})
|
|
||||||
|
|
||||||
elif primary_option == "report-content":
|
elif primary_option == "report-content":
|
||||||
results["report_content"].append({
|
results["report_content"].append({"page_id": page_id, "link": link, "pdf_path": pdf_path})
|
||||||
"page_id": page_id,
|
|
||||||
"link": link,
|
|
||||||
"pdf_path": pdf_path
|
|
||||||
})
|
|
||||||
|
|
||||||
else:
|
else:
|
||||||
results["no_annotation"].append({
|
results["no_annotation"].append({"page_id": page_id, "link": link, "pdf_path": pdf_path})
|
||||||
"page_id": page_id,
|
|
||||||
"link": link,
|
|
||||||
"pdf_path": pdf_path
|
|
||||||
})
|
|
||||||
|
|
||||||
return results
|
return results
|
||||||
|
|
||||||
@ -1310,23 +1298,31 @@ def print_annotation_report(annotation_results: Dict[str, List[Dict[str, Any]]])
|
|||||||
print("=" * 80)
|
print("=" * 80)
|
||||||
|
|
||||||
# Count pages with PII in public documents
|
# Count pages with PII in public documents
|
||||||
public_with_pii = [page for page in annotation_results['public_document'] if page.get('has_pii', False)]
|
public_with_pii = [page for page in annotation_results["public_document"] if page.get("has_pii", False)]
|
||||||
public_without_pii = [page for page in annotation_results['public_document'] if not page.get('has_pii', False)]
|
public_without_pii = [page for page in annotation_results["public_document"] if not page.get("has_pii", False)]
|
||||||
|
|
||||||
# Count pages with PII in private documents
|
# Count pages with PII in private documents
|
||||||
private_with_pii = [page for page in annotation_results['private_document'] if page.get('has_pii', False)]
|
private_with_pii = [page for page in annotation_results["private_document"] if page.get("has_pii", False)]
|
||||||
private_without_pii = [page for page in annotation_results['private_document'] if not page.get('has_pii', False)]
|
private_without_pii = [page for page in annotation_results["private_document"] if not page.get("has_pii", False)]
|
||||||
|
|
||||||
# Print summary statistics
|
# Print summary statistics
|
||||||
print("\nSummary:")
|
print("\nSummary:")
|
||||||
print(f" Public documents (total): {len(annotation_results['public_document'])} ({len(annotation_results['public_document'])/total_pages*100:.1f}% of all pages)")
|
print(
|
||||||
|
f" Public documents (total): {len(annotation_results['public_document'])} ({len(annotation_results['public_document'])/total_pages*100:.1f}% of all pages)"
|
||||||
|
)
|
||||||
print(f" - With PII: {len(public_with_pii)} ({len(public_with_pii)/max(1, len(annotation_results['public_document']))*100:.1f}% of public docs)")
|
print(f" - With PII: {len(public_with_pii)} ({len(public_with_pii)/max(1, len(annotation_results['public_document']))*100:.1f}% of public docs)")
|
||||||
print(f" - Without PII: {len(public_without_pii)} ({len(public_without_pii)/max(1, len(annotation_results['public_document']))*100:.1f}% of public docs)")
|
print(
|
||||||
|
f" - Without PII: {len(public_without_pii)} ({len(public_without_pii)/max(1, len(annotation_results['public_document']))*100:.1f}% of public docs)"
|
||||||
print(f" Private documents (total): {len(annotation_results['private_document'])} ({len(annotation_results['private_document'])/total_pages*100:.1f}% of all pages)")
|
)
|
||||||
|
|
||||||
|
print(
|
||||||
|
f" Private documents (total): {len(annotation_results['private_document'])} ({len(annotation_results['private_document'])/total_pages*100:.1f}% of all pages)"
|
||||||
|
)
|
||||||
print(f" - With PII: {len(private_with_pii)} ({len(private_with_pii)/max(1, len(annotation_results['private_document']))*100:.1f}% of private docs)")
|
print(f" - With PII: {len(private_with_pii)} ({len(private_with_pii)/max(1, len(annotation_results['private_document']))*100:.1f}% of private docs)")
|
||||||
print(f" - Without PII: {len(private_without_pii)} ({len(private_without_pii)/max(1, len(annotation_results['private_document']))*100:.1f}% of private docs)")
|
print(
|
||||||
|
f" - Without PII: {len(private_without_pii)} ({len(private_without_pii)/max(1, len(annotation_results['private_document']))*100:.1f}% of private docs)"
|
||||||
|
)
|
||||||
|
|
||||||
print(f" Unreadable pages: {len(annotation_results['cannot_read'])} ({len(annotation_results['cannot_read'])/total_pages*100:.1f}%)")
|
print(f" Unreadable pages: {len(annotation_results['cannot_read'])} ({len(annotation_results['cannot_read'])/total_pages*100:.1f}%)")
|
||||||
print(f" Pages with reported content: {len(annotation_results['report_content'])} ({len(annotation_results['report_content'])/total_pages*100:.1f}%)")
|
print(f" Pages with reported content: {len(annotation_results['report_content'])} ({len(annotation_results['report_content'])/total_pages*100:.1f}%)")
|
||||||
print(f" Pages without annotation: {len(annotation_results['no_annotation'])} ({len(annotation_results['no_annotation'])/total_pages*100:.1f}%)")
|
print(f" Pages without annotation: {len(annotation_results['no_annotation'])} ({len(annotation_results['no_annotation'])/total_pages*100:.1f}%)")
|
||||||
@ -1335,9 +1331,9 @@ def print_annotation_report(annotation_results: Dict[str, List[Dict[str, Any]]])
|
|||||||
if public_with_pii:
|
if public_with_pii:
|
||||||
pii_counts_public = {}
|
pii_counts_public = {}
|
||||||
for page in public_with_pii:
|
for page in public_with_pii:
|
||||||
for pii_type in page.get('pii_types', []):
|
for pii_type in page.get("pii_types", []):
|
||||||
pii_counts_public[pii_type] = pii_counts_public.get(pii_type, 0) + 1
|
pii_counts_public[pii_type] = pii_counts_public.get(pii_type, 0) + 1
|
||||||
|
|
||||||
print("\nPII Types in Public Documents:")
|
print("\nPII Types in Public Documents:")
|
||||||
for pii_type, count in sorted(pii_counts_public.items(), key=lambda x: x[1], reverse=True):
|
for pii_type, count in sorted(pii_counts_public.items(), key=lambda x: x[1], reverse=True):
|
||||||
print(f" - {pii_type}: {count} ({count/len(public_with_pii)*100:.1f}%)")
|
print(f" - {pii_type}: {count} ({count/len(public_with_pii)*100:.1f}%)")
|
||||||
@ -1346,9 +1342,9 @@ def print_annotation_report(annotation_results: Dict[str, List[Dict[str, Any]]])
|
|||||||
if private_with_pii:
|
if private_with_pii:
|
||||||
pii_counts_private = {}
|
pii_counts_private = {}
|
||||||
for page in private_with_pii:
|
for page in private_with_pii:
|
||||||
for pii_type in page.get('pii_types', []):
|
for pii_type in page.get("pii_types", []):
|
||||||
pii_counts_private[pii_type] = pii_counts_private.get(pii_type, 0) + 1
|
pii_counts_private[pii_type] = pii_counts_private.get(pii_type, 0) + 1
|
||||||
|
|
||||||
print("\nPII Types in Private Documents:")
|
print("\nPII Types in Private Documents:")
|
||||||
for pii_type, count in sorted(pii_counts_private.items(), key=lambda x: x[1], reverse=True):
|
for pii_type, count in sorted(pii_counts_private.items(), key=lambda x: x[1], reverse=True):
|
||||||
print(f" - {pii_type}: {count} ({count/len(private_with_pii)*100:.1f}%)")
|
print(f" - {pii_type}: {count} ({count/len(private_with_pii)*100:.1f}%)")
|
||||||
@ -1362,7 +1358,7 @@ def print_annotation_report(annotation_results: Dict[str, List[Dict[str, Any]]])
|
|||||||
print(f" Page ID: {item['page_id']}")
|
print(f" Page ID: {item['page_id']}")
|
||||||
print(f" Link: {item['link']}#{item['page_id']}")
|
print(f" Link: {item['link']}#{item['page_id']}")
|
||||||
print(f" PII Types: {', '.join(item['pii_types'])}")
|
print(f" PII Types: {', '.join(item['pii_types'])}")
|
||||||
if item.get('description'):
|
if item.get("description"):
|
||||||
print(f" Description: {item['description']}")
|
print(f" Description: {item['description']}")
|
||||||
print("-" * 80)
|
print("-" * 80)
|
||||||
|
|
||||||
@ -1375,7 +1371,7 @@ def print_annotation_report(annotation_results: Dict[str, List[Dict[str, Any]]])
|
|||||||
print(f" Page ID: {item['page_id']}")
|
print(f" Page ID: {item['page_id']}")
|
||||||
print(f" Link: {item['link']}#{item['page_id']}")
|
print(f" Link: {item['link']}#{item['page_id']}")
|
||||||
print(f" PII Types: {', '.join(item['pii_types'])}")
|
print(f" PII Types: {', '.join(item['pii_types'])}")
|
||||||
if item.get('description'):
|
if item.get("description"):
|
||||||
print(f" Description: {item['description']}")
|
print(f" Description: {item['description']}")
|
||||||
print("-" * 80)
|
print("-" * 80)
|
||||||
|
|
||||||
@ -1435,16 +1431,8 @@ def read_and_process_results(args):
|
|||||||
doc_type = ""
|
doc_type = ""
|
||||||
pii_types = ""
|
pii_types = ""
|
||||||
description = ""
|
description = ""
|
||||||
|
|
||||||
writer.writerow([
|
writer.writerow([category, item["pdf_path"], item["page_id"], f"{item['link']}#{item['page_id']}", doc_type, pii_types, description])
|
||||||
category,
|
|
||||||
item["pdf_path"],
|
|
||||||
item["page_id"],
|
|
||||||
f"{item['link']}#{item['page_id']}",
|
|
||||||
doc_type,
|
|
||||||
pii_types,
|
|
||||||
description
|
|
||||||
])
|
|
||||||
|
|
||||||
print(f"Report saved to {output_file}")
|
print(f"Report saved to {output_file}")
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user