mirror of
https://github.com/allenai/olmocr.git
synced 2025-11-16 18:39:29 +00:00
Merge branch 'main' of https://github.com/allenai/olmocr into main
This commit is contained in:
commit
cf0d07d8d7
@ -243,6 +243,9 @@ def create_html_output(random_pages, pdf_s3_client, output_path, workspace_path,
|
|||||||
color: var(--text-color);
|
color: var(--text-color);
|
||||||
background-color: var(--bg-color);
|
background-color: var(--bg-color);
|
||||||
padding: 2rem;
|
padding: 2rem;
|
||||||
|
display: flex;
|
||||||
|
flex-direction: row;
|
||||||
|
gap: 2rem;
|
||||||
}}
|
}}
|
||||||
|
|
||||||
ul {{
|
ul {{
|
||||||
@ -250,15 +253,23 @@ def create_html_output(random_pages, pdf_s3_client, output_path, workspace_path,
|
|||||||
}}
|
}}
|
||||||
|
|
||||||
.container {{
|
.container {{
|
||||||
max-width: 1200px;
|
flex: 2;
|
||||||
margin: 0 auto;
|
max-width: 750px;
|
||||||
}}
|
}}
|
||||||
|
|
||||||
header {{
|
header {{
|
||||||
margin-bottom: 2rem;
|
position: sticky;
|
||||||
border-bottom: 1px solid var(--border-color);
|
top: 2rem;
|
||||||
padding-bottom: 1rem;
|
flex: 1;
|
||||||
|
min-width: 380px;
|
||||||
|
max-width: 420px;
|
||||||
|
max-height: calc(100vh - 4rem);
|
||||||
|
overflow-y: auto;
|
||||||
|
padding: 1.5rem;
|
||||||
|
background-color: white;
|
||||||
|
border-radius: 0.5rem;
|
||||||
|
box-shadow: var(--card-shadow);
|
||||||
|
align-self: flex-start;
|
||||||
font-size: small;
|
font-size: small;
|
||||||
}}
|
}}
|
||||||
|
|
||||||
@ -296,7 +307,7 @@ def create_html_output(random_pages, pdf_s3_client, output_path, workspace_path,
|
|||||||
|
|
||||||
.page-grid {{
|
.page-grid {{
|
||||||
display: grid;
|
display: grid;
|
||||||
grid-template-columns: repeat(2, 1fr);
|
grid-template-columns: 1fr;
|
||||||
gap: 2rem;
|
gap: 2rem;
|
||||||
}}
|
}}
|
||||||
|
|
||||||
@ -377,6 +388,15 @@ def create_html_output(random_pages, pdf_s3_client, output_path, workspace_path,
|
|||||||
display: block; /* Show only the active annotation interface */
|
display: block; /* Show only the active annotation interface */
|
||||||
}}
|
}}
|
||||||
|
|
||||||
|
.question-container {{
|
||||||
|
margin-bottom: 1rem;
|
||||||
|
}}
|
||||||
|
|
||||||
|
.question-text {{
|
||||||
|
font-weight: 500;
|
||||||
|
margin-bottom: 0.5rem;
|
||||||
|
}}
|
||||||
|
|
||||||
/* Button group styling for connected buttons */
|
/* Button group styling for connected buttons */
|
||||||
.btn-group {{
|
.btn-group {{
|
||||||
display: inline-flex;
|
display: inline-flex;
|
||||||
@ -413,10 +433,50 @@ def create_html_output(random_pages, pdf_s3_client, output_path, workspace_path,
|
|||||||
color: white;
|
color: white;
|
||||||
}}
|
}}
|
||||||
|
|
||||||
|
.checkbox-group {{
|
||||||
|
display: flex;
|
||||||
|
flex-wrap: wrap;
|
||||||
|
gap: 0.5rem;
|
||||||
|
margin-bottom: 1rem;
|
||||||
|
}}
|
||||||
|
|
||||||
|
.checkbox-group label {{
|
||||||
|
display: flex;
|
||||||
|
align-items: center;
|
||||||
|
padding: 0.25rem 0.5rem;
|
||||||
|
background-color: #f1f5f9;
|
||||||
|
border-radius: 0.25rem;
|
||||||
|
cursor: pointer;
|
||||||
|
font-size: 0.875rem;
|
||||||
|
}}
|
||||||
|
|
||||||
|
.checkbox-group label:hover {{
|
||||||
|
background-color: #e2e8f0;
|
||||||
|
}}
|
||||||
|
|
||||||
|
.checkbox-group input[type="checkbox"] {{
|
||||||
|
margin-right: 0.5rem;
|
||||||
|
}}
|
||||||
|
|
||||||
|
.continue-button {{
|
||||||
|
padding: 0.5rem 1rem;
|
||||||
|
background-color: var(--primary-color);
|
||||||
|
color: white;
|
||||||
|
border: none;
|
||||||
|
border-radius: 0.25rem;
|
||||||
|
cursor: pointer;
|
||||||
|
font-weight: 500;
|
||||||
|
}}
|
||||||
|
|
||||||
|
.continue-button:hover {{
|
||||||
|
background-color: #1d4ed8;
|
||||||
|
}}
|
||||||
|
|
||||||
.annotation-interface textarea {{
|
.annotation-interface textarea {{
|
||||||
display: none; /* Hide textarea by default */
|
display: none; /* Hide textarea by default */
|
||||||
width: 100%;
|
width: 100%;
|
||||||
margin-top: 0.5rem;
|
margin-top: 0.5rem;
|
||||||
|
margin-bottom: 1rem;
|
||||||
padding: 0.5rem;
|
padding: 0.5rem;
|
||||||
font-size: 0.875rem;
|
font-size: 0.875rem;
|
||||||
border: 1px solid var(--border-color);
|
border: 1px solid var(--border-color);
|
||||||
@ -494,53 +554,71 @@ def create_html_output(random_pages, pdf_s3_client, output_path, workspace_path,
|
|||||||
@media (max-width: 768px) {{
|
@media (max-width: 768px) {{
|
||||||
body {{
|
body {{
|
||||||
padding: 1rem;
|
padding: 1rem;
|
||||||
|
flex-direction: column;
|
||||||
}}
|
}}
|
||||||
|
|
||||||
.page-grid {{
|
header {{
|
||||||
grid-template-columns: 1fr;
|
position: static;
|
||||||
|
max-width: 100%;
|
||||||
|
margin-left: 0;
|
||||||
|
margin-bottom: 2rem;
|
||||||
|
}}
|
||||||
|
|
||||||
|
.container {{
|
||||||
|
max-width: 100%;
|
||||||
}}
|
}}
|
||||||
}}
|
}}
|
||||||
</style>
|
</style>
|
||||||
</head>
|
</head>
|
||||||
<body>
|
<body>
|
||||||
<div class="container">
|
<header>
|
||||||
<header>
|
|
||||||
<h2>Task Instructions</h2>
|
<h2>Task Instructions</h2>
|
||||||
<p>Your task is to review {len(random_pages)} document pages and determine whether they contain any <strong>Personally Identifiable Information (PII)</strong>. Carefully but efficiently inspect each page and select the appropriate response. You do not need to read every word - quickly scan the page and look for any obvious PII. The time expected to complete this task is 10-15 minutes.</p>
|
<p>Your task is to review {len(random_pages)} document pages and determine whether they contain any <strong>Personally Identifiable Information (PII)</strong>. Carefully but efficiently inspect each page and select the appropriate response. You do not need to read every word - quickly scan the page and look for any obvious PII. The time expected to complete this task is 10-15 minutes.</p>
|
||||||
|
|
||||||
<h2>How to Annotate</h2>
|
<h2>How to Annotate</h2>
|
||||||
<p>The page you are currently annotating will be highlighted with a blue outline and a set of response buttons will be displayed directly below it.</p>
|
<p>The page you are currently annotating will be highlighted with a blue outline and a set of questions will be displayed directly below it.</p>
|
||||||
<br/>
|
<br/>
|
||||||
<p><strong>Yes PII</strong> - Select this if you find any information on the page that qualifies as PII. A text box will appear below - briefly describe the kind of PII you encountered (e.g., full name, social security number, etc.) then press the Enter key.</p>
|
<p><strong>First question:</strong> Is this document meant for public dissemination?</p>
|
||||||
<p><strong>No PII</strong> - Select this if the page does not contain any PII.</p>
|
<ul>
|
||||||
<p><strong>I cannot read this</strong> - Select this if you are unable to read the page for any reason (e.g., written in a language other than English, heavily redacted text, etc.)</p>
|
<li><strong>Yes</strong> - If the document appears to be a publication, research paper, public information, etc.</li>
|
||||||
<p><strong>Disturbing content</strong> - Select this if the page contains disturbing or graphic content.</p>
|
<li><strong>No</strong> - If the document appears to be private, personal, or not intended for public release</li>
|
||||||
|
<li><strong>I cannot read it</strong> - If you are unable to read the page (e.g., foreign language, poor quality)</li>
|
||||||
|
<li><strong>Report Content</strong> - If the content is inappropriate or disturbing</li>
|
||||||
|
</ul>
|
||||||
|
|
||||||
<br/>
|
<p><strong>Second question:</strong> Depending on your first answer, you'll be asked to identify any PII in the document:</p>
|
||||||
<p>You may edit your annotations any time before submitting. To do so, press the green Edit button directly above the page.</p>
|
<ul>
|
||||||
<p>After completing all the document pages on this screen, you will receive a Prolific completion code.</p>
|
<li>For <strong>public</strong> documents, select from: SSN, Bank Info, Credit Card Info, Usernames/Passwords, Other</li>
|
||||||
|
<li>For <strong>private</strong> documents, select from: Full Names, Addresses, Contact Info, Personal Attributes, SSN, Bank Info, Credit Card Info, Usernames/Passwords, Other</li>
|
||||||
|
</ul>
|
||||||
|
<p>You can select multiple PII types. If you select "Other", a text box will appear where you can describe the PII.</p>
|
||||||
|
|
||||||
<h2>What Counts as PII?</h2>
|
<br/>
|
||||||
<ul>
|
<p>You may edit your annotations any time before submitting. To do so, press the green Edit button directly above the page.</p>
|
||||||
<li><strong>Names</strong>: Full names, first names, last names, nicknames, maiden names, aliases</li>
|
<p>After completing all the document pages on this screen, you will receive a Prolific completion code.</p>
|
||||||
<li><strong>Addresses</strong>: Street addresses, postal codes, cities, states, countries</li>
|
|
||||||
<li><strong>Contact Information</strong>: Phone numbers, email addresses</li>
|
|
||||||
<li><strong>Government IDs</strong>: SSNs, passport numbers, driver's license numbers, tax IDs</li>
|
|
||||||
<li><strong>Financial Information</strong>: Credit card numbers, bank account numbers, routing numbers</li>
|
|
||||||
<li><strong>Biometric Data</strong>: Fingerprints, retina scans, facial recognition data, voice signatures</li>
|
|
||||||
<li><strong>Personal Attributes</strong>: Date of birth, place of birth, gender, race, religion</li>
|
|
||||||
<li><strong>Online Identifiers</strong>: IP addresses, login IDs, usernames, passwords, API keys, URLs</li>
|
|
||||||
<li><strong>Location Information</strong>: Geolocations, specific coordinates</li>
|
|
||||||
<li><strong>Employment Information</strong>: Job titles, workplace names, employment history</li>
|
|
||||||
<li><strong>Education Information</strong>: School names, degrees, transcripts</li>
|
|
||||||
<li><strong>Medical Information</strong>: Health records, diagnoses</li>
|
|
||||||
<li><strong>Company Names</strong>: If they are tied to an individual's identity (e.g., a person's personal business)</li>
|
|
||||||
</ul>
|
|
||||||
|
|
||||||
<h2>What NOT to Mark as PII</h2>
|
<h2>What Counts as PII?</h2>
|
||||||
<p><strong>Author names, researcher names, citations, or references from published research papers</strong> should NOT be marked as PII. These names are part of the normal publication process and are not considered private or sensitive information for the purposes of this task.
|
<ul>
|
||||||
Only mark information as PII if it relates to private, sensitive, or personal details about an individual outside the context of the publication.</p>
|
<li><strong>Names</strong>: Full names, first names, last names, nicknames, maiden names, aliases</li>
|
||||||
</header>
|
<li><strong>Addresses</strong>: Street addresses, postal codes, cities, states, countries</li>
|
||||||
|
<li><strong>Contact Information</strong>: Phone numbers, email addresses</li>
|
||||||
|
<li><strong>Government IDs</strong>: SSNs, passport numbers, driver's license numbers, tax IDs</li>
|
||||||
|
<li><strong>Financial Information</strong>: Credit card numbers, bank account numbers, routing numbers</li>
|
||||||
|
<li><strong>Biometric Data</strong>: Fingerprints, retina scans, facial recognition data, voice signatures</li>
|
||||||
|
<li><strong>Personal Attributes</strong>: Date of birth, place of birth, gender, race, religion</li>
|
||||||
|
<li><strong>Online Identifiers</strong>: IP addresses, login IDs, usernames, passwords, API keys, URLs</li>
|
||||||
|
<li><strong>Location Information</strong>: Geolocations, specific coordinates</li>
|
||||||
|
<li><strong>Employment Information</strong>: Job titles, workplace names, employment history</li>
|
||||||
|
<li><strong>Education Information</strong>: School names, degrees, transcripts</li>
|
||||||
|
<li><strong>Medical Information</strong>: Health records, diagnoses</li>
|
||||||
|
<li><strong>Company Names</strong>: If they are tied to an individual's identity (e.g., a person's personal business)</li>
|
||||||
|
</ul>
|
||||||
|
|
||||||
|
<h2>What NOT to Mark as PII</h2>
|
||||||
|
<p><strong>Author names, researcher names, citations, or references from published research papers</strong> should NOT be marked as PII. These names are part of the normal publication process and are not considered private or sensitive information for the purposes of this task.
|
||||||
|
Only mark information as PII if it relates to private, sensitive, or personal details about an individual outside the context of the publication.</p>
|
||||||
|
</header>
|
||||||
|
<div class="container">
|
||||||
|
|
||||||
<div class="info-bar">
|
<div class="info-bar">
|
||||||
<div class="info-item">
|
<div class="info-item">
|
||||||
@ -591,7 +669,7 @@ def create_html_output(random_pages, pdf_s3_client, output_path, workspace_path,
|
|||||||
html_content += f"""
|
html_content += f"""
|
||||||
<div class="page-container" data-index="{i}">
|
<div class="page-container" data-index="{i}">
|
||||||
<div class="page-info">
|
<div class="page-info">
|
||||||
<h2 title="{pdf_path}"><a href="{original_url}" target="_blank">{original_url}</a></h2>
|
<h2 title="{pdf_path}">{original_url}</h2>
|
||||||
<p>Page {page_num}</p>
|
<p>Page {page_num}</p>
|
||||||
<p>{f'<a href="{presigned_url}" target="_blank">View Cached PDF</a>' if presigned_url else pdf_path}</p>
|
<p>{f'<a href="{presigned_url}" target="_blank">View Cached PDF</a>' if presigned_url else pdf_path}</p>
|
||||||
<p>
|
<p>
|
||||||
@ -602,13 +680,45 @@ def create_html_output(random_pages, pdf_s3_client, output_path, workspace_path,
|
|||||||
<img class="page-image" src="data:image/webp;base64,{base64_image}" alt="PDF Page {page_num}" loading="lazy" />
|
<img class="page-image" src="data:image/webp;base64,{base64_image}" alt="PDF Page {page_num}" loading="lazy" />
|
||||||
</div>
|
</div>
|
||||||
<div class="annotation-interface{active_class}" data-id="page-{i}" data-pdf-path="{pdf_path}">
|
<div class="annotation-interface{active_class}" data-id="page-{i}" data-pdf-path="{pdf_path}">
|
||||||
<span class="btn-group">
|
<div class="question-container" id="question1-{i}">
|
||||||
<button type="button" class="toggle-button feedback-option" data-value="yes-pii" onclick="toggleFeedbackOption(this)">Yes PII</button>
|
<p class="question-text">Is this document meant for public dissemination?</p>
|
||||||
<button type="button" class="toggle-button feedback-option" data-value="no-pii" onclick="toggleFeedbackOption(this)">No PII</button>
|
<span class="btn-group">
|
||||||
<button type="button" class="toggle-button feedback-option" data-value="cannot-read" onclick="toggleFeedbackOption(this)">I cannot read this</button>
|
<button type="button" class="toggle-button primary-option" data-value="yes-public" onclick="togglePrimaryOption(this, {i})">Yes</button>
|
||||||
<button type="button" class="toggle-button feedback-option" data-value="disturbing" onclick="toggleFeedbackOption(this)">Disturbing content</button>
|
<button type="button" class="toggle-button primary-option" data-value="no-public" onclick="togglePrimaryOption(this, {i})">No</button>
|
||||||
</span>
|
<button type="button" class="toggle-button primary-option" data-value="cannot-read" onclick="togglePrimaryOption(this, {i})">I cannot read it</button>
|
||||||
<textarea placeholder="Describe any private PII in the document" onchange="saveFeedback(this)" onkeydown="handleTextareaKeydown(event, this)"></textarea>
|
<button type="button" class="toggle-button primary-option" data-value="report-content" onclick="togglePrimaryOption(this, {i})">Report Content</button>
|
||||||
|
</span>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="question-container" id="public-pii-options-{i}" style="display: none; margin-top: 1rem;">
|
||||||
|
<p class="question-text">Select any PII found in this public document:</p>
|
||||||
|
<div class="checkbox-group">
|
||||||
|
<label><input type="checkbox" class="pii-checkbox" data-value="ssn" onchange="saveCheckboxes(this)"> SSN</label>
|
||||||
|
<label><input type="checkbox" class="pii-checkbox" data-value="bank-info" onchange="saveCheckboxes(this)"> Bank Info</label>
|
||||||
|
<label><input type="checkbox" class="pii-checkbox" data-value="credit-card" onchange="saveCheckboxes(this)"> Credit Card Info</label>
|
||||||
|
<label><input type="checkbox" class="pii-checkbox" data-value="usernames-passwords" onchange="saveCheckboxes(this)"> Usernames/Passwords</label>
|
||||||
|
<label><input type="checkbox" class="pii-checkbox" data-value="other" onchange="toggleOtherTextarea(this)"> Other</label>
|
||||||
|
</div>
|
||||||
|
<textarea id="other-pii-public-{i}" placeholder="Describe other PII found in the document" style="display: none;" onchange="saveFeedback(this)" onkeydown="handleTextareaKeydown(event, this)"></textarea>
|
||||||
|
<button type="button" class="continue-button" onclick="goToNextDocument()">Continue</button>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="question-container" id="private-pii-options-{i}" style="display: none; margin-top: 1rem;">
|
||||||
|
<p class="question-text">Select any PII found in this private document:</p>
|
||||||
|
<div class="checkbox-group">
|
||||||
|
<label><input type="checkbox" class="pii-checkbox" data-value="full-names" onchange="saveCheckboxes(this)"> Full Names</label>
|
||||||
|
<label><input type="checkbox" class="pii-checkbox" data-value="addresses" onchange="saveCheckboxes(this)"> Addresses</label>
|
||||||
|
<label><input type="checkbox" class="pii-checkbox" data-value="contact-info" onchange="saveCheckboxes(this)"> Contact Info</label>
|
||||||
|
<label><input type="checkbox" class="pii-checkbox" data-value="personal-attributes" onchange="saveCheckboxes(this)"> Personal Attributes</label>
|
||||||
|
<label><input type="checkbox" class="pii-checkbox" data-value="ssn" onchange="saveCheckboxes(this)"> SSN</label>
|
||||||
|
<label><input type="checkbox" class="pii-checkbox" data-value="bank-info" onchange="saveCheckboxes(this)"> Bank Info</label>
|
||||||
|
<label><input type="checkbox" class="pii-checkbox" data-value="credit-card" onchange="saveCheckboxes(this)"> Credit Card Info</label>
|
||||||
|
<label><input type="checkbox" class="pii-checkbox" data-value="usernames-passwords" onchange="saveCheckboxes(this)"> Usernames/Passwords</label>
|
||||||
|
<label><input type="checkbox" class="pii-checkbox" data-value="other" onchange="toggleOtherTextarea(this)"> Other</label>
|
||||||
|
</div>
|
||||||
|
<textarea id="other-pii-private-{i}" placeholder="Describe other PII found in the document" style="display: none;" onchange="saveFeedback(this)" onkeydown="handleTextareaKeydown(event, this)"></textarea>
|
||||||
|
<button type="button" class="continue-button" onclick="goToNextDocument()">Continue</button>
|
||||||
|
</div>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
"""
|
"""
|
||||||
@ -623,7 +733,7 @@ def create_html_output(random_pages, pdf_s3_client, output_path, workspace_path,
|
|||||||
html_content += f"""
|
html_content += f"""
|
||||||
<div class="page-container" data-index="{i}">
|
<div class="page-container" data-index="{i}">
|
||||||
<div class="page-info">
|
<div class="page-info">
|
||||||
<h2 title="{pdf_path}"><a href="{original_url}" target="_blank">{original_url}</a></h2>
|
<h2 title="{pdf_path}">original_url</h2>
|
||||||
<p>Page {page_num}</p>
|
<p>Page {page_num}</p>
|
||||||
<p>{f'<a href="{presigned_url}" target="_blank">View Cached PDF</a>' if presigned_url else pdf_path}</p>
|
<p>{f'<a href="{presigned_url}" target="_blank">View Cached PDF</a>' if presigned_url else pdf_path}</p>
|
||||||
<p>
|
<p>
|
||||||
@ -632,13 +742,45 @@ def create_html_output(random_pages, pdf_s3_client, output_path, workspace_path,
|
|||||||
</div>
|
</div>
|
||||||
<div class="error">Error: {str(e)}</div>
|
<div class="error">Error: {str(e)}</div>
|
||||||
<div class="annotation-interface{active_class}" data-id="page-{i}" data-pdf-path="{pdf_path}">
|
<div class="annotation-interface{active_class}" data-id="page-{i}" data-pdf-path="{pdf_path}">
|
||||||
<span class="btn-group">
|
<div class="question-container" id="question1-{i}">
|
||||||
<button type="button" class="toggle-button feedback-option" data-value="yes-pii" onclick="toggleFeedbackOption(this)">Yes PII</button>
|
<p class="question-text">Is this document meant for public dissemination?</p>
|
||||||
<button type="button" class="toggle-button feedback-option" data-value="no-pii" onclick="toggleFeedbackOption(this)">No PII</button>
|
<span class="btn-group">
|
||||||
<button type="button" class="toggle-button feedback-option" data-value="cannot-read" onclick="toggleFeedbackOption(this)">I cannot read this</button>
|
<button type="button" class="toggle-button primary-option" data-value="yes-public" onclick="togglePrimaryOption(this, {i})">Yes</button>
|
||||||
<button type="button" class="toggle-button feedback-option" data-value="disturbing" onclick="toggleFeedbackOption(this)">Disturbing content</button>
|
<button type="button" class="toggle-button primary-option" data-value="no-public" onclick="togglePrimaryOption(this, {i})">No</button>
|
||||||
</span>
|
<button type="button" class="toggle-button primary-option" data-value="cannot-read" onclick="togglePrimaryOption(this, {i})">I cannot read it</button>
|
||||||
<textarea placeholder="Describe any private PII in the document" onchange="saveFeedback(this)" onkeydown="handleTextareaKeydown(event, this)"></textarea>
|
<button type="button" class="toggle-button primary-option" data-value="report-content" onclick="togglePrimaryOption(this, {i})">Report Content</button>
|
||||||
|
</span>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="question-container" id="public-pii-options-{i}" style="display: none; margin-top: 1rem;">
|
||||||
|
<p class="question-text">Select any PII found in this public document:</p>
|
||||||
|
<div class="checkbox-group">
|
||||||
|
<label><input type="checkbox" class="pii-checkbox" data-value="ssn" onchange="saveCheckboxes(this)"> SSN</label>
|
||||||
|
<label><input type="checkbox" class="pii-checkbox" data-value="bank-info" onchange="saveCheckboxes(this)"> Bank Info</label>
|
||||||
|
<label><input type="checkbox" class="pii-checkbox" data-value="credit-card" onchange="saveCheckboxes(this)"> Credit Card Info</label>
|
||||||
|
<label><input type="checkbox" class="pii-checkbox" data-value="usernames-passwords" onchange="saveCheckboxes(this)"> Usernames/Passwords</label>
|
||||||
|
<label><input type="checkbox" class="pii-checkbox" data-value="other" onchange="toggleOtherTextarea(this)"> Other</label>
|
||||||
|
</div>
|
||||||
|
<textarea id="other-pii-public-{i}" placeholder="Describe other PII found in the document" style="display: none;" onchange="saveFeedback(this)" onkeydown="handleTextareaKeydown(event, this)"></textarea>
|
||||||
|
<button type="button" class="continue-button" onclick="goToNextDocument()">Continue</button>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="question-container" id="private-pii-options-{i}" style="display: none; margin-top: 1rem;">
|
||||||
|
<p class="question-text">Select any PII found in this private document:</p>
|
||||||
|
<div class="checkbox-group">
|
||||||
|
<label><input type="checkbox" class="pii-checkbox" data-value="full-names" onchange="saveCheckboxes(this)"> Full Names</label>
|
||||||
|
<label><input type="checkbox" class="pii-checkbox" data-value="addresses" onchange="saveCheckboxes(this)"> Addresses</label>
|
||||||
|
<label><input type="checkbox" class="pii-checkbox" data-value="contact-info" onchange="saveCheckboxes(this)"> Contact Info</label>
|
||||||
|
<label><input type="checkbox" class="pii-checkbox" data-value="personal-attributes" onchange="saveCheckboxes(this)"> Personal Attributes</label>
|
||||||
|
<label><input type="checkbox" class="pii-checkbox" data-value="ssn" onchange="saveCheckboxes(this)"> SSN</label>
|
||||||
|
<label><input type="checkbox" class="pii-checkbox" data-value="bank-info" onchange="saveCheckboxes(this)"> Bank Info</label>
|
||||||
|
<label><input type="checkbox" class="pii-checkbox" data-value="credit-card" onchange="saveCheckboxes(this)"> Credit Card Info</label>
|
||||||
|
<label><input type="checkbox" class="pii-checkbox" data-value="usernames-passwords" onchange="saveCheckboxes(this)"> Usernames/Passwords</label>
|
||||||
|
<label><input type="checkbox" class="pii-checkbox" data-value="other" onchange="toggleOtherTextarea(this)"> Other</label>
|
||||||
|
</div>
|
||||||
|
<textarea id="other-pii-private-{i}" placeholder="Describe other PII found in the document" style="display: none;" onchange="saveFeedback(this)" onkeydown="handleTextareaKeydown(event, this)"></textarea>
|
||||||
|
<button type="button" class="continue-button" onclick="goToNextDocument()">Continue</button>
|
||||||
|
</div>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
"""
|
"""
|
||||||
@ -777,45 +919,88 @@ def create_html_output(random_pages, pdf_s3_client, output_path, workspace_path,
|
|||||||
async function saveFeedback(source) {
|
async function saveFeedback(source) {
|
||||||
const interfaceDiv = source.closest('.annotation-interface');
|
const interfaceDiv = source.closest('.annotation-interface');
|
||||||
const id = interfaceDiv.getAttribute('data-id');
|
const id = interfaceDiv.getAttribute('data-id');
|
||||||
// Get the selected feedback option value
|
|
||||||
const activeButton = interfaceDiv.querySelector('button.feedback-option.active');
|
// Get the selected primary option
|
||||||
const feedbackOption = activeButton ? activeButton.getAttribute('data-value') : null;
|
const activePrimaryButton = interfaceDiv.querySelector('button.primary-option.active');
|
||||||
const piiDescription = interfaceDiv.querySelector('textarea').value;
|
const primaryOption = activePrimaryButton ? activePrimaryButton.getAttribute('data-value') : null;
|
||||||
|
|
||||||
|
// Get checkbox selections for public document
|
||||||
|
const publicPiiOptions = [];
|
||||||
|
interfaceDiv.querySelectorAll('#public-pii-options-' + id.split('-')[1] + ' input[type="checkbox"]:checked').forEach(checkbox => {
|
||||||
|
publicPiiOptions.push(checkbox.getAttribute('data-value'));
|
||||||
|
});
|
||||||
|
|
||||||
|
// Get checkbox selections for private document
|
||||||
|
const privatePiiOptions = [];
|
||||||
|
interfaceDiv.querySelectorAll('#private-pii-options-' + id.split('-')[1] + ' input[type="checkbox"]:checked').forEach(checkbox => {
|
||||||
|
privatePiiOptions.push(checkbox.getAttribute('data-value'));
|
||||||
|
});
|
||||||
|
|
||||||
|
// Get any "Other" descriptions
|
||||||
|
const otherPublicDesc = interfaceDiv.querySelector('#other-pii-public-' + id.split('-')[1])?.value || '';
|
||||||
|
const otherPrivateDesc = interfaceDiv.querySelector('#other-pii-private-' + id.split('-')[1])?.value || '';
|
||||||
|
|
||||||
const pdfPath = interfaceDiv.getAttribute('data-pdf-path');
|
const pdfPath = interfaceDiv.getAttribute('data-pdf-path');
|
||||||
|
|
||||||
const datastore = await fetchDatastore() || {};
|
const datastore = await fetchDatastore() || {};
|
||||||
datastore[id] = {
|
datastore[id] = {
|
||||||
feedbackOption: feedbackOption,
|
primaryOption: primaryOption,
|
||||||
piiDescription: piiDescription,
|
publicPiiOptions: publicPiiOptions,
|
||||||
|
privatePiiOptions: privatePiiOptions,
|
||||||
|
otherPublicDesc: otherPublicDesc,
|
||||||
|
otherPrivateDesc: otherPrivateDesc,
|
||||||
pdfPath: pdfPath
|
pdfPath: pdfPath
|
||||||
};
|
};
|
||||||
|
|
||||||
await putDatastore(datastore);
|
await putDatastore(datastore);
|
||||||
}
|
}
|
||||||
|
|
||||||
function toggleFeedbackOption(btn) {
|
function togglePrimaryOption(btn, index) {
|
||||||
const interfaceDiv = btn.closest('.annotation-interface');
|
const interfaceDiv = btn.closest('.annotation-interface');
|
||||||
// Remove active class from all feedback option buttons in this group
|
// Remove active class from all primary option buttons in this group
|
||||||
interfaceDiv.querySelectorAll('button.feedback-option').forEach(function(b) {
|
interfaceDiv.querySelectorAll('button.primary-option').forEach(function(b) {
|
||||||
b.classList.remove('active');
|
b.classList.remove('active');
|
||||||
});
|
});
|
||||||
|
|
||||||
// Toggle on the clicked button
|
// Toggle on the clicked button
|
||||||
btn.classList.add('active');
|
btn.classList.add('active');
|
||||||
saveFeedback(interfaceDiv);
|
|
||||||
|
|
||||||
// Show or hide textarea based on selected option
|
// Hide all secondary option containers
|
||||||
const textarea = interfaceDiv.querySelector('textarea');
|
document.querySelector(`#public-pii-options-${index}`).style.display = 'none';
|
||||||
const feedbackOption = btn.getAttribute('data-value');
|
document.querySelector(`#private-pii-options-${index}`).style.display = 'none';
|
||||||
|
|
||||||
if (feedbackOption === 'yes-pii') {
|
const option = btn.getAttribute('data-value');
|
||||||
// Only show textarea if "Yes PII" is selected
|
|
||||||
|
// Show the appropriate secondary options based on the selected primary option
|
||||||
|
if (option === 'yes-public') {
|
||||||
|
document.querySelector(`#public-pii-options-${index}`).style.display = 'block';
|
||||||
|
} else if (option === 'no-public') {
|
||||||
|
document.querySelector(`#private-pii-options-${index}`).style.display = 'block';
|
||||||
|
} else {
|
||||||
|
// For "cannot-read" or "report-content", just save and move to next
|
||||||
|
saveFeedback(interfaceDiv);
|
||||||
|
goToNextDocument();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
function toggleOtherTextarea(checkbox) {
|
||||||
|
const container = checkbox.closest('.question-container');
|
||||||
|
const textareaId = container.querySelector('textarea').id;
|
||||||
|
const textarea = document.getElementById(textareaId);
|
||||||
|
|
||||||
|
if (checkbox.checked) {
|
||||||
textarea.style.display = 'block';
|
textarea.style.display = 'block';
|
||||||
textarea.focus();
|
textarea.focus();
|
||||||
} else {
|
} else {
|
||||||
// If other options selected, hide textarea and go to next
|
|
||||||
textarea.style.display = 'none';
|
textarea.style.display = 'none';
|
||||||
goToNextDocument();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
saveCheckboxes(checkbox);
|
||||||
|
}
|
||||||
|
|
||||||
|
function saveCheckboxes(input) {
|
||||||
|
const interfaceDiv = input.closest('.annotation-interface');
|
||||||
|
saveFeedback(interfaceDiv);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Function to deobfuscate the Prolific code
|
// Function to deobfuscate the Prolific code
|
||||||
@ -849,23 +1034,64 @@ def create_html_output(random_pages, pdf_s3_client, output_path, workspace_path,
|
|||||||
|
|
||||||
document.querySelectorAll('.annotation-interface').forEach(function(interfaceDiv) {
|
document.querySelectorAll('.annotation-interface').forEach(function(interfaceDiv) {
|
||||||
const id = interfaceDiv.getAttribute('data-id');
|
const id = interfaceDiv.getAttribute('data-id');
|
||||||
|
const pageIndex = id.split('-')[1];
|
||||||
|
|
||||||
if (datastore[id]) {
|
if (datastore[id]) {
|
||||||
const data = datastore[id];
|
const data = datastore[id];
|
||||||
// Set active state for feedback option buttons
|
|
||||||
interfaceDiv.querySelectorAll('button.feedback-option').forEach(function(btn) {
|
// Set active state for primary option buttons
|
||||||
if (btn.getAttribute('data-value') === data.feedbackOption) {
|
interfaceDiv.querySelectorAll('button.primary-option').forEach(function(btn) {
|
||||||
|
if (btn.getAttribute('data-value') === data.primaryOption) {
|
||||||
btn.classList.add('active');
|
btn.classList.add('active');
|
||||||
|
|
||||||
// Show textarea if "Yes PII" is selected
|
// Show the appropriate secondary options
|
||||||
if (btn.getAttribute('data-value') === 'yes-pii') {
|
const option = btn.getAttribute('data-value');
|
||||||
interfaceDiv.querySelector('textarea').style.display = 'block';
|
if (option === 'yes-public') {
|
||||||
|
document.querySelector(`#public-pii-options-${pageIndex}`).style.display = 'block';
|
||||||
|
} else if (option === 'no-public') {
|
||||||
|
document.querySelector(`#private-pii-options-${pageIndex}`).style.display = 'block';
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
btn.classList.remove('active');
|
btn.classList.remove('active');
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
// Set the textarea value
|
|
||||||
interfaceDiv.querySelector('textarea').value = data.piiDescription;
|
// Restore public PII checkboxes
|
||||||
|
if (data.publicPiiOptions && data.publicPiiOptions.length > 0) {
|
||||||
|
const publicContainer = document.querySelector(`#public-pii-options-${pageIndex}`);
|
||||||
|
data.publicPiiOptions.forEach(option => {
|
||||||
|
const checkbox = publicContainer.querySelector(`input[data-value="${option}"]`);
|
||||||
|
if (checkbox) {
|
||||||
|
checkbox.checked = true;
|
||||||
|
if (option === 'other') {
|
||||||
|
document.getElementById(`other-pii-public-${pageIndex}`).style.display = 'block';
|
||||||
|
}
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
// Restore private PII checkboxes
|
||||||
|
if (data.privatePiiOptions && data.privatePiiOptions.length > 0) {
|
||||||
|
const privateContainer = document.querySelector(`#private-pii-options-${pageIndex}`);
|
||||||
|
data.privatePiiOptions.forEach(option => {
|
||||||
|
const checkbox = privateContainer.querySelector(`input[data-value="${option}"]`);
|
||||||
|
if (checkbox) {
|
||||||
|
checkbox.checked = true;
|
||||||
|
if (option === 'other') {
|
||||||
|
document.getElementById(`other-pii-private-${pageIndex}`).style.display = 'block';
|
||||||
|
}
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
// Set the textarea values
|
||||||
|
if (data.otherPublicDesc) {
|
||||||
|
document.getElementById(`other-pii-public-${pageIndex}`).value = data.otherPublicDesc;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (data.otherPrivateDesc) {
|
||||||
|
document.getElementById(`other-pii-private-${pageIndex}`).value = data.otherPrivateDesc;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
@ -873,7 +1099,7 @@ def create_html_output(random_pages, pdf_s3_client, output_path, workspace_path,
|
|||||||
let lastAnnotatedIndex = -1;
|
let lastAnnotatedIndex = -1;
|
||||||
for (let i = 0; i < totalPages; i++) {
|
for (let i = 0; i < totalPages; i++) {
|
||||||
const pageId = `page-${i}`;
|
const pageId = `page-${i}`;
|
||||||
if (datastore[pageId] && datastore[pageId].feedbackOption) {
|
if (datastore[pageId] && datastore[pageId].primaryOption) {
|
||||||
lastAnnotatedIndex = i;
|
lastAnnotatedIndex = i;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -979,40 +1205,98 @@ def fetch_annotations(tinyhost_link: str) -> Tuple[Dict[str, Any], str]:
|
|||||||
def process_annotations(annotations_by_link: List[Tuple[Dict[str, Any], str]]) -> Dict[str, List[Dict[str, Any]]]:
|
def process_annotations(annotations_by_link: List[Tuple[Dict[str, Any], str]]) -> Dict[str, List[Dict[str, Any]]]:
|
||||||
"""Process and categorize annotations by feedback type."""
|
"""Process and categorize annotations by feedback type."""
|
||||||
results = {
|
results = {
|
||||||
"yes_pii": [],
|
"public_document": [],
|
||||||
"no_pii": [],
|
"private_document": [],
|
||||||
"cannot_read": [],
|
"cannot_read": [],
|
||||||
"disturbing": [],
|
"report_content": [],
|
||||||
"no_annotation": [],
|
"no_annotation": [],
|
||||||
}
|
}
|
||||||
|
|
||||||
# Process each annotation
|
# Process each annotation
|
||||||
for annotations, link in annotations_by_link:
|
for annotations, link in annotations_by_link:
|
||||||
for page_id, annotation in annotations.items():
|
for page_id, annotation in annotations.items():
|
||||||
if not annotation or "feedbackOption" not in annotation:
|
if not annotation or "primaryOption" not in annotation:
|
||||||
results["no_annotation"].append(
|
results["no_annotation"].append(
|
||||||
{"page_id": page_id, "link": link, "pdf_path": annotation.get("pdfPath", "Unknown") if annotation else "Unknown"}
|
{"page_id": page_id, "link": link, "pdf_path": annotation.get("pdfPath", "Unknown") if annotation else "Unknown"}
|
||||||
)
|
)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
category = annotation["feedbackOption"]
|
primary_option = annotation["primaryOption"]
|
||||||
result_item = {
|
pdf_path = annotation.get("pdfPath", "Unknown")
|
||||||
"page_id": page_id,
|
|
||||||
"link": link,
|
# Build a result item based on the new annotation structure
|
||||||
"pdf_path": annotation.get("pdfPath", "Unknown"),
|
if primary_option == "yes-public":
|
||||||
"description": annotation.get("piiDescription", ""),
|
# Public document with potential PII
|
||||||
}
|
public_pii_options = annotation.get("publicPiiOptions", [])
|
||||||
|
other_desc = annotation.get("otherPublicDesc", "")
|
||||||
|
|
||||||
|
if not public_pii_options:
|
||||||
|
# No PII selected in a public document
|
||||||
|
results["public_document"].append({
|
||||||
|
"page_id": page_id,
|
||||||
|
"link": link,
|
||||||
|
"pdf_path": pdf_path,
|
||||||
|
"pii_types": [],
|
||||||
|
"has_pii": False,
|
||||||
|
"description": ""
|
||||||
|
})
|
||||||
|
else:
|
||||||
|
# PII found in a public document
|
||||||
|
results["public_document"].append({
|
||||||
|
"page_id": page_id,
|
||||||
|
"link": link,
|
||||||
|
"pdf_path": pdf_path,
|
||||||
|
"pii_types": public_pii_options,
|
||||||
|
"has_pii": True,
|
||||||
|
"description": other_desc if "other" in public_pii_options else ""
|
||||||
|
})
|
||||||
|
|
||||||
|
elif primary_option == "no-public":
|
||||||
|
# Private document with potential PII
|
||||||
|
private_pii_options = annotation.get("privatePiiOptions", [])
|
||||||
|
other_desc = annotation.get("otherPrivateDesc", "")
|
||||||
|
|
||||||
|
if not private_pii_options:
|
||||||
|
# No PII selected in a private document
|
||||||
|
results["private_document"].append({
|
||||||
|
"page_id": page_id,
|
||||||
|
"link": link,
|
||||||
|
"pdf_path": pdf_path,
|
||||||
|
"pii_types": [],
|
||||||
|
"has_pii": False,
|
||||||
|
"description": ""
|
||||||
|
})
|
||||||
|
else:
|
||||||
|
# PII found in a private document
|
||||||
|
results["private_document"].append({
|
||||||
|
"page_id": page_id,
|
||||||
|
"link": link,
|
||||||
|
"pdf_path": pdf_path,
|
||||||
|
"pii_types": private_pii_options,
|
||||||
|
"has_pii": True,
|
||||||
|
"description": other_desc if "other" in private_pii_options else ""
|
||||||
|
})
|
||||||
|
|
||||||
|
elif primary_option == "cannot-read":
|
||||||
|
results["cannot_read"].append({
|
||||||
|
"page_id": page_id,
|
||||||
|
"link": link,
|
||||||
|
"pdf_path": pdf_path
|
||||||
|
})
|
||||||
|
|
||||||
|
elif primary_option == "report-content":
|
||||||
|
results["report_content"].append({
|
||||||
|
"page_id": page_id,
|
||||||
|
"link": link,
|
||||||
|
"pdf_path": pdf_path
|
||||||
|
})
|
||||||
|
|
||||||
if category == "yes-pii":
|
|
||||||
results["yes_pii"].append(result_item)
|
|
||||||
elif category == "no-pii":
|
|
||||||
results["no_pii"].append(result_item)
|
|
||||||
elif category == "cannot-read":
|
|
||||||
results["cannot_read"].append(result_item)
|
|
||||||
elif category == "disturbing":
|
|
||||||
results["disturbing"].append(result_item)
|
|
||||||
else:
|
else:
|
||||||
results["no_annotation"].append(result_item)
|
results["no_annotation"].append({
|
||||||
|
"page_id": page_id,
|
||||||
|
"link": link,
|
||||||
|
"pdf_path": pdf_path
|
||||||
|
})
|
||||||
|
|
||||||
return results
|
return results
|
||||||
|
|
||||||
@ -1025,23 +1309,74 @@ def print_annotation_report(annotation_results: Dict[str, List[Dict[str, Any]]])
|
|||||||
print(f"ANNOTATION REPORT - Total Pages: {total_pages}")
|
print(f"ANNOTATION REPORT - Total Pages: {total_pages}")
|
||||||
print("=" * 80)
|
print("=" * 80)
|
||||||
|
|
||||||
|
# Count pages with PII in public documents
|
||||||
|
public_with_pii = [page for page in annotation_results['public_document'] if page.get('has_pii', False)]
|
||||||
|
public_without_pii = [page for page in annotation_results['public_document'] if not page.get('has_pii', False)]
|
||||||
|
|
||||||
|
# Count pages with PII in private documents
|
||||||
|
private_with_pii = [page for page in annotation_results['private_document'] if page.get('has_pii', False)]
|
||||||
|
private_without_pii = [page for page in annotation_results['private_document'] if not page.get('has_pii', False)]
|
||||||
|
|
||||||
# Print summary statistics
|
# Print summary statistics
|
||||||
print("\nSummary:")
|
print("\nSummary:")
|
||||||
print(f" Pages with PII: {len(annotation_results['yes_pii'])} ({len(annotation_results['yes_pii'])/total_pages*100:.1f}%)")
|
print(f" Public documents (total): {len(annotation_results['public_document'])} ({len(annotation_results['public_document'])/total_pages*100:.1f}% of all pages)")
|
||||||
print(f" Pages without PII: {len(annotation_results['no_pii'])} ({len(annotation_results['no_pii'])/total_pages*100:.1f}%)")
|
print(f" - With PII: {len(public_with_pii)} ({len(public_with_pii)/max(1, len(annotation_results['public_document']))*100:.1f}% of public docs)")
|
||||||
|
print(f" - Without PII: {len(public_without_pii)} ({len(public_without_pii)/max(1, len(annotation_results['public_document']))*100:.1f}% of public docs)")
|
||||||
|
|
||||||
|
print(f" Private documents (total): {len(annotation_results['private_document'])} ({len(annotation_results['private_document'])/total_pages*100:.1f}% of all pages)")
|
||||||
|
print(f" - With PII: {len(private_with_pii)} ({len(private_with_pii)/max(1, len(annotation_results['private_document']))*100:.1f}% of private docs)")
|
||||||
|
print(f" - Without PII: {len(private_without_pii)} ({len(private_without_pii)/max(1, len(annotation_results['private_document']))*100:.1f}% of private docs)")
|
||||||
|
|
||||||
print(f" Unreadable pages: {len(annotation_results['cannot_read'])} ({len(annotation_results['cannot_read'])/total_pages*100:.1f}%)")
|
print(f" Unreadable pages: {len(annotation_results['cannot_read'])} ({len(annotation_results['cannot_read'])/total_pages*100:.1f}%)")
|
||||||
print(f" Pages with disturbing content: {len(annotation_results['disturbing'])} ({len(annotation_results['disturbing'])/total_pages*100:.1f}%)")
|
print(f" Pages with reported content: {len(annotation_results['report_content'])} ({len(annotation_results['report_content'])/total_pages*100:.1f}%)")
|
||||||
print(f" Pages without annotation: {len(annotation_results['no_annotation'])} ({len(annotation_results['no_annotation'])/total_pages*100:.1f}%)")
|
print(f" Pages without annotation: {len(annotation_results['no_annotation'])} ({len(annotation_results['no_annotation'])/total_pages*100:.1f}%)")
|
||||||
|
|
||||||
# Print detailed report for pages with PII
|
# Analyze PII types in public documents
|
||||||
if annotation_results["yes_pii"]:
|
if public_with_pii:
|
||||||
print("\nDetailed Report - Pages with PII:")
|
pii_counts_public = {}
|
||||||
|
for page in public_with_pii:
|
||||||
|
for pii_type in page.get('pii_types', []):
|
||||||
|
pii_counts_public[pii_type] = pii_counts_public.get(pii_type, 0) + 1
|
||||||
|
|
||||||
|
print("\nPII Types in Public Documents:")
|
||||||
|
for pii_type, count in sorted(pii_counts_public.items(), key=lambda x: x[1], reverse=True):
|
||||||
|
print(f" - {pii_type}: {count} ({count/len(public_with_pii)*100:.1f}%)")
|
||||||
|
|
||||||
|
# Analyze PII types in private documents
|
||||||
|
if private_with_pii:
|
||||||
|
pii_counts_private = {}
|
||||||
|
for page in private_with_pii:
|
||||||
|
for pii_type in page.get('pii_types', []):
|
||||||
|
pii_counts_private[pii_type] = pii_counts_private.get(pii_type, 0) + 1
|
||||||
|
|
||||||
|
print("\nPII Types in Private Documents:")
|
||||||
|
for pii_type, count in sorted(pii_counts_private.items(), key=lambda x: x[1], reverse=True):
|
||||||
|
print(f" - {pii_type}: {count} ({count/len(private_with_pii)*100:.1f}%)")
|
||||||
|
|
||||||
|
# Print detailed report for public documents with PII
|
||||||
|
if public_with_pii:
|
||||||
|
print("\nDetailed Report - Public Documents with PII:")
|
||||||
print("-" * 80)
|
print("-" * 80)
|
||||||
for i, item in enumerate(annotation_results["yes_pii"], 1):
|
for i, item in enumerate(public_with_pii, 1):
|
||||||
print(f"{i}. PDF: {item['pdf_path']}")
|
print(f"{i}. PDF: {item['pdf_path']}")
|
||||||
print(f" Page ID: {item['page_id']}")
|
print(f" Page ID: {item['page_id']}")
|
||||||
print(f" Link: {item['link']}#{item['page_id']}")
|
print(f" Link: {item['link']}#{item['page_id']}")
|
||||||
print(f" Description: {item['description']}")
|
print(f" PII Types: {', '.join(item['pii_types'])}")
|
||||||
|
if item.get('description'):
|
||||||
|
print(f" Description: {item['description']}")
|
||||||
|
print("-" * 80)
|
||||||
|
|
||||||
|
# Print detailed report for private documents with PII
|
||||||
|
if private_with_pii:
|
||||||
|
print("\nDetailed Report - Private Documents with PII:")
|
||||||
|
print("-" * 80)
|
||||||
|
for i, item in enumerate(private_with_pii, 1):
|
||||||
|
print(f"{i}. PDF: {item['pdf_path']}")
|
||||||
|
print(f" Page ID: {item['page_id']}")
|
||||||
|
print(f" Link: {item['link']}#{item['page_id']}")
|
||||||
|
print(f" PII Types: {', '.join(item['pii_types'])}")
|
||||||
|
if item.get('description'):
|
||||||
|
print(f" Description: {item['description']}")
|
||||||
print("-" * 80)
|
print("-" * 80)
|
||||||
|
|
||||||
print("\nReport complete.")
|
print("\nReport complete.")
|
||||||
@ -1084,11 +1419,32 @@ def read_and_process_results(args):
|
|||||||
|
|
||||||
with open(output_file, "w", newline="") as f:
|
with open(output_file, "w", newline="") as f:
|
||||||
writer = csv.writer(f)
|
writer = csv.writer(f)
|
||||||
writer.writerow(["Category", "PDF Path", "Page ID", "Link", "Description"])
|
writer.writerow(["Category", "PDF Path", "Page ID", "Link", "Document Type", "PII Types", "Description"])
|
||||||
|
|
||||||
for category, items in annotation_results.items():
|
for category, items in annotation_results.items():
|
||||||
for item in items:
|
for item in items:
|
||||||
writer.writerow([category, item["pdf_path"], item["page_id"], f"{item['link']}#{item['page_id']}", item.get("description", "")])
|
if category == "public_document":
|
||||||
|
doc_type = "Public"
|
||||||
|
pii_types = ", ".join(item.get("pii_types", []))
|
||||||
|
description = item.get("description", "")
|
||||||
|
elif category == "private_document":
|
||||||
|
doc_type = "Private"
|
||||||
|
pii_types = ", ".join(item.get("pii_types", []))
|
||||||
|
description = item.get("description", "")
|
||||||
|
else:
|
||||||
|
doc_type = ""
|
||||||
|
pii_types = ""
|
||||||
|
description = ""
|
||||||
|
|
||||||
|
writer.writerow([
|
||||||
|
category,
|
||||||
|
item["pdf_path"],
|
||||||
|
item["page_id"],
|
||||||
|
f"{item['link']}#{item['page_id']}",
|
||||||
|
doc_type,
|
||||||
|
pii_types,
|
||||||
|
description
|
||||||
|
])
|
||||||
|
|
||||||
print(f"Report saved to {output_file}")
|
print(f"Report saved to {output_file}")
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user