mirror of
https://github.com/allenai/olmocr.git
synced 2025-11-03 03:25:22 +00:00
Merge branch 'main' of https://github.com/allenai/olmocr into main
This commit is contained in:
commit
cf0d07d8d7
@ -243,6 +243,9 @@ def create_html_output(random_pages, pdf_s3_client, output_path, workspace_path,
|
||||
color: var(--text-color);
|
||||
background-color: var(--bg-color);
|
||||
padding: 2rem;
|
||||
display: flex;
|
||||
flex-direction: row;
|
||||
gap: 2rem;
|
||||
}}
|
||||
|
||||
ul {{
|
||||
@ -250,15 +253,23 @@ def create_html_output(random_pages, pdf_s3_client, output_path, workspace_path,
|
||||
}}
|
||||
|
||||
.container {{
|
||||
max-width: 1200px;
|
||||
margin: 0 auto;
|
||||
flex: 2;
|
||||
max-width: 750px;
|
||||
}}
|
||||
|
||||
header {{
|
||||
margin-bottom: 2rem;
|
||||
border-bottom: 1px solid var(--border-color);
|
||||
padding-bottom: 1rem;
|
||||
|
||||
position: sticky;
|
||||
top: 2rem;
|
||||
flex: 1;
|
||||
min-width: 380px;
|
||||
max-width: 420px;
|
||||
max-height: calc(100vh - 4rem);
|
||||
overflow-y: auto;
|
||||
padding: 1.5rem;
|
||||
background-color: white;
|
||||
border-radius: 0.5rem;
|
||||
box-shadow: var(--card-shadow);
|
||||
align-self: flex-start;
|
||||
font-size: small;
|
||||
}}
|
||||
|
||||
@ -296,7 +307,7 @@ def create_html_output(random_pages, pdf_s3_client, output_path, workspace_path,
|
||||
|
||||
.page-grid {{
|
||||
display: grid;
|
||||
grid-template-columns: repeat(2, 1fr);
|
||||
grid-template-columns: 1fr;
|
||||
gap: 2rem;
|
||||
}}
|
||||
|
||||
@ -377,6 +388,15 @@ def create_html_output(random_pages, pdf_s3_client, output_path, workspace_path,
|
||||
display: block; /* Show only the active annotation interface */
|
||||
}}
|
||||
|
||||
.question-container {{
|
||||
margin-bottom: 1rem;
|
||||
}}
|
||||
|
||||
.question-text {{
|
||||
font-weight: 500;
|
||||
margin-bottom: 0.5rem;
|
||||
}}
|
||||
|
||||
/* Button group styling for connected buttons */
|
||||
.btn-group {{
|
||||
display: inline-flex;
|
||||
@ -413,10 +433,50 @@ def create_html_output(random_pages, pdf_s3_client, output_path, workspace_path,
|
||||
color: white;
|
||||
}}
|
||||
|
||||
.checkbox-group {{
|
||||
display: flex;
|
||||
flex-wrap: wrap;
|
||||
gap: 0.5rem;
|
||||
margin-bottom: 1rem;
|
||||
}}
|
||||
|
||||
.checkbox-group label {{
|
||||
display: flex;
|
||||
align-items: center;
|
||||
padding: 0.25rem 0.5rem;
|
||||
background-color: #f1f5f9;
|
||||
border-radius: 0.25rem;
|
||||
cursor: pointer;
|
||||
font-size: 0.875rem;
|
||||
}}
|
||||
|
||||
.checkbox-group label:hover {{
|
||||
background-color: #e2e8f0;
|
||||
}}
|
||||
|
||||
.checkbox-group input[type="checkbox"] {{
|
||||
margin-right: 0.5rem;
|
||||
}}
|
||||
|
||||
.continue-button {{
|
||||
padding: 0.5rem 1rem;
|
||||
background-color: var(--primary-color);
|
||||
color: white;
|
||||
border: none;
|
||||
border-radius: 0.25rem;
|
||||
cursor: pointer;
|
||||
font-weight: 500;
|
||||
}}
|
||||
|
||||
.continue-button:hover {{
|
||||
background-color: #1d4ed8;
|
||||
}}
|
||||
|
||||
.annotation-interface textarea {{
|
||||
display: none; /* Hide textarea by default */
|
||||
width: 100%;
|
||||
margin-top: 0.5rem;
|
||||
margin-bottom: 1rem;
|
||||
padding: 0.5rem;
|
||||
font-size: 0.875rem;
|
||||
border: 1px solid var(--border-color);
|
||||
@ -494,53 +554,71 @@ def create_html_output(random_pages, pdf_s3_client, output_path, workspace_path,
|
||||
@media (max-width: 768px) {{
|
||||
body {{
|
||||
padding: 1rem;
|
||||
flex-direction: column;
|
||||
}}
|
||||
|
||||
.page-grid {{
|
||||
grid-template-columns: 1fr;
|
||||
header {{
|
||||
position: static;
|
||||
max-width: 100%;
|
||||
margin-left: 0;
|
||||
margin-bottom: 2rem;
|
||||
}}
|
||||
|
||||
.container {{
|
||||
max-width: 100%;
|
||||
}}
|
||||
}}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<div class="container">
|
||||
<header>
|
||||
<header>
|
||||
<h2>Task Instructions</h2>
|
||||
<p>Your task is to review {len(random_pages)} document pages and determine whether they contain any <strong>Personally Identifiable Information (PII)</strong>. Carefully but efficiently inspect each page and select the appropriate response. You do not need to read every word - quickly scan the page and look for any obvious PII. The time expected to complete this task is 10-15 minutes.</p>
|
||||
|
||||
<h2>How to Annotate</h2>
|
||||
<p>The page you are currently annotating will be highlighted with a blue outline and a set of response buttons will be displayed directly below it.</p>
|
||||
<br/>
|
||||
<p><strong>Yes PII</strong> - Select this if you find any information on the page that qualifies as PII. A text box will appear below - briefly describe the kind of PII you encountered (e.g., full name, social security number, etc.) then press the Enter key.</p>
|
||||
<p><strong>No PII</strong> - Select this if the page does not contain any PII.</p>
|
||||
<p><strong>I cannot read this</strong> - Select this if you are unable to read the page for any reason (e.g., written in a language other than English, heavily redacted text, etc.)</p>
|
||||
<p><strong>Disturbing content</strong> - Select this if the page contains disturbing or graphic content.</p>
|
||||
|
||||
<br/>
|
||||
<p>You may edit your annotations any time before submitting. To do so, press the green Edit button directly above the page.</p>
|
||||
<p>After completing all the document pages on this screen, you will receive a Prolific completion code.</p>
|
||||
|
||||
<h2>What Counts as PII?</h2>
|
||||
<ul>
|
||||
<li><strong>Names</strong>: Full names, first names, last names, nicknames, maiden names, aliases</li>
|
||||
<li><strong>Addresses</strong>: Street addresses, postal codes, cities, states, countries</li>
|
||||
<li><strong>Contact Information</strong>: Phone numbers, email addresses</li>
|
||||
<li><strong>Government IDs</strong>: SSNs, passport numbers, driver's license numbers, tax IDs</li>
|
||||
<li><strong>Financial Information</strong>: Credit card numbers, bank account numbers, routing numbers</li>
|
||||
<li><strong>Biometric Data</strong>: Fingerprints, retina scans, facial recognition data, voice signatures</li>
|
||||
<li><strong>Personal Attributes</strong>: Date of birth, place of birth, gender, race, religion</li>
|
||||
<li><strong>Online Identifiers</strong>: IP addresses, login IDs, usernames, passwords, API keys, URLs</li>
|
||||
<li><strong>Location Information</strong>: Geolocations, specific coordinates</li>
|
||||
<li><strong>Employment Information</strong>: Job titles, workplace names, employment history</li>
|
||||
<li><strong>Education Information</strong>: School names, degrees, transcripts</li>
|
||||
<li><strong>Medical Information</strong>: Health records, diagnoses</li>
|
||||
<li><strong>Company Names</strong>: If they are tied to an individual's identity (e.g., a person's personal business)</li>
|
||||
</ul>
|
||||
|
||||
<h2>What NOT to Mark as PII</h2>
|
||||
<p><strong>Author names, researcher names, citations, or references from published research papers</strong> should NOT be marked as PII. These names are part of the normal publication process and are not considered private or sensitive information for the purposes of this task.
|
||||
Only mark information as PII if it relates to private, sensitive, or personal details about an individual outside the context of the publication.</p>
|
||||
</header>
|
||||
<p>Your task is to review {len(random_pages)} document pages and determine whether they contain any <strong>Personally Identifiable Information (PII)</strong>. Carefully but efficiently inspect each page and select the appropriate response. You do not need to read every word - quickly scan the page and look for any obvious PII. The time expected to complete this task is 10-15 minutes.</p>
|
||||
|
||||
<h2>How to Annotate</h2>
|
||||
<p>The page you are currently annotating will be highlighted with a blue outline and a set of questions will be displayed directly below it.</p>
|
||||
<br/>
|
||||
<p><strong>First question:</strong> Is this document meant for public dissemination?</p>
|
||||
<ul>
|
||||
<li><strong>Yes</strong> - If the document appears to be a publication, research paper, public information, etc.</li>
|
||||
<li><strong>No</strong> - If the document appears to be private, personal, or not intended for public release</li>
|
||||
<li><strong>I cannot read it</strong> - If you are unable to read the page (e.g., foreign language, poor quality)</li>
|
||||
<li><strong>Report Content</strong> - If the content is inappropriate or disturbing</li>
|
||||
</ul>
|
||||
|
||||
<p><strong>Second question:</strong> Depending on your first answer, you'll be asked to identify any PII in the document:</p>
|
||||
<ul>
|
||||
<li>For <strong>public</strong> documents, select from: SSN, Bank Info, Credit Card Info, Usernames/Passwords, Other</li>
|
||||
<li>For <strong>private</strong> documents, select from: Full Names, Addresses, Contact Info, Personal Attributes, SSN, Bank Info, Credit Card Info, Usernames/Passwords, Other</li>
|
||||
</ul>
|
||||
<p>You can select multiple PII types. If you select "Other", a text box will appear where you can describe the PII.</p>
|
||||
|
||||
<br/>
|
||||
<p>You may edit your annotations any time before submitting. To do so, press the green Edit button directly above the page.</p>
|
||||
<p>After completing all the document pages on this screen, you will receive a Prolific completion code.</p>
|
||||
|
||||
<h2>What Counts as PII?</h2>
|
||||
<ul>
|
||||
<li><strong>Names</strong>: Full names, first names, last names, nicknames, maiden names, aliases</li>
|
||||
<li><strong>Addresses</strong>: Street addresses, postal codes, cities, states, countries</li>
|
||||
<li><strong>Contact Information</strong>: Phone numbers, email addresses</li>
|
||||
<li><strong>Government IDs</strong>: SSNs, passport numbers, driver's license numbers, tax IDs</li>
|
||||
<li><strong>Financial Information</strong>: Credit card numbers, bank account numbers, routing numbers</li>
|
||||
<li><strong>Biometric Data</strong>: Fingerprints, retina scans, facial recognition data, voice signatures</li>
|
||||
<li><strong>Personal Attributes</strong>: Date of birth, place of birth, gender, race, religion</li>
|
||||
<li><strong>Online Identifiers</strong>: IP addresses, login IDs, usernames, passwords, API keys, URLs</li>
|
||||
<li><strong>Location Information</strong>: Geolocations, specific coordinates</li>
|
||||
<li><strong>Employment Information</strong>: Job titles, workplace names, employment history</li>
|
||||
<li><strong>Education Information</strong>: School names, degrees, transcripts</li>
|
||||
<li><strong>Medical Information</strong>: Health records, diagnoses</li>
|
||||
<li><strong>Company Names</strong>: If they are tied to an individual's identity (e.g., a person's personal business)</li>
|
||||
</ul>
|
||||
|
||||
<h2>What NOT to Mark as PII</h2>
|
||||
<p><strong>Author names, researcher names, citations, or references from published research papers</strong> should NOT be marked as PII. These names are part of the normal publication process and are not considered private or sensitive information for the purposes of this task.
|
||||
Only mark information as PII if it relates to private, sensitive, or personal details about an individual outside the context of the publication.</p>
|
||||
</header>
|
||||
<div class="container">
|
||||
|
||||
<div class="info-bar">
|
||||
<div class="info-item">
|
||||
@ -591,7 +669,7 @@ def create_html_output(random_pages, pdf_s3_client, output_path, workspace_path,
|
||||
html_content += f"""
|
||||
<div class="page-container" data-index="{i}">
|
||||
<div class="page-info">
|
||||
<h2 title="{pdf_path}"><a href="{original_url}" target="_blank">{original_url}</a></h2>
|
||||
<h2 title="{pdf_path}">{original_url}</h2>
|
||||
<p>Page {page_num}</p>
|
||||
<p>{f'<a href="{presigned_url}" target="_blank">View Cached PDF</a>' if presigned_url else pdf_path}</p>
|
||||
<p>
|
||||
@ -602,13 +680,45 @@ def create_html_output(random_pages, pdf_s3_client, output_path, workspace_path,
|
||||
<img class="page-image" src="data:image/webp;base64,{base64_image}" alt="PDF Page {page_num}" loading="lazy" />
|
||||
</div>
|
||||
<div class="annotation-interface{active_class}" data-id="page-{i}" data-pdf-path="{pdf_path}">
|
||||
<span class="btn-group">
|
||||
<button type="button" class="toggle-button feedback-option" data-value="yes-pii" onclick="toggleFeedbackOption(this)">Yes PII</button>
|
||||
<button type="button" class="toggle-button feedback-option" data-value="no-pii" onclick="toggleFeedbackOption(this)">No PII</button>
|
||||
<button type="button" class="toggle-button feedback-option" data-value="cannot-read" onclick="toggleFeedbackOption(this)">I cannot read this</button>
|
||||
<button type="button" class="toggle-button feedback-option" data-value="disturbing" onclick="toggleFeedbackOption(this)">Disturbing content</button>
|
||||
</span>
|
||||
<textarea placeholder="Describe any private PII in the document" onchange="saveFeedback(this)" onkeydown="handleTextareaKeydown(event, this)"></textarea>
|
||||
<div class="question-container" id="question1-{i}">
|
||||
<p class="question-text">Is this document meant for public dissemination?</p>
|
||||
<span class="btn-group">
|
||||
<button type="button" class="toggle-button primary-option" data-value="yes-public" onclick="togglePrimaryOption(this, {i})">Yes</button>
|
||||
<button type="button" class="toggle-button primary-option" data-value="no-public" onclick="togglePrimaryOption(this, {i})">No</button>
|
||||
<button type="button" class="toggle-button primary-option" data-value="cannot-read" onclick="togglePrimaryOption(this, {i})">I cannot read it</button>
|
||||
<button type="button" class="toggle-button primary-option" data-value="report-content" onclick="togglePrimaryOption(this, {i})">Report Content</button>
|
||||
</span>
|
||||
</div>
|
||||
|
||||
<div class="question-container" id="public-pii-options-{i}" style="display: none; margin-top: 1rem;">
|
||||
<p class="question-text">Select any PII found in this public document:</p>
|
||||
<div class="checkbox-group">
|
||||
<label><input type="checkbox" class="pii-checkbox" data-value="ssn" onchange="saveCheckboxes(this)"> SSN</label>
|
||||
<label><input type="checkbox" class="pii-checkbox" data-value="bank-info" onchange="saveCheckboxes(this)"> Bank Info</label>
|
||||
<label><input type="checkbox" class="pii-checkbox" data-value="credit-card" onchange="saveCheckboxes(this)"> Credit Card Info</label>
|
||||
<label><input type="checkbox" class="pii-checkbox" data-value="usernames-passwords" onchange="saveCheckboxes(this)"> Usernames/Passwords</label>
|
||||
<label><input type="checkbox" class="pii-checkbox" data-value="other" onchange="toggleOtherTextarea(this)"> Other</label>
|
||||
</div>
|
||||
<textarea id="other-pii-public-{i}" placeholder="Describe other PII found in the document" style="display: none;" onchange="saveFeedback(this)" onkeydown="handleTextareaKeydown(event, this)"></textarea>
|
||||
<button type="button" class="continue-button" onclick="goToNextDocument()">Continue</button>
|
||||
</div>
|
||||
|
||||
<div class="question-container" id="private-pii-options-{i}" style="display: none; margin-top: 1rem;">
|
||||
<p class="question-text">Select any PII found in this private document:</p>
|
||||
<div class="checkbox-group">
|
||||
<label><input type="checkbox" class="pii-checkbox" data-value="full-names" onchange="saveCheckboxes(this)"> Full Names</label>
|
||||
<label><input type="checkbox" class="pii-checkbox" data-value="addresses" onchange="saveCheckboxes(this)"> Addresses</label>
|
||||
<label><input type="checkbox" class="pii-checkbox" data-value="contact-info" onchange="saveCheckboxes(this)"> Contact Info</label>
|
||||
<label><input type="checkbox" class="pii-checkbox" data-value="personal-attributes" onchange="saveCheckboxes(this)"> Personal Attributes</label>
|
||||
<label><input type="checkbox" class="pii-checkbox" data-value="ssn" onchange="saveCheckboxes(this)"> SSN</label>
|
||||
<label><input type="checkbox" class="pii-checkbox" data-value="bank-info" onchange="saveCheckboxes(this)"> Bank Info</label>
|
||||
<label><input type="checkbox" class="pii-checkbox" data-value="credit-card" onchange="saveCheckboxes(this)"> Credit Card Info</label>
|
||||
<label><input type="checkbox" class="pii-checkbox" data-value="usernames-passwords" onchange="saveCheckboxes(this)"> Usernames/Passwords</label>
|
||||
<label><input type="checkbox" class="pii-checkbox" data-value="other" onchange="toggleOtherTextarea(this)"> Other</label>
|
||||
</div>
|
||||
<textarea id="other-pii-private-{i}" placeholder="Describe other PII found in the document" style="display: none;" onchange="saveFeedback(this)" onkeydown="handleTextareaKeydown(event, this)"></textarea>
|
||||
<button type="button" class="continue-button" onclick="goToNextDocument()">Continue</button>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
"""
|
||||
@ -623,7 +733,7 @@ def create_html_output(random_pages, pdf_s3_client, output_path, workspace_path,
|
||||
html_content += f"""
|
||||
<div class="page-container" data-index="{i}">
|
||||
<div class="page-info">
|
||||
<h2 title="{pdf_path}"><a href="{original_url}" target="_blank">{original_url}</a></h2>
|
||||
<h2 title="{pdf_path}">original_url</h2>
|
||||
<p>Page {page_num}</p>
|
||||
<p>{f'<a href="{presigned_url}" target="_blank">View Cached PDF</a>' if presigned_url else pdf_path}</p>
|
||||
<p>
|
||||
@ -632,13 +742,45 @@ def create_html_output(random_pages, pdf_s3_client, output_path, workspace_path,
|
||||
</div>
|
||||
<div class="error">Error: {str(e)}</div>
|
||||
<div class="annotation-interface{active_class}" data-id="page-{i}" data-pdf-path="{pdf_path}">
|
||||
<span class="btn-group">
|
||||
<button type="button" class="toggle-button feedback-option" data-value="yes-pii" onclick="toggleFeedbackOption(this)">Yes PII</button>
|
||||
<button type="button" class="toggle-button feedback-option" data-value="no-pii" onclick="toggleFeedbackOption(this)">No PII</button>
|
||||
<button type="button" class="toggle-button feedback-option" data-value="cannot-read" onclick="toggleFeedbackOption(this)">I cannot read this</button>
|
||||
<button type="button" class="toggle-button feedback-option" data-value="disturbing" onclick="toggleFeedbackOption(this)">Disturbing content</button>
|
||||
</span>
|
||||
<textarea placeholder="Describe any private PII in the document" onchange="saveFeedback(this)" onkeydown="handleTextareaKeydown(event, this)"></textarea>
|
||||
<div class="question-container" id="question1-{i}">
|
||||
<p class="question-text">Is this document meant for public dissemination?</p>
|
||||
<span class="btn-group">
|
||||
<button type="button" class="toggle-button primary-option" data-value="yes-public" onclick="togglePrimaryOption(this, {i})">Yes</button>
|
||||
<button type="button" class="toggle-button primary-option" data-value="no-public" onclick="togglePrimaryOption(this, {i})">No</button>
|
||||
<button type="button" class="toggle-button primary-option" data-value="cannot-read" onclick="togglePrimaryOption(this, {i})">I cannot read it</button>
|
||||
<button type="button" class="toggle-button primary-option" data-value="report-content" onclick="togglePrimaryOption(this, {i})">Report Content</button>
|
||||
</span>
|
||||
</div>
|
||||
|
||||
<div class="question-container" id="public-pii-options-{i}" style="display: none; margin-top: 1rem;">
|
||||
<p class="question-text">Select any PII found in this public document:</p>
|
||||
<div class="checkbox-group">
|
||||
<label><input type="checkbox" class="pii-checkbox" data-value="ssn" onchange="saveCheckboxes(this)"> SSN</label>
|
||||
<label><input type="checkbox" class="pii-checkbox" data-value="bank-info" onchange="saveCheckboxes(this)"> Bank Info</label>
|
||||
<label><input type="checkbox" class="pii-checkbox" data-value="credit-card" onchange="saveCheckboxes(this)"> Credit Card Info</label>
|
||||
<label><input type="checkbox" class="pii-checkbox" data-value="usernames-passwords" onchange="saveCheckboxes(this)"> Usernames/Passwords</label>
|
||||
<label><input type="checkbox" class="pii-checkbox" data-value="other" onchange="toggleOtherTextarea(this)"> Other</label>
|
||||
</div>
|
||||
<textarea id="other-pii-public-{i}" placeholder="Describe other PII found in the document" style="display: none;" onchange="saveFeedback(this)" onkeydown="handleTextareaKeydown(event, this)"></textarea>
|
||||
<button type="button" class="continue-button" onclick="goToNextDocument()">Continue</button>
|
||||
</div>
|
||||
|
||||
<div class="question-container" id="private-pii-options-{i}" style="display: none; margin-top: 1rem;">
|
||||
<p class="question-text">Select any PII found in this private document:</p>
|
||||
<div class="checkbox-group">
|
||||
<label><input type="checkbox" class="pii-checkbox" data-value="full-names" onchange="saveCheckboxes(this)"> Full Names</label>
|
||||
<label><input type="checkbox" class="pii-checkbox" data-value="addresses" onchange="saveCheckboxes(this)"> Addresses</label>
|
||||
<label><input type="checkbox" class="pii-checkbox" data-value="contact-info" onchange="saveCheckboxes(this)"> Contact Info</label>
|
||||
<label><input type="checkbox" class="pii-checkbox" data-value="personal-attributes" onchange="saveCheckboxes(this)"> Personal Attributes</label>
|
||||
<label><input type="checkbox" class="pii-checkbox" data-value="ssn" onchange="saveCheckboxes(this)"> SSN</label>
|
||||
<label><input type="checkbox" class="pii-checkbox" data-value="bank-info" onchange="saveCheckboxes(this)"> Bank Info</label>
|
||||
<label><input type="checkbox" class="pii-checkbox" data-value="credit-card" onchange="saveCheckboxes(this)"> Credit Card Info</label>
|
||||
<label><input type="checkbox" class="pii-checkbox" data-value="usernames-passwords" onchange="saveCheckboxes(this)"> Usernames/Passwords</label>
|
||||
<label><input type="checkbox" class="pii-checkbox" data-value="other" onchange="toggleOtherTextarea(this)"> Other</label>
|
||||
</div>
|
||||
<textarea id="other-pii-private-{i}" placeholder="Describe other PII found in the document" style="display: none;" onchange="saveFeedback(this)" onkeydown="handleTextareaKeydown(event, this)"></textarea>
|
||||
<button type="button" class="continue-button" onclick="goToNextDocument()">Continue</button>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
"""
|
||||
@ -777,45 +919,88 @@ def create_html_output(random_pages, pdf_s3_client, output_path, workspace_path,
|
||||
async function saveFeedback(source) {
|
||||
const interfaceDiv = source.closest('.annotation-interface');
|
||||
const id = interfaceDiv.getAttribute('data-id');
|
||||
// Get the selected feedback option value
|
||||
const activeButton = interfaceDiv.querySelector('button.feedback-option.active');
|
||||
const feedbackOption = activeButton ? activeButton.getAttribute('data-value') : null;
|
||||
const piiDescription = interfaceDiv.querySelector('textarea').value;
|
||||
|
||||
// Get the selected primary option
|
||||
const activePrimaryButton = interfaceDiv.querySelector('button.primary-option.active');
|
||||
const primaryOption = activePrimaryButton ? activePrimaryButton.getAttribute('data-value') : null;
|
||||
|
||||
// Get checkbox selections for public document
|
||||
const publicPiiOptions = [];
|
||||
interfaceDiv.querySelectorAll('#public-pii-options-' + id.split('-')[1] + ' input[type="checkbox"]:checked').forEach(checkbox => {
|
||||
publicPiiOptions.push(checkbox.getAttribute('data-value'));
|
||||
});
|
||||
|
||||
// Get checkbox selections for private document
|
||||
const privatePiiOptions = [];
|
||||
interfaceDiv.querySelectorAll('#private-pii-options-' + id.split('-')[1] + ' input[type="checkbox"]:checked').forEach(checkbox => {
|
||||
privatePiiOptions.push(checkbox.getAttribute('data-value'));
|
||||
});
|
||||
|
||||
// Get any "Other" descriptions
|
||||
const otherPublicDesc = interfaceDiv.querySelector('#other-pii-public-' + id.split('-')[1])?.value || '';
|
||||
const otherPrivateDesc = interfaceDiv.querySelector('#other-pii-private-' + id.split('-')[1])?.value || '';
|
||||
|
||||
const pdfPath = interfaceDiv.getAttribute('data-pdf-path');
|
||||
|
||||
const datastore = await fetchDatastore() || {};
|
||||
datastore[id] = {
|
||||
feedbackOption: feedbackOption,
|
||||
piiDescription: piiDescription,
|
||||
primaryOption: primaryOption,
|
||||
publicPiiOptions: publicPiiOptions,
|
||||
privatePiiOptions: privatePiiOptions,
|
||||
otherPublicDesc: otherPublicDesc,
|
||||
otherPrivateDesc: otherPrivateDesc,
|
||||
pdfPath: pdfPath
|
||||
};
|
||||
|
||||
await putDatastore(datastore);
|
||||
}
|
||||
|
||||
function toggleFeedbackOption(btn) {
|
||||
function togglePrimaryOption(btn, index) {
|
||||
const interfaceDiv = btn.closest('.annotation-interface');
|
||||
// Remove active class from all feedback option buttons in this group
|
||||
interfaceDiv.querySelectorAll('button.feedback-option').forEach(function(b) {
|
||||
// Remove active class from all primary option buttons in this group
|
||||
interfaceDiv.querySelectorAll('button.primary-option').forEach(function(b) {
|
||||
b.classList.remove('active');
|
||||
});
|
||||
|
||||
// Toggle on the clicked button
|
||||
btn.classList.add('active');
|
||||
saveFeedback(interfaceDiv);
|
||||
|
||||
// Show or hide textarea based on selected option
|
||||
const textarea = interfaceDiv.querySelector('textarea');
|
||||
const feedbackOption = btn.getAttribute('data-value');
|
||||
// Hide all secondary option containers
|
||||
document.querySelector(`#public-pii-options-${index}`).style.display = 'none';
|
||||
document.querySelector(`#private-pii-options-${index}`).style.display = 'none';
|
||||
|
||||
if (feedbackOption === 'yes-pii') {
|
||||
// Only show textarea if "Yes PII" is selected
|
||||
const option = btn.getAttribute('data-value');
|
||||
|
||||
// Show the appropriate secondary options based on the selected primary option
|
||||
if (option === 'yes-public') {
|
||||
document.querySelector(`#public-pii-options-${index}`).style.display = 'block';
|
||||
} else if (option === 'no-public') {
|
||||
document.querySelector(`#private-pii-options-${index}`).style.display = 'block';
|
||||
} else {
|
||||
// For "cannot-read" or "report-content", just save and move to next
|
||||
saveFeedback(interfaceDiv);
|
||||
goToNextDocument();
|
||||
}
|
||||
}
|
||||
|
||||
// Show or hide the free-text description box that sits in the same
// question container as the toggled "Other" checkbox, then persist the
// checkbox state.
//
// checkbox: the "Other" checkbox input whose change event fired.
function toggleOtherTextarea(checkbox) {
    // Use the textarea element found inside the container directly; the
    // original read its id and looked the same element up again by id.
    const container = checkbox.closest('.question-container');
    const textarea = container.querySelector('textarea');

    if (checkbox.checked) {
        // Reveal the description box and put the cursor in it.
        textarea.style.display = 'block';
        textarea.focus();
    } else {
        // NOTE(review): unchecking "Other" hides the box and also advances
        // to the next document. Behavior kept as-is; confirm it is intended.
        textarea.style.display = 'none';
        goToNextDocument();
    }

    // Persist the updated checkbox selection for this annotation interface.
    saveCheckboxes(checkbox);
}
|
||||
|
||||
// Persist the annotation state after a checkbox change.
//
// input: the checkbox that changed; its nearest enclosing
// .annotation-interface element is handed to saveFeedback.
function saveCheckboxes(input) {
    saveFeedback(input.closest('.annotation-interface'));
}
|
||||
|
||||
// Function to deobfuscate the Prolific code
|
||||
@ -849,23 +1034,64 @@ def create_html_output(random_pages, pdf_s3_client, output_path, workspace_path,
|
||||
|
||||
document.querySelectorAll('.annotation-interface').forEach(function(interfaceDiv) {
|
||||
const id = interfaceDiv.getAttribute('data-id');
|
||||
const pageIndex = id.split('-')[1];
|
||||
|
||||
if (datastore[id]) {
|
||||
const data = datastore[id];
|
||||
// Set active state for feedback option buttons
|
||||
interfaceDiv.querySelectorAll('button.feedback-option').forEach(function(btn) {
|
||||
if (btn.getAttribute('data-value') === data.feedbackOption) {
|
||||
|
||||
// Set active state for primary option buttons
|
||||
interfaceDiv.querySelectorAll('button.primary-option').forEach(function(btn) {
|
||||
if (btn.getAttribute('data-value') === data.primaryOption) {
|
||||
btn.classList.add('active');
|
||||
|
||||
// Show textarea if "Yes PII" is selected
|
||||
if (btn.getAttribute('data-value') === 'yes-pii') {
|
||||
interfaceDiv.querySelector('textarea').style.display = 'block';
|
||||
// Show the appropriate secondary options
|
||||
const option = btn.getAttribute('data-value');
|
||||
if (option === 'yes-public') {
|
||||
document.querySelector(`#public-pii-options-${pageIndex}`).style.display = 'block';
|
||||
} else if (option === 'no-public') {
|
||||
document.querySelector(`#private-pii-options-${pageIndex}`).style.display = 'block';
|
||||
}
|
||||
} else {
|
||||
btn.classList.remove('active');
|
||||
}
|
||||
});
|
||||
// Set the textarea value
|
||||
interfaceDiv.querySelector('textarea').value = data.piiDescription;
|
||||
|
||||
// Restore public PII checkboxes
|
||||
if (data.publicPiiOptions && data.publicPiiOptions.length > 0) {
|
||||
const publicContainer = document.querySelector(`#public-pii-options-${pageIndex}`);
|
||||
data.publicPiiOptions.forEach(option => {
|
||||
const checkbox = publicContainer.querySelector(`input[data-value="${option}"]`);
|
||||
if (checkbox) {
|
||||
checkbox.checked = true;
|
||||
if (option === 'other') {
|
||||
document.getElementById(`other-pii-public-${pageIndex}`).style.display = 'block';
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
// Restore private PII checkboxes
|
||||
if (data.privatePiiOptions && data.privatePiiOptions.length > 0) {
|
||||
const privateContainer = document.querySelector(`#private-pii-options-${pageIndex}`);
|
||||
data.privatePiiOptions.forEach(option => {
|
||||
const checkbox = privateContainer.querySelector(`input[data-value="${option}"]`);
|
||||
if (checkbox) {
|
||||
checkbox.checked = true;
|
||||
if (option === 'other') {
|
||||
document.getElementById(`other-pii-private-${pageIndex}`).style.display = 'block';
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
// Set the textarea values
|
||||
if (data.otherPublicDesc) {
|
||||
document.getElementById(`other-pii-public-${pageIndex}`).value = data.otherPublicDesc;
|
||||
}
|
||||
|
||||
if (data.otherPrivateDesc) {
|
||||
document.getElementById(`other-pii-private-${pageIndex}`).value = data.otherPrivateDesc;
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
@ -873,7 +1099,7 @@ def create_html_output(random_pages, pdf_s3_client, output_path, workspace_path,
|
||||
let lastAnnotatedIndex = -1;
|
||||
for (let i = 0; i < totalPages; i++) {
|
||||
const pageId = `page-${i}`;
|
||||
if (datastore[pageId] && datastore[pageId].feedbackOption) {
|
||||
if (datastore[pageId] && datastore[pageId].primaryOption) {
|
||||
lastAnnotatedIndex = i;
|
||||
}
|
||||
}
|
||||
@ -979,40 +1205,98 @@ def fetch_annotations(tinyhost_link: str) -> Tuple[Dict[str, Any], str]:
|
||||
def _document_result(
    base_item: Dict[str, Any], annotation: Dict[str, Any], options_key: str, description_key: str
) -> Dict[str, Any]:
    """Extend base_item with the PII fields recorded for a public/private document page."""
    # A missing, null or empty selection all mean "no PII selected".
    pii_types = annotation.get(options_key) or []
    # The free-text description is only kept when the "other" option was ticked.
    description = annotation.get(description_key, "") if "other" in pii_types else ""
    return {
        **base_item,
        "pii_types": pii_types,
        "has_pii": bool(pii_types),
        "description": description,
    }


def process_annotations(annotations_by_link: List[Tuple[Dict[str, Any], str]]) -> Dict[str, List[Dict[str, Any]]]:
    """Process and categorize annotations by the annotator's primary answer.

    Args:
        annotations_by_link: ``(annotations, link)`` pairs. ``annotations`` maps
            a page id to that page's annotation dict (or a falsy value when the
            page was never annotated); ``link`` is copied into every result item.

    Returns:
        A dict with the keys ``public_document``, ``private_document``,
        ``cannot_read``, ``report_content`` and ``no_annotation``. Every item
        holds ``page_id``, ``link`` and ``pdf_path``; items in the two document
        buckets additionally hold ``pii_types``, ``has_pii`` and ``description``.
    """
    results = {
        "public_document": [],
        "private_document": [],
        "cannot_read": [],
        "report_content": [],
        "no_annotation": [],
    }

    # Process each annotation
    for annotations, link in annotations_by_link:
        for page_id, annotation in annotations.items():
            if not annotation or "primaryOption" not in annotation:
                results["no_annotation"].append(
                    {"page_id": page_id, "link": link, "pdf_path": annotation.get("pdfPath", "Unknown") if annotation else "Unknown"}
                )
                continue

            primary_option = annotation["primaryOption"]
            base_item = {
                "page_id": page_id,
                "link": link,
                "pdf_path": annotation.get("pdfPath", "Unknown"),
            }

            if primary_option == "yes-public":
                # Public document, with or without PII selected
                results["public_document"].append(
                    _document_result(base_item, annotation, "publicPiiOptions", "otherPublicDesc")
                )
            elif primary_option == "no-public":
                # Private document, with or without PII selected
                results["private_document"].append(
                    _document_result(base_item, annotation, "privatePiiOptions", "otherPrivateDesc")
                )
            elif primary_option == "cannot-read":
                results["cannot_read"].append(base_item)
            elif primary_option == "report-content":
                results["report_content"].append(base_item)
            else:
                # Unrecognised answer: count the page as not annotated
                results["no_annotation"].append(base_item)

    return results
|
||||
|
||||
@ -1025,23 +1309,74 @@ def print_annotation_report(annotation_results: Dict[str, List[Dict[str, Any]]])
|
||||
print(f"ANNOTATION REPORT - Total Pages: {total_pages}")
|
||||
print("=" * 80)
|
||||
|
||||
# Count pages with PII in public documents
|
||||
public_with_pii = [page for page in annotation_results['public_document'] if page.get('has_pii', False)]
|
||||
public_without_pii = [page for page in annotation_results['public_document'] if not page.get('has_pii', False)]
|
||||
|
||||
# Count pages with PII in private documents
|
||||
private_with_pii = [page for page in annotation_results['private_document'] if page.get('has_pii', False)]
|
||||
private_without_pii = [page for page in annotation_results['private_document'] if not page.get('has_pii', False)]
|
||||
|
||||
# Print summary statistics
|
||||
print("\nSummary:")
|
||||
print(f" Pages with PII: {len(annotation_results['yes_pii'])} ({len(annotation_results['yes_pii'])/total_pages*100:.1f}%)")
|
||||
print(f" Pages without PII: {len(annotation_results['no_pii'])} ({len(annotation_results['no_pii'])/total_pages*100:.1f}%)")
|
||||
print(f" Public documents (total): {len(annotation_results['public_document'])} ({len(annotation_results['public_document'])/total_pages*100:.1f}% of all pages)")
|
||||
print(f" - With PII: {len(public_with_pii)} ({len(public_with_pii)/max(1, len(annotation_results['public_document']))*100:.1f}% of public docs)")
|
||||
print(f" - Without PII: {len(public_without_pii)} ({len(public_without_pii)/max(1, len(annotation_results['public_document']))*100:.1f}% of public docs)")
|
||||
|
||||
print(f" Private documents (total): {len(annotation_results['private_document'])} ({len(annotation_results['private_document'])/total_pages*100:.1f}% of all pages)")
|
||||
print(f" - With PII: {len(private_with_pii)} ({len(private_with_pii)/max(1, len(annotation_results['private_document']))*100:.1f}% of private docs)")
|
||||
print(f" - Without PII: {len(private_without_pii)} ({len(private_without_pii)/max(1, len(annotation_results['private_document']))*100:.1f}% of private docs)")
|
||||
|
||||
print(f" Unreadable pages: {len(annotation_results['cannot_read'])} ({len(annotation_results['cannot_read'])/total_pages*100:.1f}%)")
|
||||
print(f" Pages with disturbing content: {len(annotation_results['disturbing'])} ({len(annotation_results['disturbing'])/total_pages*100:.1f}%)")
|
||||
print(f" Pages with reported content: {len(annotation_results['report_content'])} ({len(annotation_results['report_content'])/total_pages*100:.1f}%)")
|
||||
print(f" Pages without annotation: {len(annotation_results['no_annotation'])} ({len(annotation_results['no_annotation'])/total_pages*100:.1f}%)")
|
||||
|
||||
# Print detailed report for pages with PII
|
||||
if annotation_results["yes_pii"]:
|
||||
print("\nDetailed Report - Pages with PII:")
|
||||
# Analyze PII types in public documents
|
||||
if public_with_pii:
|
||||
pii_counts_public = {}
|
||||
for page in public_with_pii:
|
||||
for pii_type in page.get('pii_types', []):
|
||||
pii_counts_public[pii_type] = pii_counts_public.get(pii_type, 0) + 1
|
||||
|
||||
print("\nPII Types in Public Documents:")
|
||||
for pii_type, count in sorted(pii_counts_public.items(), key=lambda x: x[1], reverse=True):
|
||||
print(f" - {pii_type}: {count} ({count/len(public_with_pii)*100:.1f}%)")
|
||||
|
||||
# Analyze PII types in private documents
|
||||
if private_with_pii:
|
||||
pii_counts_private = {}
|
||||
for page in private_with_pii:
|
||||
for pii_type in page.get('pii_types', []):
|
||||
pii_counts_private[pii_type] = pii_counts_private.get(pii_type, 0) + 1
|
||||
|
||||
print("\nPII Types in Private Documents:")
|
||||
for pii_type, count in sorted(pii_counts_private.items(), key=lambda x: x[1], reverse=True):
|
||||
print(f" - {pii_type}: {count} ({count/len(private_with_pii)*100:.1f}%)")
|
||||
|
||||
# Print detailed report for public documents with PII
|
||||
if public_with_pii:
|
||||
print("\nDetailed Report - Public Documents with PII:")
|
||||
print("-" * 80)
|
||||
for i, item in enumerate(annotation_results["yes_pii"], 1):
|
||||
for i, item in enumerate(public_with_pii, 1):
|
||||
print(f"{i}. PDF: {item['pdf_path']}")
|
||||
print(f" Page ID: {item['page_id']}")
|
||||
print(f" Link: {item['link']}#{item['page_id']}")
|
||||
print(f" Description: {item['description']}")
|
||||
print(f" PII Types: {', '.join(item['pii_types'])}")
|
||||
if item.get('description'):
|
||||
print(f" Description: {item['description']}")
|
||||
print("-" * 80)
|
||||
|
||||
# Print detailed report for private documents with PII
|
||||
if private_with_pii:
|
||||
print("\nDetailed Report - Private Documents with PII:")
|
||||
print("-" * 80)
|
||||
for i, item in enumerate(private_with_pii, 1):
|
||||
print(f"{i}. PDF: {item['pdf_path']}")
|
||||
print(f" Page ID: {item['page_id']}")
|
||||
print(f" Link: {item['link']}#{item['page_id']}")
|
||||
print(f" PII Types: {', '.join(item['pii_types'])}")
|
||||
if item.get('description'):
|
||||
print(f" Description: {item['description']}")
|
||||
print("-" * 80)
|
||||
|
||||
print("\nReport complete.")
|
||||
@ -1084,11 +1419,32 @@ def read_and_process_results(args):
|
||||
|
||||
with open(output_file, "w", newline="") as f:
|
||||
writer = csv.writer(f)
|
||||
writer.writerow(["Category", "PDF Path", "Page ID", "Link", "Description"])
|
||||
writer.writerow(["Category", "PDF Path", "Page ID", "Link", "Document Type", "PII Types", "Description"])
|
||||
|
||||
for category, items in annotation_results.items():
|
||||
for item in items:
|
||||
writer.writerow([category, item["pdf_path"], item["page_id"], f"{item['link']}#{item['page_id']}", item.get("description", "")])
|
||||
if category == "public_document":
|
||||
doc_type = "Public"
|
||||
pii_types = ", ".join(item.get("pii_types", []))
|
||||
description = item.get("description", "")
|
||||
elif category == "private_document":
|
||||
doc_type = "Private"
|
||||
pii_types = ", ".join(item.get("pii_types", []))
|
||||
description = item.get("description", "")
|
||||
else:
|
||||
doc_type = ""
|
||||
pii_types = ""
|
||||
description = ""
|
||||
|
||||
writer.writerow([
|
||||
category,
|
||||
item["pdf_path"],
|
||||
item["page_id"],
|
||||
f"{item['link']}#{item['page_id']}",
|
||||
doc_type,
|
||||
pii_types,
|
||||
description
|
||||
])
|
||||
|
||||
print(f"Report saved to {output_file}")
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user