mirror of
				https://github.com/allenai/olmocr.git
				synced 2025-11-04 03:56:16 +00:00 
			
		
		
		
	Workspace stuff
This commit is contained in:
		
							parent
							
								
									36d6228ffa
								
							
						
					
					
						commit
						569311c461
					
				@ -102,7 +102,19 @@ def parse_jsonl_entry(entry: Dict) -> Optional[Dict]:
 | 
			
		||||
            logger.warning(f"Entry for {source_file} missing pdf_page_numbers")
 | 
			
		||||
            return None
 | 
			
		||||
 | 
			
		||||
        return {"id": entry.get("id", ""), "text": text, "source_file": source_file, "metadata": metadata, "pdf_page_numbers": pdf_page_numbers}
 | 
			
		||||
        # Extract PAGE_RESPONSE_COLUMNS data from attributes
 | 
			
		||||
        page_response_data = {}
 | 
			
		||||
        for column in PAGE_RESPONSE_COLUMNS:
 | 
			
		||||
            page_response_data[column] = attributes.get(column, [])
 | 
			
		||||
 | 
			
		||||
        return {
 | 
			
		||||
            "id": entry.get("id", ""),
 | 
			
		||||
            "text": text,
 | 
			
		||||
            "source_file": source_file,
 | 
			
		||||
            "metadata": metadata,
 | 
			
		||||
            "pdf_page_numbers": pdf_page_numbers,
 | 
			
		||||
            "page_response_data": page_response_data
 | 
			
		||||
        }
 | 
			
		||||
    except Exception as e:
 | 
			
		||||
        logger.error(f"Error parsing JSONL entry: {e}")
 | 
			
		||||
        return None
 | 
			
		||||
@ -178,6 +190,7 @@ def process_document(entry_data: Dict, output_dir: Path, cache_dir: Path) -> Tup
 | 
			
		||||
    doc_id = entry_data["id"]
 | 
			
		||||
    full_text = entry_data["text"]
 | 
			
		||||
    pdf_page_numbers = entry_data["pdf_page_numbers"]
 | 
			
		||||
    page_response_data = entry_data.get("page_response_data", {})
 | 
			
		||||
 | 
			
		||||
    # Extract page texts
 | 
			
		||||
    page_texts = extract_page_text(full_text, pdf_page_numbers)
 | 
			
		||||
@ -210,9 +223,17 @@ def process_document(entry_data: Dict, output_dir: Path, cache_dir: Path) -> Tup
 | 
			
		||||
 | 
			
		||||
    doc_dir.mkdir(parents=True, exist_ok=True)
 | 
			
		||||
 | 
			
		||||
    # Create a mapping from page numbers to their indices
 | 
			
		||||
    page_num_to_index = {}
 | 
			
		||||
    for idx, (_, _, page_num) in enumerate(pdf_page_numbers):
 | 
			
		||||
        page_num_to_index[page_num] = idx
 | 
			
		||||
 | 
			
		||||
    # Process each page
 | 
			
		||||
    for page_num, page_text in page_texts.items():
 | 
			
		||||
        try:
 | 
			
		||||
            # Get the index for this page number
 | 
			
		||||
            page_index = page_num_to_index.get(page_num)
 | 
			
		||||
 | 
			
		||||
            # Create filenames
 | 
			
		||||
            base_name = f"{doc_id}_page{page_num}"
 | 
			
		||||
            md_path = doc_dir / f"{base_name}.md"
 | 
			
		||||
@ -224,11 +245,16 @@ def process_document(entry_data: Dict, output_dir: Path, cache_dir: Path) -> Tup
 | 
			
		||||
                f.write("---\n")
 | 
			
		||||
 | 
			
		||||
                # Write PAGE_RESPONSE_COLUMNS fields in order
 | 
			
		||||
                metadata = entry_data.get("metadata", {})
 | 
			
		||||
                for column in PAGE_RESPONSE_COLUMNS:
 | 
			
		||||
                    # Check if field exists in metadata
 | 
			
		||||
                    if column in metadata:
 | 
			
		||||
                        value = metadata[column]
 | 
			
		||||
                    # Get the value for this page from the attributes lists
 | 
			
		||||
                    column_values = page_response_data.get(column, [])
 | 
			
		||||
 | 
			
		||||
                    if page_index is not None and page_index < len(column_values):
 | 
			
		||||
                        value = column_values[page_index]
 | 
			
		||||
                        # Convert None to null for YAML
 | 
			
		||||
                        if value is None:
 | 
			
		||||
                            f.write(f"{column}: null\n")
 | 
			
		||||
                        else:
 | 
			
		||||
                            f.write(f"{column}: {value}\n")
 | 
			
		||||
                    else:
 | 
			
		||||
                        # Write default values for missing fields
 | 
			
		||||
 | 
			
		||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user