mirror of
				https://github.com/allenai/olmocr.git
				synced 2025-11-04 03:56:16 +00:00 
			
		
		
		
	Script fixups
This commit is contained in:
		
							parent
							
								
									505e08cbb1
								
							
						
					
					
						commit
						11e89dcd22
					
				@ -9,14 +9,17 @@ import syntok.tokenizer as tokenizer
 | 
			
		||||
def parse_sentences(text: str) -> list[str]:
    """
    Splits a text into a list of sentence strings using syntok.
    Preserves original spacing and punctuation.

    Args:
        text: The raw text to segment.

    Returns:
        One string per sentence, in the order syntok yields them across all
        paragraphs, each with leading whitespace removed.
    """
    sentences = []
    for paragraph in segmenter.process(text):
        for sentence in paragraph:
            # Rebuild the sentence by prepending each token's `spacing` to its
            # `value`, rather than joining values with a single space, so the
            # spacing recorded on the tokens is kept.
            sentence_str = "".join(token.spacing + token.value for token in sentence)
            # Trim any leading whitespace contributed by the first token's spacing
            sentences.append(sentence_str.lstrip())
    return sentences
 | 
			
		||||
 | 
			
		||||
@ -26,6 +29,8 @@ def compare_votes_for_file(base_text: str, candidate_texts: list[str]) -> None:
 | 
			
		||||
    each candidate text (using a similarity threshold). If any candidate sentences
 | 
			
		||||
    differ from the base sentence, prints the base sentence along with each unique
 | 
			
		||||
    variant and the number of times it was chosen.
 | 
			
		||||
    
 | 
			
		||||
    Comparison is case-insensitive, but output preserves original capitalization.
 | 
			
		||||
    """
 | 
			
		||||
    base_sentences = parse_sentences(base_text)
 | 
			
		||||
    # Parse all candidate texts into lists of sentences
 | 
			
		||||
@ -38,18 +43,19 @@ def compare_votes_for_file(base_text: str, candidate_texts: list[str]) -> None:
 | 
			
		||||
            best_candidate = None
 | 
			
		||||
 | 
			
		||||
            # Find the candidate sentence with the highest similarity to b_sentence
 | 
			
		||||
            # using case-insensitive comparison
 | 
			
		||||
            for c_sentence in c_sentences:
 | 
			
		||||
                ratio = SequenceMatcher(None, b_sentence, c_sentence).ratio()
 | 
			
		||||
                ratio = SequenceMatcher(None, b_sentence.lower(), c_sentence.lower()).ratio()
 | 
			
		||||
                if ratio > best_ratio:
 | 
			
		||||
                    best_ratio = ratio
 | 
			
		||||
                    best_candidate = c_sentence
 | 
			
		||||
                    best_candidate = c_sentence  # Keep original capitalization for output
 | 
			
		||||
 | 
			
		||||
            # Append the candidate if it passes the similarity threshold (e.g., 0.7)
 | 
			
		||||
            if best_ratio > 0.7 and best_candidate is not None:
 | 
			
		||||
                votes.append(best_candidate)
 | 
			
		||||
 | 
			
		||||
        # Only consider variants that differ from the base sentence
 | 
			
		||||
        variant_votes = [vote for vote in votes if vote != b_sentence]
 | 
			
		||||
        # Only consider variants that differ when compared case-insensitively
 | 
			
		||||
        variant_votes = [vote for vote in votes if vote.lower() != b_sentence.lower()]
 | 
			
		||||
        if variant_votes:
 | 
			
		||||
            print("Base Sentence:")
 | 
			
		||||
            print(b_sentence)
 | 
			
		||||
@ -99,4 +105,4 @@ def main():
 | 
			
		||||
        print("=" * 80)
 | 
			
		||||
 | 
			
		||||
# Invoke main() exactly once, and only when this file is executed as a script
# (not when it is imported as a module).
if __name__ == "__main__":
    main()
 | 
			
		||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user