from unstructured.chunking.title import (
    _split_elements_by_title_and_table,
    chunk_by_title,
)
from unstructured.documents.elements import (
    CheckBox,
    CompositeElement,
    ElementMetadata,
    Table,
    Text,
    Title,
)

# NOTE(review): every Table in this module carries a lone newline as its text.
# The literal was previously split across physical lines (a syntax error); it
# is written as the escape "\n" here. Confirm it was not meant to hold table
# markup.


def test_split_elements_by_title_and_table():
    """Sections break at every Title and Table; a trailing CheckBox stands alone."""
    elements = [
        Title("A Great Day"),
        Text("Today is a great day."),
        Text("It is sunny outside."),
        Table("\n"),
        Title("An Okay Day"),
        Text("Today is an okay day."),
        Text("It is rainy outside."),
        Title("A Bad Day"),
        Text("Today is a bad day."),
        Text("It is storming outside."),
        CheckBox(),
    ]

    sections = _split_elements_by_title_and_table(elements, combine_under_n_chars=0)

    assert sections == [
        [
            Title("A Great Day"),
            Text("Today is a great day."),
            Text("It is sunny outside."),
        ],
        [
            Table("\n"),
        ],
        [
            Title("An Okay Day"),
            Text("Today is an okay day."),
            Text("It is rainy outside."),
        ],
        [
            Title("A Bad Day"),
            Text("Today is a bad day."),
            Text("It is storming outside."),
        ],
        [
            CheckBox(),
        ],
    ]


def test_chunk_by_title():
    """Each titled section becomes one CompositeElement with merged metadata.

    Tables and check boxes are expected to pass through as-is, and
    ``regex_metadata`` offsets are expected to be relative to the combined
    chunk text.
    """
    elements = [
        Title("A Great Day", metadata=ElementMetadata(emphasized_text_contents=["Day"])),
        Text("Today is a great day.", metadata=ElementMetadata(emphasized_text_contents=["day"])),
        Text("It is sunny outside."),
        Table("\n"),
        Title("An Okay Day"),
        Text("Today is an okay day."),
        Text("It is rainy outside."),
        Title("A Bad Day"),
        Text(
            "Today is a bad day.",
            metadata=ElementMetadata(regex_metadata=[{"text": "A", "start": 0, "end": 1}]),
        ),
        Text("It is storming outside."),
        CheckBox(),
    ]

    chunks = chunk_by_title(elements, combine_under_n_chars=0)

    assert chunks == [
        CompositeElement(
            "A Great Day\n\nToday is a great day.\n\nIt is sunny outside.",
        ),
        Table("\n"),
        CompositeElement("An Okay Day\n\nToday is an okay day.\n\nIt is rainy outside."),
        CompositeElement(
            "A Bad Day\n\nToday is a bad day.\n\nIt is storming outside.",
        ),
        CheckBox(),
    ]

    # List-valued metadata from the section's elements is concatenated in order.
    assert chunks[0].metadata == ElementMetadata(emphasized_text_contents=["Day", "day"])
    # The match sat at 0..1 in its own element; inside the chunk it follows
    # "A Bad Day\n\n" (11 characters), so the offsets shift to 11..12.
    assert chunks[3].metadata == ElementMetadata(
        regex_metadata=[{"text": "A", "start": 11, "end": 12}],
    )


def test_chunk_by_title_respects_section_change():
    """A change in ``metadata.section`` starts a new chunk, even under one title."""
    elements = [
        Title("A Great Day", metadata=ElementMetadata(section="first")),
        Text("Today is a great day.", metadata=ElementMetadata(section="second")),
        Text("It is sunny outside.", metadata=ElementMetadata(section="second")),
        Table("\n"),
        Title("An Okay Day"),
        Text("Today is an okay day."),
        Text("It is rainy outside."),
        Title("A Bad Day"),
        Text(
            "Today is a bad day.",
            metadata=ElementMetadata(regex_metadata=[{"text": "A", "start": 0, "end": 1}]),
        ),
        Text("It is storming outside."),
        CheckBox(),
    ]

    chunks = chunk_by_title(elements, combine_under_n_chars=0)

    assert chunks == [
        # The title is in section "first", the following text in "second".
        CompositeElement(
            "A Great Day",
        ),
        CompositeElement(
            "Today is a great day.\n\nIt is sunny outside.",
        ),
        Table("\n"),
        CompositeElement("An Okay Day\n\nToday is an okay day.\n\nIt is rainy outside."),
        CompositeElement(
            "A Bad Day\n\nToday is a bad day.\n\nIt is storming outside.",
        ),
        CheckBox(),
    ]


def test_chunk_by_title_separates_by_page_number():
    """With ``multipage_sections=False`` a page change starts a new chunk."""
    elements = [
        Title("A Great Day", metadata=ElementMetadata(page_number=1)),
        Text("Today is a great day.", metadata=ElementMetadata(page_number=2)),
        Text("It is sunny outside.", metadata=ElementMetadata(page_number=2)),
        Table("\n"),
        Title("An Okay Day"),
        Text("Today is an okay day."),
        Text("It is rainy outside."),
        Title("A Bad Day"),
        Text(
            "Today is a bad day.",
            metadata=ElementMetadata(regex_metadata=[{"text": "A", "start": 0, "end": 1}]),
        ),
        Text("It is storming outside."),
        CheckBox(),
    ]

    chunks = chunk_by_title(elements, multipage_sections=False, combine_under_n_chars=0)

    assert chunks == [
        # The title is on page 1, the following text on page 2.
        CompositeElement(
            "A Great Day",
        ),
        CompositeElement(
            "Today is a great day.\n\nIt is sunny outside.",
        ),
        Table("\n"),
        CompositeElement("An Okay Day\n\nToday is an okay day.\n\nIt is rainy outside."),
        CompositeElement(
            "A Bad Day\n\nToday is a bad day.\n\nIt is storming outside.",
        ),
        CheckBox(),
    ]


def test_chunk_by_title_groups_across_pages():
    """With ``multipage_sections=True`` a section may span a page change."""
    elements = [
        Title("A Great Day", metadata=ElementMetadata(page_number=1)),
        Text("Today is a great day.", metadata=ElementMetadata(page_number=2)),
        Text("It is sunny outside.", metadata=ElementMetadata(page_number=2)),
        Table("\n"),
        Title("An Okay Day"),
        Text("Today is an okay day."),
        Text("It is rainy outside."),
        Title("A Bad Day"),
        Text(
            "Today is a bad day.",
            metadata=ElementMetadata(regex_metadata=[{"text": "A", "start": 0, "end": 1}]),
        ),
        Text("It is storming outside."),
        CheckBox(),
    ]

    chunks = chunk_by_title(elements, multipage_sections=True, combine_under_n_chars=0)

    assert chunks == [
        # Page 1 title and page 2 text stay together in one chunk.
        CompositeElement(
            "A Great Day\n\nToday is a great day.\n\nIt is sunny outside.",
        ),
        Table("\n"),
        CompositeElement("An Okay Day\n\nToday is an okay day.\n\nIt is rainy outside."),
        CompositeElement(
            "A Bad Day\n\nToday is a bad day.\n\nIt is storming outside.",
        ),
        CheckBox(),
    ]