mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-07-16 05:25:55 +00:00

### Description * If the contents of a doc were updated by the process of reading/downloading it, this was not being persisted. To fix this, the data being passed around was updated to use a multiprocessing safe dict rather than the json string. Now that dict is updated after the `get_file` method is called. * Wikipedia connector was updated to use a static filename rather than one requiring a call to fetch data. * The read config param `re_download` was not being leveraged by the source node, this was fixed. * Added fix: chunking and embedding order reversed so chunking runs before embeddings --------- Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: rbiseck3 <rbiseck3@users.noreply.github.com>
775 lines
23 KiB
JSON
775 lines
23 KiB
JSON
[
|
|
{
|
|
"type": "Title",
|
|
"element_id": "79f952030bc16724f51bcd4264fa9e60",
|
|
"metadata": {
|
|
"data_source": {
|
|
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605989",
|
|
"version": "1",
|
|
"record_locator": {
|
|
"url": "https://unstructured-ingest-test.atlassian.net",
|
|
"page_id": "1605989"
|
|
},
|
|
"date_created": "2023-07-09T12:56:40.842000",
|
|
"date_modified": "2023-07-09T12:57:59.173000"
|
|
},
|
|
"filetype": "text/html",
|
|
"languages": [
|
|
"eng",
|
|
"fra"
|
|
],
|
|
"page_number": 1
|
|
},
|
|
"text": "testtext3 testtext3 testtext3 testtext3 testtext3 testtext3 testtext3 testtext3 testtext3 testtext3"
|
|
},
|
|
{
|
|
"type": "Title",
|
|
"element_id": "79f952030bc16724f51bcd4264fa9e60",
|
|
"metadata": {
|
|
"data_source": {
|
|
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605989",
|
|
"version": "1",
|
|
"record_locator": {
|
|
"url": "https://unstructured-ingest-test.atlassian.net",
|
|
"page_id": "1605989"
|
|
},
|
|
"date_created": "2023-07-09T12:56:40.842000",
|
|
"date_modified": "2023-07-09T12:57:59.173000"
|
|
},
|
|
"filetype": "text/html",
|
|
"languages": [
|
|
"eng",
|
|
"fra"
|
|
],
|
|
"page_number": 1
|
|
},
|
|
"text": "testtext3 testtext3 testtext3 testtext3 testtext3 testtext3 testtext3 testtext3 testtext3 testtext3"
|
|
},
|
|
{
|
|
"type": "Title",
|
|
"element_id": "79f952030bc16724f51bcd4264fa9e60",
|
|
"metadata": {
|
|
"data_source": {
|
|
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605989",
|
|
"version": "1",
|
|
"record_locator": {
|
|
"url": "https://unstructured-ingest-test.atlassian.net",
|
|
"page_id": "1605989"
|
|
},
|
|
"date_created": "2023-07-09T12:56:40.842000",
|
|
"date_modified": "2023-07-09T12:57:59.173000"
|
|
},
|
|
"filetype": "text/html",
|
|
"languages": [
|
|
"eng",
|
|
"fra"
|
|
],
|
|
"page_number": 1
|
|
},
|
|
"text": "testtext3 testtext3 testtext3 testtext3 testtext3 testtext3 testtext3 testtext3 testtext3 testtext3"
|
|
},
|
|
{
|
|
"type": "Title",
|
|
"element_id": "79f952030bc16724f51bcd4264fa9e60",
|
|
"metadata": {
|
|
"data_source": {
|
|
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605989",
|
|
"version": "1",
|
|
"record_locator": {
|
|
"url": "https://unstructured-ingest-test.atlassian.net",
|
|
"page_id": "1605989"
|
|
},
|
|
"date_created": "2023-07-09T12:56:40.842000",
|
|
"date_modified": "2023-07-09T12:57:59.173000"
|
|
},
|
|
"filetype": "text/html",
|
|
"languages": [
|
|
"eng",
|
|
"fra"
|
|
],
|
|
"page_number": 1
|
|
},
|
|
"text": "testtext3 testtext3 testtext3 testtext3 testtext3 testtext3 testtext3 testtext3 testtext3 testtext3"
|
|
},
|
|
{
|
|
"type": "Title",
|
|
"element_id": "79f952030bc16724f51bcd4264fa9e60",
|
|
"metadata": {
|
|
"data_source": {
|
|
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605989",
|
|
"version": "1",
|
|
"record_locator": {
|
|
"url": "https://unstructured-ingest-test.atlassian.net",
|
|
"page_id": "1605989"
|
|
},
|
|
"date_created": "2023-07-09T12:56:40.842000",
|
|
"date_modified": "2023-07-09T12:57:59.173000"
|
|
},
|
|
"filetype": "text/html",
|
|
"languages": [
|
|
"eng",
|
|
"fra"
|
|
],
|
|
"page_number": 1
|
|
},
|
|
"text": "testtext3 testtext3 testtext3 testtext3 testtext3 testtext3 testtext3 testtext3 testtext3 testtext3"
|
|
},
|
|
{
|
|
"type": "Title",
|
|
"element_id": "79f952030bc16724f51bcd4264fa9e60",
|
|
"metadata": {
|
|
"data_source": {
|
|
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605989",
|
|
"version": "1",
|
|
"record_locator": {
|
|
"url": "https://unstructured-ingest-test.atlassian.net",
|
|
"page_id": "1605989"
|
|
},
|
|
"date_created": "2023-07-09T12:56:40.842000",
|
|
"date_modified": "2023-07-09T12:57:59.173000"
|
|
},
|
|
"filetype": "text/html",
|
|
"languages": [
|
|
"eng",
|
|
"fra"
|
|
],
|
|
"page_number": 1
|
|
},
|
|
"text": "testtext3 testtext3 testtext3 testtext3 testtext3 testtext3 testtext3 testtext3 testtext3 testtext3"
|
|
},
|
|
{
|
|
"type": "Title",
|
|
"element_id": "79f952030bc16724f51bcd4264fa9e60",
|
|
"metadata": {
|
|
"data_source": {
|
|
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605989",
|
|
"version": "1",
|
|
"record_locator": {
|
|
"url": "https://unstructured-ingest-test.atlassian.net",
|
|
"page_id": "1605989"
|
|
},
|
|
"date_created": "2023-07-09T12:56:40.842000",
|
|
"date_modified": "2023-07-09T12:57:59.173000"
|
|
},
|
|
"filetype": "text/html",
|
|
"languages": [
|
|
"eng",
|
|
"fra"
|
|
],
|
|
"page_number": 1
|
|
},
|
|
"text": "testtext3 testtext3 testtext3 testtext3 testtext3 testtext3 testtext3 testtext3 testtext3 testtext3"
|
|
},
|
|
{
|
|
"type": "Title",
|
|
"element_id": "79f952030bc16724f51bcd4264fa9e60",
|
|
"metadata": {
|
|
"data_source": {
|
|
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605989",
|
|
"version": "1",
|
|
"record_locator": {
|
|
"url": "https://unstructured-ingest-test.atlassian.net",
|
|
"page_id": "1605989"
|
|
},
|
|
"date_created": "2023-07-09T12:56:40.842000",
|
|
"date_modified": "2023-07-09T12:57:59.173000"
|
|
},
|
|
"filetype": "text/html",
|
|
"languages": [
|
|
"eng",
|
|
"fra"
|
|
],
|
|
"page_number": 1
|
|
},
|
|
"text": "testtext3 testtext3 testtext3 testtext3 testtext3 testtext3 testtext3 testtext3 testtext3 testtext3"
|
|
},
|
|
{
|
|
"type": "Title",
|
|
"element_id": "79f952030bc16724f51bcd4264fa9e60",
|
|
"metadata": {
|
|
"data_source": {
|
|
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605989",
|
|
"version": "1",
|
|
"record_locator": {
|
|
"url": "https://unstructured-ingest-test.atlassian.net",
|
|
"page_id": "1605989"
|
|
},
|
|
"date_created": "2023-07-09T12:56:40.842000",
|
|
"date_modified": "2023-07-09T12:57:59.173000"
|
|
},
|
|
"filetype": "text/html",
|
|
"languages": [
|
|
"eng",
|
|
"fra"
|
|
],
|
|
"page_number": 1
|
|
},
|
|
"text": "testtext3 testtext3 testtext3 testtext3 testtext3 testtext3 testtext3 testtext3 testtext3 testtext3"
|
|
},
|
|
{
|
|
"type": "Title",
|
|
"element_id": "79f952030bc16724f51bcd4264fa9e60",
|
|
"metadata": {
|
|
"data_source": {
|
|
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605989",
|
|
"version": "1",
|
|
"record_locator": {
|
|
"url": "https://unstructured-ingest-test.atlassian.net",
|
|
"page_id": "1605989"
|
|
},
|
|
"date_created": "2023-07-09T12:56:40.842000",
|
|
"date_modified": "2023-07-09T12:57:59.173000"
|
|
},
|
|
"filetype": "text/html",
|
|
"languages": [
|
|
"eng",
|
|
"fra"
|
|
],
|
|
"page_number": 1
|
|
},
|
|
"text": "testtext3 testtext3 testtext3 testtext3 testtext3 testtext3 testtext3 testtext3 testtext3 testtext3"
|
|
},
|
|
{
|
|
"type": "Title",
|
|
"element_id": "79f952030bc16724f51bcd4264fa9e60",
|
|
"metadata": {
|
|
"data_source": {
|
|
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605989",
|
|
"version": "1",
|
|
"record_locator": {
|
|
"url": "https://unstructured-ingest-test.atlassian.net",
|
|
"page_id": "1605989"
|
|
},
|
|
"date_created": "2023-07-09T12:56:40.842000",
|
|
"date_modified": "2023-07-09T12:57:59.173000"
|
|
},
|
|
"filetype": "text/html",
|
|
"languages": [
|
|
"eng",
|
|
"fra"
|
|
],
|
|
"page_number": 1
|
|
},
|
|
"text": "testtext3 testtext3 testtext3 testtext3 testtext3 testtext3 testtext3 testtext3 testtext3 testtext3"
|
|
},
|
|
{
|
|
"type": "Title",
|
|
"element_id": "79f952030bc16724f51bcd4264fa9e60",
|
|
"metadata": {
|
|
"data_source": {
|
|
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605989",
|
|
"version": "1",
|
|
"record_locator": {
|
|
"url": "https://unstructured-ingest-test.atlassian.net",
|
|
"page_id": "1605989"
|
|
},
|
|
"date_created": "2023-07-09T12:56:40.842000",
|
|
"date_modified": "2023-07-09T12:57:59.173000"
|
|
},
|
|
"filetype": "text/html",
|
|
"languages": [
|
|
"eng",
|
|
"fra"
|
|
],
|
|
"page_number": 1
|
|
},
|
|
"text": "testtext3 testtext3 testtext3 testtext3 testtext3 testtext3 testtext3 testtext3 testtext3 testtext3"
|
|
},
|
|
{
|
|
"type": "Title",
|
|
"element_id": "79f952030bc16724f51bcd4264fa9e60",
|
|
"metadata": {
|
|
"data_source": {
|
|
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605989",
|
|
"version": "1",
|
|
"record_locator": {
|
|
"url": "https://unstructured-ingest-test.atlassian.net",
|
|
"page_id": "1605989"
|
|
},
|
|
"date_created": "2023-07-09T12:56:40.842000",
|
|
"date_modified": "2023-07-09T12:57:59.173000"
|
|
},
|
|
"filetype": "text/html",
|
|
"languages": [
|
|
"eng",
|
|
"fra"
|
|
],
|
|
"page_number": 1
|
|
},
|
|
"text": "testtext3 testtext3 testtext3 testtext3 testtext3 testtext3 testtext3 testtext3 testtext3 testtext3"
|
|
},
|
|
{
|
|
"type": "Title",
|
|
"element_id": "79f952030bc16724f51bcd4264fa9e60",
|
|
"metadata": {
|
|
"data_source": {
|
|
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605989",
|
|
"version": "1",
|
|
"record_locator": {
|
|
"url": "https://unstructured-ingest-test.atlassian.net",
|
|
"page_id": "1605989"
|
|
},
|
|
"date_created": "2023-07-09T12:56:40.842000",
|
|
"date_modified": "2023-07-09T12:57:59.173000"
|
|
},
|
|
"filetype": "text/html",
|
|
"languages": [
|
|
"eng",
|
|
"fra"
|
|
],
|
|
"page_number": 1
|
|
},
|
|
"text": "testtext3 testtext3 testtext3 testtext3 testtext3 testtext3 testtext3 testtext3 testtext3 testtext3"
|
|
},
|
|
{
|
|
"type": "ListItem",
|
|
"element_id": "7be2188650903e8cb47b4ad6834f2855",
|
|
"metadata": {
|
|
"data_source": {
|
|
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605989",
|
|
"version": "1",
|
|
"record_locator": {
|
|
"url": "https://unstructured-ingest-test.atlassian.net",
|
|
"page_id": "1605989"
|
|
},
|
|
"date_created": "2023-07-09T12:56:40.842000",
|
|
"date_modified": "2023-07-09T12:57:59.173000"
|
|
},
|
|
"filetype": "text/html",
|
|
"languages": [
|
|
"eng",
|
|
"fra"
|
|
],
|
|
"page_number": 1
|
|
},
|
|
"text": "Testdoc3 List Item 1Testdoc3 List Item 1 Nested Item ATestdoc3 List Item 1 Nested Item B"
|
|
},
|
|
{
|
|
"type": "ListItem",
|
|
"element_id": "150c2a750c17f7ba420ee3b2fa5ddd05",
|
|
"metadata": {
|
|
"data_source": {
|
|
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605989",
|
|
"version": "1",
|
|
"record_locator": {
|
|
"url": "https://unstructured-ingest-test.atlassian.net",
|
|
"page_id": "1605989"
|
|
},
|
|
"date_created": "2023-07-09T12:56:40.842000",
|
|
"date_modified": "2023-07-09T12:57:59.173000"
|
|
},
|
|
"filetype": "text/html",
|
|
"languages": [
|
|
"eng",
|
|
"fra"
|
|
],
|
|
"page_number": 1
|
|
},
|
|
"text": "Testdoc3 List Item 2"
|
|
},
|
|
{
|
|
"type": "ListItem",
|
|
"element_id": "58afe13278b87a97553da292ff2f4a5d",
|
|
"metadata": {
|
|
"data_source": {
|
|
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605989",
|
|
"version": "1",
|
|
"record_locator": {
|
|
"url": "https://unstructured-ingest-test.atlassian.net",
|
|
"page_id": "1605989"
|
|
},
|
|
"date_created": "2023-07-09T12:56:40.842000",
|
|
"date_modified": "2023-07-09T12:57:59.173000"
|
|
},
|
|
"filetype": "text/html",
|
|
"languages": [
|
|
"eng",
|
|
"fra"
|
|
],
|
|
"page_number": 1
|
|
},
|
|
"text": "Testdoc3 List Item 3"
|
|
},
|
|
{
|
|
"type": "ListItem",
|
|
"element_id": "b2aabb64faf2e75a269afcf0b9069769",
|
|
"metadata": {
|
|
"data_source": {
|
|
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605989",
|
|
"version": "1",
|
|
"record_locator": {
|
|
"url": "https://unstructured-ingest-test.atlassian.net",
|
|
"page_id": "1605989"
|
|
},
|
|
"date_created": "2023-07-09T12:56:40.842000",
|
|
"date_modified": "2023-07-09T12:57:59.173000"
|
|
},
|
|
"filetype": "text/html",
|
|
"languages": [
|
|
"eng",
|
|
"fra"
|
|
],
|
|
"page_number": 1
|
|
},
|
|
"text": "Testdoc3 List Item 4"
|
|
},
|
|
{
|
|
"type": "ListItem",
|
|
"element_id": "a823ee15866f03f883ff4c8e3fd74d0a",
|
|
"metadata": {
|
|
"data_source": {
|
|
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605989",
|
|
"version": "1",
|
|
"record_locator": {
|
|
"url": "https://unstructured-ingest-test.atlassian.net",
|
|
"page_id": "1605989"
|
|
},
|
|
"date_created": "2023-07-09T12:56:40.842000",
|
|
"date_modified": "2023-07-09T12:57:59.173000"
|
|
},
|
|
"filetype": "text/html",
|
|
"languages": [
|
|
"eng",
|
|
"fra"
|
|
],
|
|
"page_number": 1
|
|
},
|
|
"text": "Testdoc3 List Item 5"
|
|
},
|
|
{
|
|
"type": "NarrativeText",
|
|
"element_id": "3d15d9222ffb2770ceede5b6532e842a",
|
|
"metadata": {
|
|
"data_source": {
|
|
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605989",
|
|
"version": "1",
|
|
"record_locator": {
|
|
"url": "https://unstructured-ingest-test.atlassian.net",
|
|
"page_id": "1605989"
|
|
},
|
|
"date_created": "2023-07-09T12:56:40.842000",
|
|
"date_modified": "2023-07-09T12:57:59.173000"
|
|
},
|
|
"filetype": "text/html",
|
|
"languages": [
|
|
"eng",
|
|
"fra"
|
|
],
|
|
"page_number": 1,
|
|
"link_urls": [
|
|
"https://www.unstructured.io/"
|
|
],
|
|
"link_texts": [
|
|
"This is the link for unstructured . io."
|
|
]
|
|
},
|
|
"text": "This is the link for unstructured . io."
|
|
},
|
|
{
|
|
"type": "ListItem",
|
|
"element_id": "2afcc8bc3f0e6780b736c4c008407494",
|
|
"metadata": {
|
|
"data_source": {
|
|
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605989",
|
|
"version": "1",
|
|
"record_locator": {
|
|
"url": "https://unstructured-ingest-test.atlassian.net",
|
|
"page_id": "1605989"
|
|
},
|
|
"date_created": "2023-07-09T12:56:40.842000",
|
|
"date_modified": "2023-07-09T12:57:59.173000"
|
|
},
|
|
"filetype": "text/html",
|
|
"languages": [
|
|
"eng",
|
|
"fra"
|
|
],
|
|
"page_number": 1,
|
|
"emphasized_text_contents": [
|
|
"Testdoc3 Checklist Item 1"
|
|
],
|
|
"emphasized_text_tags": [
|
|
"span"
|
|
]
|
|
},
|
|
"text": "Testdoc3 Checklist Item 1"
|
|
},
|
|
{
|
|
"type": "ListItem",
|
|
"element_id": "7b1f1e10038f0059bc10eb5029387026",
|
|
"metadata": {
|
|
"data_source": {
|
|
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605989",
|
|
"version": "1",
|
|
"record_locator": {
|
|
"url": "https://unstructured-ingest-test.atlassian.net",
|
|
"page_id": "1605989"
|
|
},
|
|
"date_created": "2023-07-09T12:56:40.842000",
|
|
"date_modified": "2023-07-09T12:57:59.173000"
|
|
},
|
|
"filetype": "text/html",
|
|
"languages": [
|
|
"eng",
|
|
"fra"
|
|
],
|
|
"page_number": 1,
|
|
"emphasized_text_contents": [
|
|
"Testdoc3 Checklist Item 2 (checked)"
|
|
],
|
|
"emphasized_text_tags": [
|
|
"span"
|
|
]
|
|
},
|
|
"text": "Testdoc3 Checklist Item 2 (checked)"
|
|
},
|
|
{
|
|
"type": "ListItem",
|
|
"element_id": "63da5a7ac980c213a66c1d12b41850c9",
|
|
"metadata": {
|
|
"data_source": {
|
|
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605989",
|
|
"version": "1",
|
|
"record_locator": {
|
|
"url": "https://unstructured-ingest-test.atlassian.net",
|
|
"page_id": "1605989"
|
|
},
|
|
"date_created": "2023-07-09T12:56:40.842000",
|
|
"date_modified": "2023-07-09T12:57:59.173000"
|
|
},
|
|
"filetype": "text/html",
|
|
"languages": [
|
|
"eng",
|
|
"fra"
|
|
],
|
|
"page_number": 1,
|
|
"emphasized_text_contents": [
|
|
"Testdoc3 Checklist Item 3"
|
|
],
|
|
"emphasized_text_tags": [
|
|
"span"
|
|
]
|
|
},
|
|
"text": "Testdoc3 Checklist Item 3"
|
|
},
|
|
{
|
|
"type": "UncategorizedText",
|
|
"element_id": "86e0e9ce51725074a3dc41ed3f10c13c",
|
|
"metadata": {
|
|
"data_source": {
|
|
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605989",
|
|
"version": "1",
|
|
"record_locator": {
|
|
"url": "https://unstructured-ingest-test.atlassian.net",
|
|
"page_id": "1605989"
|
|
},
|
|
"date_created": "2023-07-09T12:56:40.842000",
|
|
"date_modified": "2023-07-09T12:57:59.173000"
|
|
},
|
|
"filetype": "text/html",
|
|
"languages": [
|
|
"eng",
|
|
"fra"
|
|
],
|
|
"page_number": 1
|
|
},
|
|
"text": "😃 😃 😃 😃 😃 😃 😃 😃 😃 😃 😃 😃"
|
|
},
|
|
{
|
|
"type": "NarrativeText",
|
|
"element_id": "7ea4536a8f68aea99e35afc364d1a66d",
|
|
"metadata": {
|
|
"data_source": {
|
|
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605989",
|
|
"version": "1",
|
|
"record_locator": {
|
|
"url": "https://unstructured-ingest-test.atlassian.net",
|
|
"page_id": "1605989"
|
|
},
|
|
"date_created": "2023-07-09T12:56:40.842000",
|
|
"date_modified": "2023-07-09T12:57:59.173000"
|
|
},
|
|
"filetype": "text/html",
|
|
"languages": [
|
|
"eng",
|
|
"fra"
|
|
],
|
|
"page_number": 1,
|
|
"emphasized_text_contents": [
|
|
"Testdoc3 bold text"
|
|
],
|
|
"emphasized_text_tags": [
|
|
"strong"
|
|
]
|
|
},
|
|
"text": "Testdoc3 bold text"
|
|
},
|
|
{
|
|
"type": "Title",
|
|
"element_id": "b6eca69d7b7cf8e67fca6fe167e1ae3d",
|
|
"metadata": {
|
|
"data_source": {
|
|
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605989",
|
|
"version": "1",
|
|
"record_locator": {
|
|
"url": "https://unstructured-ingest-test.atlassian.net",
|
|
"page_id": "1605989"
|
|
},
|
|
"date_created": "2023-07-09T12:56:40.842000",
|
|
"date_modified": "2023-07-09T12:57:59.173000"
|
|
},
|
|
"filetype": "text/html",
|
|
"languages": [
|
|
"eng",
|
|
"fra"
|
|
],
|
|
"page_number": 1,
|
|
"emphasized_text_contents": [
|
|
"Testdoc3 italic text"
|
|
],
|
|
"emphasized_text_tags": [
|
|
"em"
|
|
]
|
|
},
|
|
"text": "Testdoc3 italic text"
|
|
},
|
|
{
|
|
"type": "Title",
|
|
"element_id": "3e1de41c1e2ae05a3b445ca757d5dcad",
|
|
"metadata": {
|
|
"data_source": {
|
|
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605989",
|
|
"version": "1",
|
|
"record_locator": {
|
|
"url": "https://unstructured-ingest-test.atlassian.net",
|
|
"page_id": "1605989"
|
|
},
|
|
"date_created": "2023-07-09T12:56:40.842000",
|
|
"date_modified": "2023-07-09T12:57:59.173000"
|
|
},
|
|
"filetype": "text/html",
|
|
"languages": [
|
|
"eng",
|
|
"fra"
|
|
],
|
|
"page_number": 1
|
|
},
|
|
"text": "Testdoc3 Heading 1 Sized Text"
|
|
},
|
|
{
|
|
"type": "Title",
|
|
"element_id": "5c99ed47550665c65505f840d3b86b8f",
|
|
"metadata": {
|
|
"data_source": {
|
|
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605989",
|
|
"version": "1",
|
|
"record_locator": {
|
|
"url": "https://unstructured-ingest-test.atlassian.net",
|
|
"page_id": "1605989"
|
|
},
|
|
"date_created": "2023-07-09T12:56:40.842000",
|
|
"date_modified": "2023-07-09T12:57:59.173000"
|
|
},
|
|
"filetype": "text/html",
|
|
"languages": [
|
|
"eng",
|
|
"fra"
|
|
],
|
|
"page_number": 1
|
|
},
|
|
"text": "Testdoc3 Heading 2 Sized Text"
|
|
},
|
|
{
|
|
"type": "Title",
|
|
"element_id": "5595c0e3b14f2eac3834aa78a2fc1027",
|
|
"metadata": {
|
|
"data_source": {
|
|
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605989",
|
|
"version": "1",
|
|
"record_locator": {
|
|
"url": "https://unstructured-ingest-test.atlassian.net",
|
|
"page_id": "1605989"
|
|
},
|
|
"date_created": "2023-07-09T12:56:40.842000",
|
|
"date_modified": "2023-07-09T12:57:59.173000"
|
|
},
|
|
"filetype": "text/html",
|
|
"languages": [
|
|
"eng",
|
|
"fra"
|
|
],
|
|
"page_number": 1
|
|
},
|
|
"text": "Testdoc3 Heading 3 Sized Text"
|
|
},
|
|
{
|
|
"type": "Title",
|
|
"element_id": "0f55b2b1d85ed7f62e3bd852ba3d8c51",
|
|
"metadata": {
|
|
"data_source": {
|
|
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605989",
|
|
"version": "1",
|
|
"record_locator": {
|
|
"url": "https://unstructured-ingest-test.atlassian.net",
|
|
"page_id": "1605989"
|
|
},
|
|
"date_created": "2023-07-09T12:56:40.842000",
|
|
"date_modified": "2023-07-09T12:57:59.173000"
|
|
},
|
|
"filetype": "text/html",
|
|
"languages": [
|
|
"eng",
|
|
"fra"
|
|
],
|
|
"page_number": 1
|
|
},
|
|
"text": "Testdoc3 Heading 4 Sized Text"
|
|
},
|
|
{
|
|
"type": "Title",
|
|
"element_id": "d0218c7c71bb57c842286628e5a4e8d6",
|
|
"metadata": {
|
|
"data_source": {
|
|
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605989",
|
|
"version": "1",
|
|
"record_locator": {
|
|
"url": "https://unstructured-ingest-test.atlassian.net",
|
|
"page_id": "1605989"
|
|
},
|
|
"date_created": "2023-07-09T12:56:40.842000",
|
|
"date_modified": "2023-07-09T12:57:59.173000"
|
|
},
|
|
"filetype": "text/html",
|
|
"languages": [
|
|
"eng",
|
|
"fra"
|
|
],
|
|
"page_number": 1
|
|
},
|
|
"text": "Testdoc3 Heading 5 Sized Text"
|
|
},
|
|
{
|
|
"type": "Table",
|
|
"element_id": "5abf3e1bbc85012fe9e1d25966e00f5e",
|
|
"metadata": {
|
|
"data_source": {
|
|
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605989",
|
|
"version": "1",
|
|
"record_locator": {
|
|
"url": "https://unstructured-ingest-test.atlassian.net",
|
|
"page_id": "1605989"
|
|
},
|
|
"date_created": "2023-07-09T12:56:40.842000",
|
|
"date_modified": "2023-07-09T12:57:59.173000"
|
|
},
|
|
"filetype": "text/html",
|
|
"languages": [
|
|
"eng",
|
|
"fra"
|
|
],
|
|
"page_number": 1,
|
|
"text_as_html": "<table><br><tbody><br><tr><td>Testdoc3 Table: Column 1 Row 0</td><td>Testdoc3 Table: Column 2 Row 0</td><td>Testdoc3 Table: Column 3 Row 0</td></tr><br><tr><td>Testdoc3 Table: Column 1 Row 1</td><td>Testdoc3 Table: Column 2 Row 1</td><td>Testdoc3 Table: Column 3 Row 1</td></tr><br><tr><td>Testdoc3 Table: Column 1 Row 2</td><td>Testdoc3 Table: Column 2 Row 2</td><td>Testdoc3 Table: Column 3 Row 2</td></tr><br></tbody><br></table>"
|
|
},
|
|
"text": "Testdoc3 Table: Column 1 Row 0 Testdoc3 Table: Column 2 Row 0 Testdoc3 Table: Column 3 Row 0 Testdoc3 Table: Column 1 Row 1 Testdoc3 Table: Column 2 Row 1 Testdoc3 Table: Column 3 Row 1 Testdoc3 Table: Column 1 Row 2 Testdoc3 Table: Column 2 Row 2 Testdoc3 Table: Column 3 Row 2"
|
|
}
|
|
] |