mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-07-12 19:45:56 +00:00

### Description * If the contents of a doc were updated by the process of reading/downloading it, this was not being persisted. To fix this, the data being passed around was updated to use a multiprocessing safe dict rather than the json string. Now that dict is updated after the `get_file` method is called. * Wikipedia connector was updated to use a static filename rather than one requiring a call to fetch data. * The read config param `re_download` was not being leveraged by the source node, this was fixed. * Added fix: chunking and embedding order reversed so chunking runs before embeddings --------- Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: rbiseck3 <rbiseck3@users.noreply.github.com>
307 lines
8.4 KiB
JSON
307 lines
8.4 KiB
JSON
[
|
|
{
|
|
"type": "Title",
|
|
"element_id": "307afee17dac4c598e361c095338decd",
|
|
"metadata": {
|
|
"data_source": {
|
|
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605942",
|
|
"version": "1",
|
|
"record_locator": {
|
|
"url": "https://unstructured-ingest-test.atlassian.net",
|
|
"page_id": "1605942"
|
|
},
|
|
"date_created": "2023-07-09T12:54:45.226000",
|
|
"date_modified": "2023-07-09T12:54:45.226000"
|
|
},
|
|
"filetype": "text/html",
|
|
"languages": [
|
|
"eng"
|
|
],
|
|
"page_number": 1
|
|
},
|
|
"text": "Copy and paste this section for each week."
|
|
},
|
|
{
|
|
"type": "Title",
|
|
"element_id": "b980a145c5e8c9e233a0643366ba520a",
|
|
"metadata": {
|
|
"data_source": {
|
|
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605942",
|
|
"version": "1",
|
|
"record_locator": {
|
|
"url": "https://unstructured-ingest-test.atlassian.net",
|
|
"page_id": "1605942"
|
|
},
|
|
"date_created": "2023-07-09T12:54:45.226000",
|
|
"date_modified": "2023-07-09T12:54:45.226000"
|
|
},
|
|
"filetype": "text/html",
|
|
"languages": [
|
|
"eng"
|
|
],
|
|
"page_number": 1,
|
|
"emphasized_text_contents": [
|
|
"Win"
|
|
],
|
|
"emphasized_text_tags": [
|
|
"strong"
|
|
]
|
|
},
|
|
"text": "Win"
|
|
},
|
|
{
|
|
"type": "ListItem",
|
|
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
|
|
"metadata": {
|
|
"data_source": {
|
|
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605942",
|
|
"version": "1",
|
|
"record_locator": {
|
|
"url": "https://unstructured-ingest-test.atlassian.net",
|
|
"page_id": "1605942"
|
|
},
|
|
"date_created": "2023-07-09T12:54:45.226000",
|
|
"date_modified": "2023-07-09T12:54:45.226000"
|
|
},
|
|
"filetype": "text/html",
|
|
"languages": [
|
|
"eng"
|
|
],
|
|
"page_number": 1
|
|
},
|
|
"text": ""
|
|
},
|
|
{
|
|
"type": "ListItem",
|
|
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
|
|
"metadata": {
|
|
"data_source": {
|
|
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605942",
|
|
"version": "1",
|
|
"record_locator": {
|
|
"url": "https://unstructured-ingest-test.atlassian.net",
|
|
"page_id": "1605942"
|
|
},
|
|
"date_created": "2023-07-09T12:54:45.226000",
|
|
"date_modified": "2023-07-09T12:54:45.226000"
|
|
},
|
|
"filetype": "text/html",
|
|
"languages": [
|
|
"eng"
|
|
],
|
|
"page_number": 1
|
|
},
|
|
"text": ""
|
|
},
|
|
{
|
|
"type": "ListItem",
|
|
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
|
|
"metadata": {
|
|
"data_source": {
|
|
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605942",
|
|
"version": "1",
|
|
"record_locator": {
|
|
"url": "https://unstructured-ingest-test.atlassian.net",
|
|
"page_id": "1605942"
|
|
},
|
|
"date_created": "2023-07-09T12:54:45.226000",
|
|
"date_modified": "2023-07-09T12:54:45.226000"
|
|
},
|
|
"filetype": "text/html",
|
|
"languages": [
|
|
"eng"
|
|
],
|
|
"page_number": 1
|
|
},
|
|
"text": ""
|
|
},
|
|
{
|
|
"type": "Title",
|
|
"element_id": "aecc044c7725a6555114285dc28fe2d1",
|
|
"metadata": {
|
|
"data_source": {
|
|
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605942",
|
|
"version": "1",
|
|
"record_locator": {
|
|
"url": "https://unstructured-ingest-test.atlassian.net",
|
|
"page_id": "1605942"
|
|
},
|
|
"date_created": "2023-07-09T12:54:45.226000",
|
|
"date_modified": "2023-07-09T12:54:45.226000"
|
|
},
|
|
"filetype": "text/html",
|
|
"languages": [
|
|
"eng"
|
|
],
|
|
"page_number": 1,
|
|
"emphasized_text_contents": [
|
|
"Needs input"
|
|
],
|
|
"emphasized_text_tags": [
|
|
"strong"
|
|
]
|
|
},
|
|
"text": "Needs input"
|
|
},
|
|
{
|
|
"type": "ListItem",
|
|
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
|
|
"metadata": {
|
|
"data_source": {
|
|
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605942",
|
|
"version": "1",
|
|
"record_locator": {
|
|
"url": "https://unstructured-ingest-test.atlassian.net",
|
|
"page_id": "1605942"
|
|
},
|
|
"date_created": "2023-07-09T12:54:45.226000",
|
|
"date_modified": "2023-07-09T12:54:45.226000"
|
|
},
|
|
"filetype": "text/html",
|
|
"languages": [
|
|
"eng"
|
|
],
|
|
"page_number": 1
|
|
},
|
|
"text": ""
|
|
},
|
|
{
|
|
"type": "ListItem",
|
|
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
|
|
"metadata": {
|
|
"data_source": {
|
|
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605942",
|
|
"version": "1",
|
|
"record_locator": {
|
|
"url": "https://unstructured-ingest-test.atlassian.net",
|
|
"page_id": "1605942"
|
|
},
|
|
"date_created": "2023-07-09T12:54:45.226000",
|
|
"date_modified": "2023-07-09T12:54:45.226000"
|
|
},
|
|
"filetype": "text/html",
|
|
"languages": [
|
|
"eng"
|
|
],
|
|
"page_number": 1
|
|
},
|
|
"text": ""
|
|
},
|
|
{
|
|
"type": "ListItem",
|
|
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
|
|
"metadata": {
|
|
"data_source": {
|
|
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605942",
|
|
"version": "1",
|
|
"record_locator": {
|
|
"url": "https://unstructured-ingest-test.atlassian.net",
|
|
"page_id": "1605942"
|
|
},
|
|
"date_created": "2023-07-09T12:54:45.226000",
|
|
"date_modified": "2023-07-09T12:54:45.226000"
|
|
},
|
|
"filetype": "text/html",
|
|
"languages": [
|
|
"eng"
|
|
],
|
|
"page_number": 1
|
|
},
|
|
"text": ""
|
|
},
|
|
{
|
|
"type": "Title",
|
|
"element_id": "9d3cab2b5efed4eaef42a707dbc813da",
|
|
"metadata": {
|
|
"data_source": {
|
|
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605942",
|
|
"version": "1",
|
|
"record_locator": {
|
|
"url": "https://unstructured-ingest-test.atlassian.net",
|
|
"page_id": "1605942"
|
|
},
|
|
"date_created": "2023-07-09T12:54:45.226000",
|
|
"date_modified": "2023-07-09T12:54:45.226000"
|
|
},
|
|
"filetype": "text/html",
|
|
"languages": [
|
|
"eng"
|
|
],
|
|
"page_number": 1,
|
|
"emphasized_text_contents": [
|
|
"Focus"
|
|
],
|
|
"emphasized_text_tags": [
|
|
"strong"
|
|
]
|
|
},
|
|
"text": "Focus"
|
|
},
|
|
{
|
|
"type": "ListItem",
|
|
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
|
|
"metadata": {
|
|
"data_source": {
|
|
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605942",
|
|
"version": "1",
|
|
"record_locator": {
|
|
"url": "https://unstructured-ingest-test.atlassian.net",
|
|
"page_id": "1605942"
|
|
},
|
|
"date_created": "2023-07-09T12:54:45.226000",
|
|
"date_modified": "2023-07-09T12:54:45.226000"
|
|
},
|
|
"filetype": "text/html",
|
|
"languages": [
|
|
"eng"
|
|
],
|
|
"page_number": 1
|
|
},
|
|
"text": ""
|
|
},
|
|
{
|
|
"type": "ListItem",
|
|
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
|
|
"metadata": {
|
|
"data_source": {
|
|
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605942",
|
|
"version": "1",
|
|
"record_locator": {
|
|
"url": "https://unstructured-ingest-test.atlassian.net",
|
|
"page_id": "1605942"
|
|
},
|
|
"date_created": "2023-07-09T12:54:45.226000",
|
|
"date_modified": "2023-07-09T12:54:45.226000"
|
|
},
|
|
"filetype": "text/html",
|
|
"languages": [
|
|
"eng"
|
|
],
|
|
"page_number": 1
|
|
},
|
|
"text": ""
|
|
},
|
|
{
|
|
"type": "Table",
|
|
"element_id": "a240e43c0ae70731c65ae5430d2dab7f",
|
|
"metadata": {
|
|
"data_source": {
|
|
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605942",
|
|
"version": "1",
|
|
"record_locator": {
|
|
"url": "https://unstructured-ingest-test.atlassian.net",
|
|
"page_id": "1605942"
|
|
},
|
|
"date_created": "2023-07-09T12:54:45.226000",
|
|
"date_modified": "2023-07-09T12:54:45.226000"
|
|
},
|
|
"filetype": "text/html",
|
|
"languages": [
|
|
"eng"
|
|
],
|
|
"page_number": 1,
|
|
"text_as_html": "<table><br><tbody><br><tr><td>Notes </td></tr><br><tr><td>Important Links</td></tr><br></tbody><br></table>"
|
|
},
|
|
"text": "Notes Important Links"
|
|
}
|
|
] |