mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-07-12 19:45:56 +00:00

### Description * If the contents of a doc were updated by the process of reading/downloading it, this was not being persisted. To fix this, the data being passed around was updated to use a multiprocessing safe dict rather than the json string. Now that dict is updated after the `get_file` method is called. * Wikipedia connector was updated to use a static filename rather than one requiring a call to fetch data. * The read config param `re_download` was not being leveraged by the source node, this was fixed. * Added fix: chunking and embedding order reversed so chunking runs before embeddings --------- Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: rbiseck3 <rbiseck3@users.noreply.github.com>
245 lines
7.1 KiB
JSON
245 lines
7.1 KiB
JSON
[
|
||
{
|
||
"type": "Title",
|
||
"element_id": "35054d4d1455c734e83a868656b4ad16",
|
||
"metadata": {
|
||
"data_source": {
|
||
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605928",
|
||
"version": "1",
|
||
"record_locator": {
|
||
"url": "https://unstructured-ingest-test.atlassian.net",
|
||
"page_id": "1605928"
|
||
},
|
||
"date_created": "2023-07-09T12:54:45.162000",
|
||
"date_modified": "2023-07-09T12:54:45.162000"
|
||
},
|
||
"filetype": "text/html",
|
||
"languages": [
|
||
"eng"
|
||
],
|
||
"page_number": 1
|
||
},
|
||
"text": "\\uD83D\\uDDD3 Date"
|
||
},
|
||
{
|
||
"type": "Title",
|
||
"element_id": "0126c1353ddd7c8dfdb29f252a64a344",
|
||
"metadata": {
|
||
"data_source": {
|
||
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605928",
|
||
"version": "1",
|
||
"record_locator": {
|
||
"url": "https://unstructured-ingest-test.atlassian.net",
|
||
"page_id": "1605928"
|
||
},
|
||
"date_created": "2023-07-09T12:54:45.162000",
|
||
"date_modified": "2023-07-09T12:54:45.162000"
|
||
},
|
||
"filetype": "text/html",
|
||
"languages": [
|
||
"eng"
|
||
],
|
||
"page_number": 1
|
||
},
|
||
"text": "\\uD83D\\uDC65 Participants"
|
||
},
|
||
{
|
||
"type": "ListItem",
|
||
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
|
||
"metadata": {
|
||
"data_source": {
|
||
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605928",
|
||
"version": "1",
|
||
"record_locator": {
|
||
"url": "https://unstructured-ingest-test.atlassian.net",
|
||
"page_id": "1605928"
|
||
},
|
||
"date_created": "2023-07-09T12:54:45.162000",
|
||
"date_modified": "2023-07-09T12:54:45.162000"
|
||
},
|
||
"filetype": "text/html",
|
||
"languages": [
|
||
"eng"
|
||
],
|
||
"page_number": 1
|
||
},
|
||
"text": ""
|
||
},
|
||
{
|
||
"type": "ListItem",
|
||
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
|
||
"metadata": {
|
||
"data_source": {
|
||
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605928",
|
||
"version": "1",
|
||
"record_locator": {
|
||
"url": "https://unstructured-ingest-test.atlassian.net",
|
||
"page_id": "1605928"
|
||
},
|
||
"date_created": "2023-07-09T12:54:45.162000",
|
||
"date_modified": "2023-07-09T12:54:45.162000"
|
||
},
|
||
"filetype": "text/html",
|
||
"languages": [
|
||
"eng"
|
||
],
|
||
"page_number": 1
|
||
},
|
||
"text": ""
|
||
},
|
||
{
|
||
"type": "Title",
|
||
"element_id": "fa64ff027cbc0c6929bc75d3c78c94c3",
|
||
"metadata": {
|
||
"data_source": {
|
||
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605928",
|
||
"version": "1",
|
||
"record_locator": {
|
||
"url": "https://unstructured-ingest-test.atlassian.net",
|
||
"page_id": "1605928"
|
||
},
|
||
"date_created": "2023-07-09T12:54:45.162000",
|
||
"date_modified": "2023-07-09T12:54:45.162000"
|
||
},
|
||
"filetype": "text/html",
|
||
"languages": [
|
||
"eng"
|
||
],
|
||
"page_number": 1
|
||
},
|
||
"text": "\\uD83E\\uDD45 Goals"
|
||
},
|
||
{
|
||
"type": "ListItem",
|
||
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
|
||
"metadata": {
|
||
"data_source": {
|
||
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605928",
|
||
"version": "1",
|
||
"record_locator": {
|
||
"url": "https://unstructured-ingest-test.atlassian.net",
|
||
"page_id": "1605928"
|
||
},
|
||
"date_created": "2023-07-09T12:54:45.162000",
|
||
"date_modified": "2023-07-09T12:54:45.162000"
|
||
},
|
||
"filetype": "text/html",
|
||
"languages": [
|
||
"eng"
|
||
],
|
||
"page_number": 1
|
||
},
|
||
"text": ""
|
||
},
|
||
{
|
||
"type": "Title",
|
||
"element_id": "537ea1b14dcba1742bdbd4a5fbfb488c",
|
||
"metadata": {
|
||
"data_source": {
|
||
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605928",
|
||
"version": "1",
|
||
"record_locator": {
|
||
"url": "https://unstructured-ingest-test.atlassian.net",
|
||
"page_id": "1605928"
|
||
},
|
||
"date_created": "2023-07-09T12:54:45.162000",
|
||
"date_modified": "2023-07-09T12:54:45.162000"
|
||
},
|
||
"filetype": "text/html",
|
||
"languages": [
|
||
"eng"
|
||
],
|
||
"page_number": 1
|
||
},
|
||
"text": "\\uD83D\\uDDE3 Discussion topics"
|
||
},
|
||
{
|
||
"type": "Table",
|
||
"element_id": "37af06e8e75d96a448a00026754b7942",
|
||
"metadata": {
|
||
"data_source": {
|
||
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605928",
|
||
"version": "1",
|
||
"record_locator": {
|
||
"url": "https://unstructured-ingest-test.atlassian.net",
|
||
"page_id": "1605928"
|
||
},
|
||
"date_created": "2023-07-09T12:54:45.162000",
|
||
"date_modified": "2023-07-09T12:54:45.162000"
|
||
},
|
||
"filetype": "text/html",
|
||
"languages": [
|
||
"eng"
|
||
],
|
||
"page_number": 1,
|
||
"text_as_html": "<table><br><tbody><br><tr><td>Time</td><td>Item</td><td>Presenter</td><td>Notes</td></tr><br><tr><td> </td><td> </td><td> </td><td> </td></tr><br><tr><td> </td><td> </td><td> </td><td> </td></tr><br></tbody><br></table>"
|
||
},
|
||
"text": "Time Item Presenter Notes"
|
||
},
|
||
{
|
||
"type": "Title",
|
||
"element_id": "f158a8eaf72c7e9511d5e8ee03692652",
|
||
"metadata": {
|
||
"data_source": {
|
||
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605928",
|
||
"version": "1",
|
||
"record_locator": {
|
||
"url": "https://unstructured-ingest-test.atlassian.net",
|
||
"page_id": "1605928"
|
||
},
|
||
"date_created": "2023-07-09T12:54:45.162000",
|
||
"date_modified": "2023-07-09T12:54:45.162000"
|
||
},
|
||
"filetype": "text/html",
|
||
"languages": [
|
||
"eng"
|
||
],
|
||
"page_number": 1
|
||
},
|
||
"text": "✅ Action items"
|
||
},
|
||
{
|
||
"type": "ListItem",
|
||
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
|
||
"metadata": {
|
||
"data_source": {
|
||
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605928",
|
||
"version": "1",
|
||
"record_locator": {
|
||
"url": "https://unstructured-ingest-test.atlassian.net",
|
||
"page_id": "1605928"
|
||
},
|
||
"date_created": "2023-07-09T12:54:45.162000",
|
||
"date_modified": "2023-07-09T12:54:45.162000"
|
||
},
|
||
"filetype": "text/html",
|
||
"languages": [
|
||
"eng"
|
||
],
|
||
"page_number": 1
|
||
},
|
||
"text": ""
|
||
},
|
||
{
|
||
"type": "Title",
|
||
"element_id": "addb0aa08f77b69fa754ba55c6600c8a",
|
||
"metadata": {
|
||
"data_source": {
|
||
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605928",
|
||
"version": "1",
|
||
"record_locator": {
|
||
"url": "https://unstructured-ingest-test.atlassian.net",
|
||
"page_id": "1605928"
|
||
},
|
||
"date_created": "2023-07-09T12:54:45.162000",
|
||
"date_modified": "2023-07-09T12:54:45.162000"
|
||
},
|
||
"filetype": "text/html",
|
||
"languages": [
|
||
"eng"
|
||
],
|
||
"page_number": 1
|
||
},
|
||
"text": "⤴ Decisions"
|
||
}
|
||
] |