Roman Isecke 135aa65906
update ingest pipeline to share ingest docs via multiprocessing.manager.dict (#1814)
### Description
* If the contents of a doc were updated by the process of
reading/downloading it, the update was not being persisted. To fix this, the
data being passed around was changed to use a multiprocessing-safe dict
rather than a JSON string. That dict is now updated after the
`get_file` method is called.
* Wikipedia connector was updated to use a static filename rather than
one requiring a call to fetch data.
* The read config param `re_download` was not being leveraged by the
source node; this was fixed.
* Added fix: the order of chunking and embedding was reversed so that
chunking runs before embedding.

---------

Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: rbiseck3 <rbiseck3@users.noreply.github.com>
2023-10-25 22:04:27 +00:00

357 lines
12 KiB
JSON
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

[
{
"type": "Table",
"element_id": "10b5ef18a3c7fb1d7436b2e1b256e5b9",
"metadata": {
"data_source": {
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1540126",
"version": "1",
"record_locator": {
"url": "https://unstructured-ingest-test.atlassian.net",
"page_id": "1540126"
},
"date_created": "2023-07-09T12:55:50.911000",
"date_modified": "2023-07-09T12:56:10.564000"
},
"filetype": "text/html",
"languages": [
"eng"
],
"page_number": 1,
"text_as_html": "<table><br><tbody><br><tr><td>Driver </td><td> </td><td> </td><td> </td><td> </td><td> </td></tr><br><tr><td>Approver </td><td> </td><td> </td><td> </td><td> </td><td> </td></tr><br><tr><td>Contributors</td><td> </td><td> </td><td> </td><td> </td><td> </td></tr><br><tr><td>Informed </td><td> </td><td> </td><td> </td><td> </td><td> </td></tr><br><tr><td>Objective </td><td> </td><td> </td><td> </td><td> </td><td> </td></tr><br><tr><td>Due date </td><td> </td><td> </td><td> </td><td> </td><td> </td></tr><br><tr><td>Key outcomes</td><td> </td><td> </td><td> </td><td> </td><td> </td></tr><br><tr><td>Status </td><td>NOT STARTED</td><td>/</td><td>IN PROGRESS</td><td>/</td><td>COMPLETE</td></tr><br></tbody><br></table>"
},
"text": "Driver Approver Contributors Informed Objective Due date Key outcomes Status NOT STARTED / IN PROGRESS / COMPLETE"
},
{
"type": "Title",
"element_id": "4e2022d4483a407d85060675f64fbe17",
"metadata": {
"data_source": {
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1540126",
"version": "1",
"record_locator": {
"url": "https://unstructured-ingest-test.atlassian.net",
"page_id": "1540126"
},
"date_created": "2023-07-09T12:55:50.911000",
"date_modified": "2023-07-09T12:56:10.564000"
},
"filetype": "text/html",
"languages": [
"eng"
],
"page_number": 1
},
"text": "\\uD83E\\uDD14 Problem Statement"
},
{
"type": "Title",
"element_id": "81163675915a75217e4116686fdca412",
"metadata": {
"data_source": {
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1540126",
"version": "1",
"record_locator": {
"url": "https://unstructured-ingest-test.atlassian.net",
"page_id": "1540126"
},
"date_created": "2023-07-09T12:55:50.911000",
"date_modified": "2023-07-09T12:56:10.564000"
},
"filetype": "text/html",
"languages": [
"eng"
],
"page_number": 1
},
"text": "🎯 Scope"
},
{
"type": "Table",
"element_id": "f1f364fbde77afa0e99e8ea7ab4f7c3f",
"metadata": {
"data_source": {
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1540126",
"version": "1",
"record_locator": {
"url": "https://unstructured-ingest-test.atlassian.net",
"page_id": "1540126"
},
"date_created": "2023-07-09T12:55:50.911000",
"date_modified": "2023-07-09T12:56:10.564000"
},
"filetype": "text/html",
"languages": [
"eng"
],
"page_number": 1,
"text_as_html": "<table><br><tbody><br><tr><td>Must have: </td></tr><br><tr><td>Nice to have:</td></tr><br><tr><td>Not in scope:</td></tr><br></tbody><br></table>"
},
"text": "Must have: Nice to have: Not in scope:"
},
{
"type": "Title",
"element_id": "e8b61a28d07e977379b42df455a1cde4",
"metadata": {
"data_source": {
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1540126",
"version": "1",
"record_locator": {
"url": "https://unstructured-ingest-test.atlassian.net",
"page_id": "1540126"
},
"date_created": "2023-07-09T12:55:50.911000",
"date_modified": "2023-07-09T12:56:10.564000"
},
"filetype": "text/html",
"languages": [
"eng"
],
"page_number": 1
},
"text": "\\uD83D\\uDDD3 Timeline"
},
{
"type": "Title",
"element_id": "5043f71fbc70e35c0be413d4135be99f",
"metadata": {
"data_source": {
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1540126",
"version": "1",
"record_locator": {
"url": "https://unstructured-ingest-test.atlassian.net",
"page_id": "1540126"
},
"date_created": "2023-07-09T12:55:50.911000",
"date_modified": "2023-07-09T12:56:10.564000"
},
"filetype": "text/html",
"languages": [
"eng"
],
"page_number": 1
},
"text": "Lane 1"
},
{
"type": "Title",
"element_id": "d5a2e177c588bf0c4f914baa4fae85b6",
"metadata": {
"data_source": {
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1540126",
"version": "1",
"record_locator": {
"url": "https://unstructured-ingest-test.atlassian.net",
"page_id": "1540126"
},
"date_created": "2023-07-09T12:55:50.911000",
"date_modified": "2023-07-09T12:56:10.564000"
},
"filetype": "text/html",
"languages": [
"eng"
],
"page_number": 1
},
"text": "Lane 2"
},
{
"type": "Title",
"element_id": "c98ba1acbd22a15ddddfc244cbd8a2db",
"metadata": {
"data_source": {
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1540126",
"version": "1",
"record_locator": {
"url": "https://unstructured-ingest-test.atlassian.net",
"page_id": "1540126"
},
"date_created": "2023-07-09T12:55:50.911000",
"date_modified": "2023-07-09T12:56:10.564000"
},
"filetype": "text/html",
"languages": [
"eng"
],
"page_number": 1
},
"text": "Feature 1"
},
{
"type": "Title",
"element_id": "e04620c8b3b611b3fefecef89baa63a9",
"metadata": {
"data_source": {
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1540126",
"version": "1",
"record_locator": {
"url": "https://unstructured-ingest-test.atlassian.net",
"page_id": "1540126"
},
"date_created": "2023-07-09T12:55:50.911000",
"date_modified": "2023-07-09T12:56:10.564000"
},
"filetype": "text/html",
"languages": [
"eng"
],
"page_number": 1
},
"text": "Feature 2"
},
{
"type": "Title",
"element_id": "82e522a86692cc50ee5c020c8e6ce6a0",
"metadata": {
"data_source": {
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1540126",
"version": "1",
"record_locator": {
"url": "https://unstructured-ingest-test.atlassian.net",
"page_id": "1540126"
},
"date_created": "2023-07-09T12:55:50.911000",
"date_modified": "2023-07-09T12:56:10.564000"
},
"filetype": "text/html",
"languages": [
"eng"
],
"page_number": 1
},
"text": "Feature 3"
},
{
"type": "Title",
"element_id": "822f7c45ea725c535970aab819a8ff10",
"metadata": {
"data_source": {
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1540126",
"version": "1",
"record_locator": {
"url": "https://unstructured-ingest-test.atlassian.net",
"page_id": "1540126"
},
"date_created": "2023-07-09T12:55:50.911000",
"date_modified": "2023-07-09T12:56:10.564000"
},
"filetype": "text/html",
"languages": [
"eng"
],
"page_number": 1
},
"text": "Feature 4"
},
{
"type": "Title",
"element_id": "6e0f6eca4ff17d3377c1c3e8e1f73457",
"metadata": {
"data_source": {
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1540126",
"version": "1",
"record_locator": {
"url": "https://unstructured-ingest-test.atlassian.net",
"page_id": "1540126"
},
"date_created": "2023-07-09T12:55:50.911000",
"date_modified": "2023-07-09T12:56:10.564000"
},
"filetype": "text/html",
"languages": [
"eng"
],
"page_number": 1
},
"text": "iOS app"
},
{
"type": "Title",
"element_id": "0b60fe04b3c5c3c76371b6eca8b19c8e",
"metadata": {
"data_source": {
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1540126",
"version": "1",
"record_locator": {
"url": "https://unstructured-ingest-test.atlassian.net",
"page_id": "1540126"
},
"date_created": "2023-07-09T12:55:50.911000",
"date_modified": "2023-07-09T12:56:10.564000"
},
"filetype": "text/html",
"languages": [
"eng"
],
"page_number": 1
},
"text": "Android app"
},
{
"type": "Title",
"element_id": "e1cc184f345d146586fb12527c4fa696",
"metadata": {
"data_source": {
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1540126",
"version": "1",
"record_locator": {
"url": "https://unstructured-ingest-test.atlassian.net",
"page_id": "1540126"
},
"date_created": "2023-07-09T12:55:50.911000",
"date_modified": "2023-07-09T12:56:10.564000"
},
"filetype": "text/html",
"languages": [
"eng"
],
"page_number": 1
},
"text": "\\uD83D\\uDEA9 Milestones and deadlines"
},
{
"type": "Table",
"element_id": "3f4ea3840d79521680c89a91dcd883cf",
"metadata": {
"data_source": {
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1540126",
"version": "1",
"record_locator": {
"url": "https://unstructured-ingest-test.atlassian.net",
"page_id": "1540126"
},
"date_created": "2023-07-09T12:55:50.911000",
"date_modified": "2023-07-09T12:56:10.564000"
},
"filetype": "text/html",
"languages": [
"eng"
],
"page_number": 1,
"text_as_html": "<table><br><tbody><br><tr><td>Milestone</td><td>Owner</td><td>Deadline</td><td>Status</td></tr><br><tr><td> </td><td> </td><td> </td><td> </td></tr><br><tr><td> </td><td> </td><td> </td><td> </td></tr><br><tr><td> </td><td> </td><td> </td><td> </td></tr><br></tbody><br></table>"
},
"text": "Milestone Owner Deadline Status"
},
{
"type": "Title",
"element_id": "890c9b6d8d69ca1de5fd7a8b83fe78ff",
"metadata": {
"data_source": {
"url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1540126",
"version": "1",
"record_locator": {
"url": "https://unstructured-ingest-test.atlassian.net",
"page_id": "1540126"
},
"date_created": "2023-07-09T12:55:50.911000",
"date_modified": "2023-07-09T12:56:10.564000"
},
"filetype": "text/html",
"languages": [
"eng"
],
"page_number": 1
},
"text": "\\uD83D\\uDD17 Reference materials"
}
]