mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-10-11 16:13:24 +00:00
Adds data source properties to elasticsearch, wikipedia and google-drive (#1282)
This commit is contained in:
parent
92e18c3f58
commit
9a3e24fcbb
@ -1,9 +1,9 @@
|
||||
## 0.10.16-dev1
|
||||
## 0.10.16-dev2
|
||||
|
||||
|
||||
### Enhancements
|
||||
|
||||
* **Adds data source properties to Airtable, Confluence and Discord connectors** These properties (date_created, date_modified, version, source_url, record_locator) are written to element metadata during ingest, mapping elements to information about the document source from which they derive. This functionality enables downstream applications to reveal source document applications, e.g. a link to a GDrive doc, Salesforce record, etc.
|
||||
* **Adds data source properties to Airtable, Confluence, Discord, Elasticsearch, Google Drive, and Wikipedia connectors** These properties (date_created, date_modified, version, source_url, record_locator) are written to element metadata during ingest, mapping elements to information about the document source from which they derive. This functionality enables downstream applications to reveal source document applications, e.g. a link to a GDrive doc, Salesforce record, etc.
|
||||
|
||||
### Features
|
||||
|
||||
|
@ -3,7 +3,14 @@
|
||||
"type": "Title",
|
||||
"element_id": "0baaf6f25d1c5da82d9e4a3229cd45bd",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"data_source": {
|
||||
"version": 1,
|
||||
"record_locator": {
|
||||
"url": "http://localhost:9200",
|
||||
"index_name": "movies",
|
||||
"document_id": "0"
|
||||
}
|
||||
},
|
||||
"filetype": "text/plain"
|
||||
},
|
||||
"text": "American"
|
||||
@ -12,7 +19,14 @@
|
||||
"type": "Title",
|
||||
"element_id": "0d15a8bf4961deb609a392a8444e3520",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"data_source": {
|
||||
"version": 1,
|
||||
"record_locator": {
|
||||
"url": "http://localhost:9200",
|
||||
"index_name": "movies",
|
||||
"document_id": "0"
|
||||
}
|
||||
},
|
||||
"filetype": "text/plain"
|
||||
},
|
||||
"text": "Cecil Hepworth"
|
||||
@ -21,7 +35,14 @@
|
||||
"type": "NarrativeText",
|
||||
"element_id": "a1515877c1c63770057b2615cce25c5d",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"data_source": {
|
||||
"version": 1,
|
||||
"record_locator": {
|
||||
"url": "http://localhost:9200",
|
||||
"index_name": "movies",
|
||||
"document_id": "0"
|
||||
}
|
||||
},
|
||||
"filetype": "text/plain"
|
||||
},
|
||||
"text": "Alice follows a large white rabbit down a \"Rabbit-hole\". She finds a tiny door. When she finds a bottle labeled \"Drink me\", she does, and shrinks, but not enough to pass through the door. She then eats something labeled \"Eat me\" and grows larger. She finds a fan when enables her to shrink enough to get into the \"Garden\" and try to get a \"Dog\" to play with her. She enters the \"White Rabbit's tiny House,\" but suddenly resumes her normal size. In order to get out, she has to use the \"magic fan.\""
|
||||
@ -30,7 +51,14 @@
|
||||
"type": "NarrativeText",
|
||||
"element_id": "58ec505c394f8af4fc5c62bad6973652",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"data_source": {
|
||||
"version": 1,
|
||||
"record_locator": {
|
||||
"url": "http://localhost:9200",
|
||||
"index_name": "movies",
|
||||
"document_id": "0"
|
||||
}
|
||||
},
|
||||
"filetype": "text/plain"
|
||||
},
|
||||
"text": "She enters a kitchen, in which there is a cook and a woman holding a baby. She persuades the woman to give her the child and takes the infant outside after the cook starts throwing things around. The baby then turns into a pig and squirms out of her grip. \"The Duchess's Cheshire Cat\" appears and disappears a couple of times to Alice and directs her to the Mad Hatter's \"Mad Tea-Party.\" After a while, she leaves."
|
||||
@ -39,7 +67,14 @@
|
||||
"type": "NarrativeText",
|
||||
"element_id": "fffac28d27f8cea00e96f1e876a1d1f8",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"data_source": {
|
||||
"version": 1,
|
||||
"record_locator": {
|
||||
"url": "http://localhost:9200",
|
||||
"index_name": "movies",
|
||||
"document_id": "0"
|
||||
}
|
||||
},
|
||||
"filetype": "text/plain"
|
||||
},
|
||||
"text": "The Queen invites Alice to join the \"ROYAL PROCESSION\": a parade of marching playing cards and others headed by the White Rabbit. When Alice \"unintentionally offends the Queen\", the latter summons the \"Executioner\". Alice \"boxes the ears\", then flees when all the playing cards come for her. Then she wakes up and realizes it was all a dream."
|
||||
|
@ -3,7 +3,14 @@
|
||||
"type": "Title",
|
||||
"element_id": "0baaf6f25d1c5da82d9e4a3229cd45bd",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"data_source": {
|
||||
"version": 1,
|
||||
"record_locator": {
|
||||
"url": "http://localhost:9200",
|
||||
"index_name": "movies",
|
||||
"document_id": "1"
|
||||
}
|
||||
},
|
||||
"filetype": "text/plain"
|
||||
},
|
||||
"text": "American"
|
||||
@ -12,7 +19,14 @@
|
||||
"type": "Title",
|
||||
"element_id": "576608bb13aa67420e79d575e0e26071",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"data_source": {
|
||||
"version": 1,
|
||||
"record_locator": {
|
||||
"url": "http://localhost:9200",
|
||||
"index_name": "movies",
|
||||
"document_id": "1"
|
||||
}
|
||||
},
|
||||
"filetype": "text/plain"
|
||||
},
|
||||
"text": "Wallace McCutcheon and Ediwin S. Porter"
|
||||
@ -21,7 +35,14 @@
|
||||
"type": "NarrativeText",
|
||||
"element_id": "d30707943c5b8e45088e21b0a9ba6f1a",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"data_source": {
|
||||
"version": 1,
|
||||
"record_locator": {
|
||||
"url": "http://localhost:9200",
|
||||
"index_name": "movies",
|
||||
"document_id": "1"
|
||||
}
|
||||
},
|
||||
"filetype": "text/plain"
|
||||
},
|
||||
"text": "Boone's daughter befriends an Indian maiden as Boone and his companion start out on a hunting expedition. While he is away, Boone's cabin is attacked by the Indians, who set it on fire and abduct Boone's daughter. Boone returns, swears vengeance, then heads out on the trail to the Indian camp. His daughter escapes but is chased. The Indians encounter Boone, which sets off a huge fight on the edge of a cliff. A burning arrow gets shot into the Indian camp. Boone gets tied to the stake and tortured. The burning arrow sets the Indian camp on fire, causing panic. Boone is rescued by his horse, and Boone has a knife fight in which he kills the Indian chief. [2]"
|
||||
|
@ -3,7 +3,14 @@
|
||||
"type": "Title",
|
||||
"element_id": "0baaf6f25d1c5da82d9e4a3229cd45bd",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"data_source": {
|
||||
"version": 1,
|
||||
"record_locator": {
|
||||
"url": "http://localhost:9200",
|
||||
"index_name": "movies",
|
||||
"document_id": "2"
|
||||
}
|
||||
},
|
||||
"filetype": "text/plain"
|
||||
},
|
||||
"text": "American"
|
||||
@ -12,7 +19,14 @@
|
||||
"type": "Title",
|
||||
"element_id": "b764cdc0eab7137467211272fa539f12",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"data_source": {
|
||||
"version": 1,
|
||||
"record_locator": {
|
||||
"url": "http://localhost:9200",
|
||||
"index_name": "movies",
|
||||
"document_id": "2"
|
||||
}
|
||||
},
|
||||
"filetype": "text/plain"
|
||||
},
|
||||
"text": "Unknown"
|
||||
@ -21,7 +35,14 @@
|
||||
"type": "NarrativeText",
|
||||
"element_id": "2659129a67b301911027d0ea747109e4",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"data_source": {
|
||||
"version": 1,
|
||||
"record_locator": {
|
||||
"url": "http://localhost:9200",
|
||||
"index_name": "movies",
|
||||
"document_id": "2"
|
||||
}
|
||||
},
|
||||
"filetype": "text/plain"
|
||||
},
|
||||
"text": "Before heading out to a baseball game at a nearby ballpark, sports fan Mr. Brown drinks several highball cocktails. He arrives at the ballpark to watch the game, but has become so inebriated that the game appears to him in reverse, with the players running the bases backwards and the baseball flying back into the pitcher's hand. After the game is over, Mr. Brown is escorted home by one of his friends. When they arrive at Brown's house, they encounter his wife who becomes furious with the friend and proceeds to physically assault him, believing he is responsible for her husband's severe intoxication. [1]"
|
||||
|
@ -3,7 +3,14 @@
|
||||
"type": "Title",
|
||||
"element_id": "0baaf6f25d1c5da82d9e4a3229cd45bd",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"data_source": {
|
||||
"version": 1,
|
||||
"record_locator": {
|
||||
"url": "http://localhost:9200",
|
||||
"index_name": "movies",
|
||||
"document_id": "3"
|
||||
}
|
||||
},
|
||||
"filetype": "text/plain"
|
||||
},
|
||||
"text": "American"
|
||||
@ -12,7 +19,14 @@
|
||||
"type": "Title",
|
||||
"element_id": "9496aba3ea633310e2d669820269ad00",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"data_source": {
|
||||
"version": 1,
|
||||
"record_locator": {
|
||||
"url": "http://localhost:9200",
|
||||
"index_name": "movies",
|
||||
"document_id": "3"
|
||||
}
|
||||
},
|
||||
"filetype": "text/plain"
|
||||
},
|
||||
"text": "Edwin Stanton Porter"
|
||||
@ -21,7 +35,14 @@
|
||||
"type": "NarrativeText",
|
||||
"element_id": "6ead4aca7b509813147c41699dd1a7d4",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"data_source": {
|
||||
"version": 1,
|
||||
"record_locator": {
|
||||
"url": "http://localhost:9200",
|
||||
"index_name": "movies",
|
||||
"document_id": "3"
|
||||
}
|
||||
},
|
||||
"filetype": "text/plain"
|
||||
},
|
||||
"text": "The plot is that of a black woman going to the dentist for a toothache and being given laughing gas. On her way walking home, and in other situations, she can't stop laughing, and everyone she meets \"catches\" the laughter from her, including a vendor and police officers."
|
||||
|
@ -3,7 +3,14 @@
|
||||
"type": "Title",
|
||||
"element_id": "0baaf6f25d1c5da82d9e4a3229cd45bd",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"data_source": {
|
||||
"version": 1,
|
||||
"record_locator": {
|
||||
"url": "http://localhost:9200",
|
||||
"index_name": "movies",
|
||||
"document_id": "4"
|
||||
}
|
||||
},
|
||||
"filetype": "text/plain"
|
||||
},
|
||||
"text": "American"
|
||||
@ -12,7 +19,14 @@
|
||||
"type": "Title",
|
||||
"element_id": "9abdc842ab4bacbaa4da45cec2ef7e0d",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"data_source": {
|
||||
"version": 1,
|
||||
"record_locator": {
|
||||
"url": "http://localhost:9200",
|
||||
"index_name": "movies",
|
||||
"document_id": "4"
|
||||
}
|
||||
},
|
||||
"filetype": "text/plain"
|
||||
},
|
||||
"text": "D. W. Griffith"
|
||||
@ -21,7 +35,14 @@
|
||||
"type": "NarrativeText",
|
||||
"element_id": "b8004022d0994669c5fdb4ec8a5088a9",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"data_source": {
|
||||
"version": 1,
|
||||
"record_locator": {
|
||||
"url": "http://localhost:9200",
|
||||
"index_name": "movies",
|
||||
"document_id": "4"
|
||||
}
|
||||
},
|
||||
"filetype": "text/plain"
|
||||
},
|
||||
"text": "On a beautiful summer day a father and mother take their daughter Dollie on an outing to the river. The mother refuses to buy a gypsy's wares. The gypsy tries to rob the mother, but the father drives him off. The gypsy returns to the camp and devises a plan. They return and kidnap Dollie while her parents are distracted. A rescue crew is organized, but the gypsy takes Dollie to his camp. They gag Dollie and hide her in a barrel before the rescue party gets to the camp. Once they leave the gypsies and escapes in their wagon. As the wagon crosses the river, the barrel falls into the water. Still sealed in the barrel, Dollie is swept downstream in dangerous currents. A boy who is fishing in the river finds the barrel, and Dollie is reunited safely with her parents."
|
||||
|
@ -3,7 +3,14 @@
|
||||
"type": "Title",
|
||||
"element_id": "0baaf6f25d1c5da82d9e4a3229cd45bd",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"data_source": {
|
||||
"version": 1,
|
||||
"record_locator": {
|
||||
"url": "http://localhost:9200",
|
||||
"index_name": "movies",
|
||||
"document_id": "5"
|
||||
}
|
||||
},
|
||||
"filetype": "text/plain"
|
||||
},
|
||||
"text": "American"
|
||||
@ -12,7 +19,14 @@
|
||||
"type": "Title",
|
||||
"element_id": "9abdc842ab4bacbaa4da45cec2ef7e0d",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"data_source": {
|
||||
"version": 1,
|
||||
"record_locator": {
|
||||
"url": "http://localhost:9200",
|
||||
"index_name": "movies",
|
||||
"document_id": "5"
|
||||
}
|
||||
},
|
||||
"filetype": "text/plain"
|
||||
},
|
||||
"text": "D. W. Griffith"
|
||||
@ -21,7 +35,14 @@
|
||||
"type": "NarrativeText",
|
||||
"element_id": "8e16a508f3df737af12e84d9cba2c7d0",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"data_source": {
|
||||
"version": 1,
|
||||
"record_locator": {
|
||||
"url": "http://localhost:9200",
|
||||
"index_name": "movies",
|
||||
"document_id": "5"
|
||||
}
|
||||
},
|
||||
"filetype": "text/plain"
|
||||
},
|
||||
"text": "A thug accosts a girl as she leaves her workplace but a man rescues her. The thug vows revenge and, with the help of two friends, attacks the girl and her rescuer again as they're going for a walk. This time they succeed in kidnapping the rescuer. He is bound and gagged and taken away in a cart. The girl runs home and gets help from several neighbors. They track the ruffians down to a cabin in the mountains where the gang has trapped their victim and set the cabin on fire. A thug and Rescuer fight on the roof of the house."
|
||||
|
@ -3,7 +3,14 @@
|
||||
"type": "Title",
|
||||
"element_id": "0baaf6f25d1c5da82d9e4a3229cd45bd",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"data_source": {
|
||||
"version": 1,
|
||||
"record_locator": {
|
||||
"url": "http://localhost:9200",
|
||||
"index_name": "movies",
|
||||
"document_id": "6"
|
||||
}
|
||||
},
|
||||
"filetype": "text/plain"
|
||||
},
|
||||
"text": "American"
|
||||
@ -12,7 +19,14 @@
|
||||
"type": "Title",
|
||||
"element_id": "2ac7798b427181278fb2b450e28f4902",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"data_source": {
|
||||
"version": 1,
|
||||
"record_locator": {
|
||||
"url": "http://localhost:9200",
|
||||
"index_name": "movies",
|
||||
"document_id": "6"
|
||||
}
|
||||
},
|
||||
"filetype": "text/plain"
|
||||
},
|
||||
"text": "D.W. Griffith"
|
||||
@ -21,7 +35,14 @@
|
||||
"type": "NarrativeText",
|
||||
"element_id": "9e92dee6e0d6ef246f51d7f8f4eb8c01",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"data_source": {
|
||||
"version": 1,
|
||||
"record_locator": {
|
||||
"url": "http://localhost:9200",
|
||||
"index_name": "movies",
|
||||
"document_id": "6"
|
||||
}
|
||||
},
|
||||
"filetype": "text/plain"
|
||||
},
|
||||
"text": "A young couple decides to elope after being caught in the midst of a romantic moment by the woman's angry father. They make plans to leave, but a thief discovers their plans and hides in their trunk and waits for the right moment to steal their belongings."
|
||||
|
@ -3,7 +3,14 @@
|
||||
"type": "Title",
|
||||
"element_id": "0baaf6f25d1c5da82d9e4a3229cd45bd",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"data_source": {
|
||||
"version": 1,
|
||||
"record_locator": {
|
||||
"url": "http://localhost:9200",
|
||||
"index_name": "movies",
|
||||
"document_id": "7"
|
||||
}
|
||||
},
|
||||
"filetype": "text/plain"
|
||||
},
|
||||
"text": "American"
|
||||
@ -12,7 +19,14 @@
|
||||
"type": "Title",
|
||||
"element_id": "9abdc842ab4bacbaa4da45cec2ef7e0d",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"data_source": {
|
||||
"version": 1,
|
||||
"record_locator": {
|
||||
"url": "http://localhost:9200",
|
||||
"index_name": "movies",
|
||||
"document_id": "7"
|
||||
}
|
||||
},
|
||||
"filetype": "text/plain"
|
||||
},
|
||||
"text": "D. W. Griffith"
|
||||
@ -21,7 +35,14 @@
|
||||
"type": "NarrativeText",
|
||||
"element_id": "d366dfc3239f22e3c03ee629f6567a68",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"data_source": {
|
||||
"version": 1,
|
||||
"record_locator": {
|
||||
"url": "http://localhost:9200",
|
||||
"index_name": "movies",
|
||||
"document_id": "7"
|
||||
}
|
||||
},
|
||||
"filetype": "text/plain"
|
||||
},
|
||||
"text": "A white girl (Florence Lawrence) rejects a proposal from an Indian brave (Charles Inslee) in this early one-reel Western melodrama. Despite the rejection, the Indian still comes to the girl's defense when she is abducted by his warring tribe. In her first year in films, Florence Lawrence was already the most popular among the Biograph Company's anonymous stock company players. By 1909, she was known the world over as \"The Biograph Girl.\""
|
||||
|
@ -3,7 +3,14 @@
|
||||
"type": "Title",
|
||||
"element_id": "0baaf6f25d1c5da82d9e4a3229cd45bd",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"data_source": {
|
||||
"version": 1,
|
||||
"record_locator": {
|
||||
"url": "http://localhost:9200",
|
||||
"index_name": "movies",
|
||||
"document_id": "8"
|
||||
}
|
||||
},
|
||||
"filetype": "text/plain"
|
||||
},
|
||||
"text": "American"
|
||||
@ -12,7 +19,14 @@
|
||||
"type": "Title",
|
||||
"element_id": "b764cdc0eab7137467211272fa539f12",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"data_source": {
|
||||
"version": 1,
|
||||
"record_locator": {
|
||||
"url": "http://localhost:9200",
|
||||
"index_name": "movies",
|
||||
"document_id": "8"
|
||||
}
|
||||
},
|
||||
"filetype": "text/plain"
|
||||
},
|
||||
"text": "Unknown"
|
||||
@ -21,7 +35,14 @@
|
||||
"type": "NarrativeText",
|
||||
"element_id": "7ddfc82896f749f2c5b5c5baac5a93bf",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"data_source": {
|
||||
"version": 1,
|
||||
"record_locator": {
|
||||
"url": "http://localhost:9200",
|
||||
"index_name": "movies",
|
||||
"document_id": "8"
|
||||
}
|
||||
},
|
||||
"filetype": "text/plain"
|
||||
},
|
||||
"text": "No prints of the first American film adaptation of A Christmas Carol are known to exist,[1] but The Moving Picture World magazine provided a scene-by-scene description before the film's release. [2] Scrooge goes into his office and begins working. His nephew, along with three women who wish for Scrooge to donate enter. However, Scrooge dismisses them. On the night of Christmas Eve, his long-dead partner Jacob Marley comes as a ghost, warning him of a horrible fate if he does not change his ways. Scrooge meets three spirits that show Scrooge the real meaning of Christmas, along with his grave, the result of his parsimonious ways. The next morning, he wakes and realizes the error of his ways. Scrooge was then euphoric and generous for the rest of his life."
|
||||
|
@ -3,7 +3,14 @@
|
||||
"type": "Title",
|
||||
"element_id": "0baaf6f25d1c5da82d9e4a3229cd45bd",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"data_source": {
|
||||
"version": 1,
|
||||
"record_locator": {
|
||||
"url": "http://localhost:9200",
|
||||
"index_name": "movies",
|
||||
"document_id": "9"
|
||||
}
|
||||
},
|
||||
"filetype": "text/plain"
|
||||
},
|
||||
"text": "American"
|
||||
@ -12,7 +19,14 @@
|
||||
"type": "Title",
|
||||
"element_id": "9abdc842ab4bacbaa4da45cec2ef7e0d",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"data_source": {
|
||||
"version": 1,
|
||||
"record_locator": {
|
||||
"url": "http://localhost:9200",
|
||||
"index_name": "movies",
|
||||
"document_id": "9"
|
||||
}
|
||||
},
|
||||
"filetype": "text/plain"
|
||||
},
|
||||
"text": "D. W. Griffith"
|
||||
@ -21,7 +35,14 @@
|
||||
"type": "NarrativeText",
|
||||
"element_id": "b87d0bbbe5c735bca621fc172fc44605",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"data_source": {
|
||||
"version": 1,
|
||||
"record_locator": {
|
||||
"url": "http://localhost:9200",
|
||||
"index_name": "movies",
|
||||
"document_id": "9"
|
||||
}
|
||||
},
|
||||
"filetype": "text/plain"
|
||||
},
|
||||
"text": "The film opens in a town on the Mexican border. A poker game is going on in the local saloon. One of the players cheats and is shot dead by another of the players, a Mexican named Pedro. In the uproar that follows Pedro is wounded as he escapes from the saloon. The sheriff is called, who tracks Pedro to his home but Pedro kills the sherriff too. While Pedro hides, his wife Juanita, is arrested on suspicion of murdering the sheriff. Pedro rescues her from the town jail and the two head for the Mexican border. Caught by the posse before they reach the border, Juanita is killed and the film ends with Pedro being arrested and taken back to town."
|
||||
|
@ -3,7 +3,16 @@
|
||||
"type": "Title",
|
||||
"element_id": "7e8cd2056da73a7fefb6cd91f4e5d199",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"data_source": {
|
||||
"url": "https://drive.google.com/uc?id=117qrVqiCoR5EjYMsDHGdy3UMkEtKr9Q8&export=download",
|
||||
"version": "15",
|
||||
"record_locator": {
|
||||
"drive_id": "1OQZ66OHBE30rNsNa7dweGLfRmXvkT_jr",
|
||||
"file_id": "117qrVqiCoR5EjYMsDHGdy3UMkEtKr9Q8"
|
||||
},
|
||||
"date_created": "2023-06-15T06:15:58.931000",
|
||||
"date_modified": "2023-06-15T06:15:44"
|
||||
},
|
||||
"filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
"emphasized_text_contents": [
|
||||
"Title"
|
||||
@ -18,7 +27,16 @@
|
||||
"type": "NarrativeText",
|
||||
"element_id": "9870998df89c1da4e01378d0fd085106",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"data_source": {
|
||||
"url": "https://drive.google.com/uc?id=117qrVqiCoR5EjYMsDHGdy3UMkEtKr9Q8&export=download",
|
||||
"version": "15",
|
||||
"record_locator": {
|
||||
"drive_id": "1OQZ66OHBE30rNsNa7dweGLfRmXvkT_jr",
|
||||
"file_id": "117qrVqiCoR5EjYMsDHGdy3UMkEtKr9Q8"
|
||||
},
|
||||
"date_created": "2023-06-15T06:15:58.931000",
|
||||
"date_modified": "2023-06-15T06:15:44"
|
||||
},
|
||||
"filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
"emphasized_text_contents": [
|
||||
"This is a good reason to continue"
|
||||
|
@ -3,7 +3,16 @@
|
||||
"type": "Title",
|
||||
"element_id": "dd14cbbf0e74909aac7f248a85d190af",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"data_source": {
|
||||
"url": "https://drive.google.com/uc?id=1SpQuE7jHz9nMt5hfQXsiok1SgIdRYX5o&export=download",
|
||||
"version": "17",
|
||||
"record_locator": {
|
||||
"drive_id": "1OQZ66OHBE30rNsNa7dweGLfRmXvkT_jr",
|
||||
"file_id": "1SpQuE7jHz9nMt5hfQXsiok1SgIdRYX5o"
|
||||
},
|
||||
"date_created": "2023-06-15T06:15:59.687000",
|
||||
"date_modified": "2023-06-15T06:15:43"
|
||||
},
|
||||
"filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
|
||||
},
|
||||
"text": "Lorem ipsum dolor sit amet."
|
||||
|
@ -3,7 +3,16 @@
|
||||
"type": "Title",
|
||||
"element_id": "8b5b9db0c13db24256c829aa364aa90c",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"data_source": {
|
||||
"url": "https://drive.google.com/uc?id=1cTKXAreuj-wYmL38nFnqKvz3X8UKcaMC&export=download",
|
||||
"version": "9",
|
||||
"record_locator": {
|
||||
"drive_id": "1OQZ66OHBE30rNsNa7dweGLfRmXvkT_jr",
|
||||
"file_id": "1cTKXAreuj-wYmL38nFnqKvz3X8UKcaMC"
|
||||
},
|
||||
"date_created": "2023-06-15T06:15:59.687000",
|
||||
"date_modified": "2023-06-15T06:15:39"
|
||||
},
|
||||
"filetype": "text/plain"
|
||||
},
|
||||
"text": "three"
|
||||
|
@ -1 +1 @@
|
||||
__version__ = "0.10.16-dev1" # pragma: no cover
|
||||
__version__ = "0.10.16-dev2" # pragma: no cover
|
||||
|
@ -12,6 +12,7 @@ from unstructured.ingest.interfaces import (
|
||||
BaseSourceConnector,
|
||||
IngestDocCleanupMixin,
|
||||
SourceConnectorCleanupMixin,
|
||||
SourceMetadata,
|
||||
)
|
||||
from unstructured.ingest.logger import logger
|
||||
from unstructured.utils import requires_dependencies
|
||||
@ -33,7 +34,7 @@ class SimpleElasticsearchConfig(BaseConnectorConfig):
|
||||
|
||||
|
||||
@dataclass
|
||||
class ElasticsearchFileMeta:
|
||||
class ElasticsearchDocumentMeta:
|
||||
"""Metadata specifying:
|
||||
name of the elasticsearch index that is being reached to,
|
||||
and the id of document that is being reached to,
|
||||
@ -53,7 +54,7 @@ class ElasticsearchIngestDoc(IngestDocCleanupMixin, BaseIngestDoc):
|
||||
"""
|
||||
|
||||
connector_config: SimpleElasticsearchConfig
|
||||
file_meta: ElasticsearchFileMeta
|
||||
document_meta: ElasticsearchDocumentMeta
|
||||
registry_name: str = "elasticsearch"
|
||||
|
||||
# TODO: remove one of filename or _tmp_download_file, using a wrapper
|
||||
@ -61,8 +62,8 @@ class ElasticsearchIngestDoc(IngestDocCleanupMixin, BaseIngestDoc):
|
||||
def filename(self):
|
||||
return (
|
||||
Path(self.read_config.download_dir)
|
||||
/ self.file_meta.index_name
|
||||
/ f"{self.file_meta.document_id}.txt"
|
||||
/ self.document_meta.index_name
|
||||
/ f"{self.document_meta.document_id}.txt"
|
||||
).resolve()
|
||||
|
||||
@property
|
||||
@ -71,7 +72,7 @@ class ElasticsearchIngestDoc(IngestDocCleanupMixin, BaseIngestDoc):
|
||||
the output file."""
|
||||
# Generate SHA256 hash and take the first 8 characters
|
||||
query_hash = hashlib.sha256((self.connector_config.jq_query or "").encode()).hexdigest()[:8]
|
||||
output_file = f"{self.file_meta.document_id}-{query_hash}.json"
|
||||
output_file = f"{self.document_meta.document_id}-{query_hash}.json"
|
||||
return (
|
||||
Path(self.partition_config.output_dir) / self.connector_config.index_name / output_file
|
||||
)
|
||||
@ -104,21 +105,50 @@ class ElasticsearchIngestDoc(IngestDocCleanupMixin, BaseIngestDoc):
|
||||
concatenated_values = seperator.join(values)
|
||||
return concatenated_values
|
||||
|
||||
@requires_dependencies(["elasticsearch"], extras="elasticsearch")
def _get_document(self):
    """Fetch this doc's record from the configured Elasticsearch index.

    Returns:
        The response of ``Elasticsearch.get`` for
        ``connector_config.index_name`` and ``document_meta.document_id``,
        or ``None`` (after logging an error) when the client raises
        ``NotFoundError``.
    """
    from elasticsearch import Elasticsearch, NotFoundError

    doc_id = self.document_meta.document_id
    try:
        # TODO: instead of having a separate client for each doc,
        # have a separate client for each process
        client = Elasticsearch(self.connector_config.url)
        response = client.get(
            index=self.connector_config.index_name,
            id=doc_id,
        )
    except NotFoundError:
        logger.error("Couldn't find document with ID: %s", doc_id)
        return None
    return response
|
||||
|
||||
def update_source_metadata(self, **kwargs):
    """Populate ``self.source_metadata`` from the Elasticsearch document.

    Args:
        **kwargs: May carry ``document``, an already-fetched response from
            ``_get_document()`` (or ``None`` when that fetch found nothing).
            When the key is absent the document is fetched here.

    Sets ``source_metadata`` to ``SourceMetadata(exists=False)`` when there is
    no document; otherwise records the document's ``_version`` and ``found``
    fields.
    """
    # Only query Elasticsearch when the caller did not supply a document.
    # ``kwargs.get("document", self._get_document())`` evaluates its default
    # eagerly, which issued a redundant request on every call from get_file().
    if "document" in kwargs:
        document = kwargs["document"]
    else:
        document = self._get_document()

    if document is None:
        self.source_metadata = SourceMetadata(
            exists=False,
        )
        return

    self.source_metadata = SourceMetadata(
        version=document["_version"],
        exists=document["found"],
    )
|
||||
|
||||
@SourceConnectionError.wrap
|
||||
@requires_dependencies(["elasticsearch", "jq"], extras="elasticsearch")
|
||||
@requires_dependencies(["jq"], extras="elasticsearch")
|
||||
@BaseIngestDoc.skip_if_file_exists
|
||||
def get_file(self):
|
||||
import jq
|
||||
from elasticsearch import Elasticsearch
|
||||
|
||||
logger.debug(f"Fetching {self} - PID: {os.getpid()}")
|
||||
# TODO: instead of having a separate client for each doc,
|
||||
# have a separate client for each process
|
||||
es = Elasticsearch(self.connector_config.url)
|
||||
document_dict = es.get(
|
||||
index=self.connector_config.index_name,
|
||||
id=self.file_meta.document_id,
|
||||
).body["_source"]
|
||||
document = self._get_document()
|
||||
self.update_source_metadata(document=document)
|
||||
if document is None:
|
||||
raise ValueError(
|
||||
f"Failed to get document {self.document_meta.document_id}",
|
||||
)
|
||||
|
||||
document_dict = document.body["_source"]
|
||||
if self.connector_config.jq_query:
|
||||
document_dict = json.loads(
|
||||
jq.compile(self.connector_config.jq_query).input(document_dict).text(),
|
||||
@ -128,6 +158,26 @@ class ElasticsearchIngestDoc(IngestDocCleanupMixin, BaseIngestDoc):
|
||||
with open(self.filename, "w", encoding="utf8") as f:
|
||||
f.write(self.document)
|
||||
|
||||
@property
def date_created(self) -> t.Optional[str]:
    """Creation date of the source document; this connector always reports ``None``."""
    return None
|
||||
|
||||
@property
def date_modified(self) -> t.Optional[str]:
    """Last-modified date of the source document; this connector always reports ``None``."""
    return None
|
||||
|
||||
@property
def source_url(self) -> t.Optional[str]:
    """URL of the source document; this connector always reports ``None``."""
    return None
|
||||
|
||||
@property
def record_locator(self) -> t.Optional[t.Dict[str, t.Any]]:
    """Return the coordinates that identify this document in Elasticsearch.

    The mapping holds the cluster ``url`` and ``index_name`` from the
    connector config plus this doc's ``document_id``, in that key order.
    """
    config = self.connector_config
    locator: t.Dict[str, t.Any] = {}
    locator["url"] = config.url
    locator["index_name"] = config.index_name
    locator["document_id"] = self.document_meta.document_id
    return locator
|
||||
|
||||
|
||||
@dataclass
|
||||
class ElasticsearchSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector):
|
||||
@ -166,7 +216,7 @@ class ElasticsearchSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnec
|
||||
connector_config=self.connector_config,
|
||||
partition_config=self.partition_config,
|
||||
read_config=self.read_config,
|
||||
file_meta=ElasticsearchFileMeta(self.connector_config.index_name, id),
|
||||
document_meta=ElasticsearchDocumentMeta(self.connector_config.index_name, id),
|
||||
)
|
||||
for id in ids
|
||||
]
|
||||
|
@ -3,6 +3,7 @@ import json
|
||||
import os
|
||||
import typing as t
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime
|
||||
from mimetypes import guess_extension
|
||||
from pathlib import Path
|
||||
|
||||
@ -18,6 +19,7 @@ from unstructured.ingest.interfaces import (
|
||||
IngestDocCleanupMixin,
|
||||
IngestDocSessionHandleMixin,
|
||||
SourceConnectorCleanupMixin,
|
||||
SourceMetadata,
|
||||
)
|
||||
from unstructured.ingest.logger import logger
|
||||
from unstructured.utils import requires_dependencies
|
||||
@ -103,57 +105,109 @@ class SimpleGoogleDriveConfig(ConfigSessionHandleMixin, BaseConnectorConfig):
|
||||
@dataclass
|
||||
class GoogleDriveIngestDoc(IngestDocSessionHandleMixin, IngestDocCleanupMixin, BaseIngestDoc):
|
||||
connector_config: SimpleGoogleDriveConfig
|
||||
file_meta: t.Dict[str, str]
|
||||
meta: t.Dict[str, str]
|
||||
registry_name: str = "google_drive"
|
||||
|
||||
@property
|
||||
def filename(self):
|
||||
return Path(self.file_meta.get("download_filepath")).resolve() # type: ignore
|
||||
return Path(self.meta.get("download_filepath")).resolve() # type: ignore
|
||||
|
||||
@property
|
||||
def _output_filename(self):
|
||||
return Path(f"{self.file_meta.get('output_filepath')}.json").resolve()
|
||||
return Path(f"{self.meta.get('output_filepath')}.json").resolve()
|
||||
|
||||
@property
def record_locator(self) -> t.Optional[t.Dict[str, t.Any]]:
    """Return the identifiers that locate this file in Google Drive.

    The mapping holds ``drive_id`` from the connector config followed by
    ``file_id`` taken from this doc's ``meta["id"]``.
    """
    locator: t.Dict[str, t.Any] = {}
    locator["drive_id"] = self.connector_config.drive_id
    locator["file_id"] = self.meta["id"]
    return locator
|
||||
|
||||
@requires_dependencies(["googleapiclient"], extras="google-drive")
def update_source_metadata(self):
    """Populate ``self.source_metadata`` from the Drive file's metadata.

    Requests ``createdTime``, ``modifiedTime``, ``version`` and
    ``webContentLink`` for ``self.meta["id"]`` through the session's Drive
    service. If the API answers 404 the file is recorded as non-existent;
    any other ``HttpError`` is re-raised.

    Raises:
        googleapiclient.errors.HttpError: for any API failure other than 404.
    """
    from googleapiclient.errors import HttpError

    try:
        file_obj = (
            self.session_handle.service.files()
            .get(
                fileId=self.meta["id"],
                fields="id, createdTime, modifiedTime, version, webContentLink",
            )
            .execute()
        )
    except HttpError as e:
        if e.status_code == 404:
            logger.error(f"File {self.meta['name']} not found")
            # A 404 means the file is gone, so it must be reported as
            # non-existent (this branch previously recorded exists=True).
            self.source_metadata = SourceMetadata(
                exists=False,
            )
            return
        raise

    # Timestamps are parsed with a fixed "...S.fZ" layout and re-emitted via
    # isoformat(); a missing or empty field leaves the date as None.
    date_created = None
    if dc := file_obj.get("createdTime", ""):
        date_created = datetime.strptime(
            dc,
            "%Y-%m-%dT%H:%M:%S.%fZ",
        ).isoformat()

    date_modified = None
    if dm := file_obj.get("modifiedTime", ""):
        date_modified = datetime.strptime(
            dm,
            "%Y-%m-%dT%H:%M:%S.%fZ",
        ).isoformat()

    self.source_metadata = SourceMetadata(
        date_created=date_created,
        date_modified=date_modified,
        version=file_obj.get("version", ""),
        source_url=file_obj.get("webContentLink", ""),
        exists=True,
    )
|
||||
|
||||
@requires_dependencies(["googleapiclient"], extras="google-drive")
|
||||
@SourceConnectionError.wrap
|
||||
@BaseIngestDoc.skip_if_file_exists
|
||||
@requires_dependencies(["googleapiclient"], extras="google-drive")
|
||||
def get_file(self):
|
||||
from googleapiclient.errors import HttpError
|
||||
from googleapiclient.http import MediaIoBaseDownload
|
||||
|
||||
if self.file_meta.get("mimeType", "").startswith("application/vnd.google-apps"):
|
||||
if self.meta.get("mimeType", "").startswith("application/vnd.google-apps"):
|
||||
export_mime = GOOGLE_DRIVE_EXPORT_TYPES.get(
|
||||
self.file_meta.get("mimeType"), # type: ignore
|
||||
self.meta.get("mimeType"), # type: ignore
|
||||
)
|
||||
if not export_mime:
|
||||
logger.info(
|
||||
f"File not supported. Name: {self.file_meta.get('name')} "
|
||||
f"ID: {self.file_meta.get('id')} "
|
||||
f"MimeType: {self.file_meta.get('mimeType')}",
|
||||
f"File not supported. Name: {self.meta.get('name')} "
|
||||
f"ID: {self.meta.get('id')} "
|
||||
f"MimeType: {self.meta.get('mimeType')}",
|
||||
)
|
||||
return
|
||||
|
||||
request = self.session_handle.service.files().export_media(
|
||||
fileId=self.file_meta.get("id"),
|
||||
fileId=self.meta.get("id"),
|
||||
mimeType=export_mime,
|
||||
)
|
||||
else:
|
||||
request = self.session_handle.service.files().get_media(fileId=self.file_meta.get("id"))
|
||||
request = self.session_handle.service.files().get_media(fileId=self.meta.get("id"))
|
||||
file = io.BytesIO()
|
||||
downloader = MediaIoBaseDownload(file, request)
|
||||
self.update_source_metadata()
|
||||
downloaded = False
|
||||
try:
|
||||
while downloaded is False:
|
||||
status, downloaded = downloader.next_chunk()
|
||||
_, downloaded = downloader.next_chunk()
|
||||
except HttpError:
|
||||
pass
|
||||
|
||||
saved = False
|
||||
if downloaded and file:
|
||||
dir_ = Path(self.file_meta["download_dir"])
|
||||
dir_ = Path(self.meta["download_dir"])
|
||||
if dir_:
|
||||
if not dir_.is_dir():
|
||||
logger.debug(f"Creating directory: {self.file_meta.get('download_dir')}")
|
||||
logger.debug(f"Creating directory: {self.meta.get('download_dir')}")
|
||||
|
||||
if dir_:
|
||||
dir_.mkdir(parents=True, exist_ok=True)
|
||||
@ -162,7 +216,6 @@ class GoogleDriveIngestDoc(IngestDocSessionHandleMixin, IngestDocCleanupMixin, B
|
||||
handler.write(file.getbuffer())
|
||||
saved = True
|
||||
logger.debug(f"File downloaded: {self.filename}.")
|
||||
|
||||
if not saved:
|
||||
logger.error(f"Error while downloading and saving file: {self.filename}.")
|
||||
|
||||
@ -267,7 +320,7 @@ class GoogleDriveSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnecto
|
||||
connector_config=self.connector_config,
|
||||
partition_config=self.partition_config,
|
||||
read_config=self.read_config,
|
||||
file_meta=file,
|
||||
meta=file,
|
||||
)
|
||||
for file in files
|
||||
]
|
||||
|
@ -10,6 +10,7 @@ from unstructured.ingest.interfaces import (
|
||||
BaseSourceConnector,
|
||||
IngestDocCleanupMixin,
|
||||
SourceConnectorCleanupMixin,
|
||||
SourceMetadata,
|
||||
)
|
||||
from unstructured.ingest.logger import logger
|
||||
from unstructured.utils import requires_dependencies
|
||||
@ -50,15 +51,49 @@ class WikipediaIngestDoc(IngestDocCleanupMixin, BaseIngestDoc):
|
||||
def _output_filename(self):
|
||||
raise NotImplementedError()
|
||||
|
||||
@property
def date_created(self) -> t.Optional[str]:
    """Creation date of the source page; this implementation always yields None."""
    created: t.Optional[str] = None
    return created
|
||||
|
||||
@property
def date_modified(self) -> t.Optional[str]:
    """Modification date of the source page; this implementation always yields None."""
    modified: t.Optional[str] = None
    return modified
|
||||
|
||||
@property
def record_locator(self) -> t.Optional[t.Dict[str, t.Any]]:
    """Return the identifiers that locate this page on Wikipedia.

    ``page_title`` comes from the connector config's ``title`` and
    ``page_url`` from the ``source_url`` of ``self.source_metadata``.
    """
    title = self.connector_config.title
    url = self.source_metadata.source_url  # type: ignore
    return dict(page_title=title, page_url=url)
|
||||
|
||||
def _create_full_tmp_dir_path(self):
    """Create the parent directory of ``self.filename``, including any missing ancestors."""
    parent_dir = self.filename.parent
    parent_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
@requires_dependencies(["wikipedia"], extras="wikipedia")
def update_source_metadata(self):
    """Populate ``self.source_metadata`` from the Wikipedia page.

    When accessing ``self.page`` raises ``PageError`` the metadata only
    records that the page does not exist. Otherwise it carries the page's
    revision id as the version and the page URL as the source URL.
    """
    from wikipedia.exceptions import PageError

    try:
        wiki_page = self.page
    except PageError:
        self.source_metadata = SourceMetadata(exists=False)
        return

    found = SourceMetadata(
        version=wiki_page.revision_id,
        source_url=wiki_page.url,
        exists=True,
    )
    self.source_metadata = found
|
||||
|
||||
@SourceConnectionError.wrap
@BaseIngestDoc.skip_if_file_exists
def get_file(self):
    """Download the page content and write it to ``self.filename`` as UTF-8 text.

    The target directory is created first and the source metadata is
    refreshed before the file is written.
    """
    self._create_full_tmp_dir_path()
    logger.debug(f"Fetching {self} - PID: {os.getpid()}")
    self.update_source_metadata()
    # The file is opened before ``self.text`` is read, as in the original ordering.
    with open(self.filename, mode="w", encoding="utf8") as out_file:
        out_file.write(self.text)
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user