diff --git a/CHANGELOG.md b/CHANGELOG.md index caeb69d41..e0557ab94 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,9 +1,9 @@ -## 0.10.16-dev1 +## 0.10.16-dev2 ### Enhancements -* **Adds data source properties to Airtable, Confluence and Discord connectors** These properties (date_created, date_modified, version, source_url, record_locator) are written to element metadata during ingest, mapping elements to information about the document source from which they derive. This functionality enables downstream applications to reveal source document applications, e.g. a link to a GDrive doc, Salesforce record, etc. +* **Adds data source properties to Airtable, Confluence, Discord, Elasticsearch, Google Drive, and Wikipedia connectors** These properties (date_created, date_modified, version, source_url, record_locator) are written to element metadata during ingest, mapping elements to information about the document source from which they derive. This functionality enables downstream applications to reveal source document applications, e.g. a link to a GDrive doc, Salesforce record, etc. 
### Features diff --git a/test_unstructured_ingest/expected-structured-output/elasticsearch/movies/0-31261b6f.json b/test_unstructured_ingest/expected-structured-output/elasticsearch/movies/0-31261b6f.json index 47c91c622..f5f3ad719 100644 --- a/test_unstructured_ingest/expected-structured-output/elasticsearch/movies/0-31261b6f.json +++ b/test_unstructured_ingest/expected-structured-output/elasticsearch/movies/0-31261b6f.json @@ -3,7 +3,14 @@ "type": "Title", "element_id": "0baaf6f25d1c5da82d9e4a3229cd45bd", "metadata": { - "data_source": {}, + "data_source": { + "version": 1, + "record_locator": { + "url": "http://localhost:9200", + "index_name": "movies", + "document_id": "0" + } + }, "filetype": "text/plain" }, "text": "American" @@ -12,7 +19,14 @@ "type": "Title", "element_id": "0d15a8bf4961deb609a392a8444e3520", "metadata": { - "data_source": {}, + "data_source": { + "version": 1, + "record_locator": { + "url": "http://localhost:9200", + "index_name": "movies", + "document_id": "0" + } + }, "filetype": "text/plain" }, "text": "Cecil Hepworth" @@ -21,7 +35,14 @@ "type": "NarrativeText", "element_id": "a1515877c1c63770057b2615cce25c5d", "metadata": { - "data_source": {}, + "data_source": { + "version": 1, + "record_locator": { + "url": "http://localhost:9200", + "index_name": "movies", + "document_id": "0" + } + }, "filetype": "text/plain" }, "text": "Alice follows a large white rabbit down a \"Rabbit-hole\". She finds a tiny door. When she finds a bottle labeled \"Drink me\", she does, and shrinks, but not enough to pass through the door. She then eats something labeled \"Eat me\" and grows larger. She finds a fan when enables her to shrink enough to get into the \"Garden\" and try to get a \"Dog\" to play with her. She enters the \"White Rabbit's tiny House,\" but suddenly resumes her normal size. 
In order to get out, she has to use the \"magic fan.\"" @@ -30,7 +51,14 @@ "type": "NarrativeText", "element_id": "58ec505c394f8af4fc5c62bad6973652", "metadata": { - "data_source": {}, + "data_source": { + "version": 1, + "record_locator": { + "url": "http://localhost:9200", + "index_name": "movies", + "document_id": "0" + } + }, "filetype": "text/plain" }, "text": "She enters a kitchen, in which there is a cook and a woman holding a baby. She persuades the woman to give her the child and takes the infant outside after the cook starts throwing things around. The baby then turns into a pig and squirms out of her grip. \"The Duchess's Cheshire Cat\" appears and disappears a couple of times to Alice and directs her to the Mad Hatter's \"Mad Tea-Party.\" After a while, she leaves." @@ -39,7 +67,14 @@ "type": "NarrativeText", "element_id": "fffac28d27f8cea00e96f1e876a1d1f8", "metadata": { - "data_source": {}, + "data_source": { + "version": 1, + "record_locator": { + "url": "http://localhost:9200", + "index_name": "movies", + "document_id": "0" + } + }, "filetype": "text/plain" }, "text": "The Queen invites Alice to join the \"ROYAL PROCESSION\": a parade of marching playing cards and others headed by the White Rabbit. When Alice \"unintentionally offends the Queen\", the latter summons the \"Executioner\". Alice \"boxes the ears\", then flees when all the playing cards come for her. Then she wakes up and realizes it was all a dream." 
diff --git a/test_unstructured_ingest/expected-structured-output/elasticsearch/movies/1-31261b6f.json b/test_unstructured_ingest/expected-structured-output/elasticsearch/movies/1-31261b6f.json index a49d451d4..388c6254f 100644 --- a/test_unstructured_ingest/expected-structured-output/elasticsearch/movies/1-31261b6f.json +++ b/test_unstructured_ingest/expected-structured-output/elasticsearch/movies/1-31261b6f.json @@ -3,7 +3,14 @@ "type": "Title", "element_id": "0baaf6f25d1c5da82d9e4a3229cd45bd", "metadata": { - "data_source": {}, + "data_source": { + "version": 1, + "record_locator": { + "url": "http://localhost:9200", + "index_name": "movies", + "document_id": "1" + } + }, "filetype": "text/plain" }, "text": "American" @@ -12,7 +19,14 @@ "type": "Title", "element_id": "576608bb13aa67420e79d575e0e26071", "metadata": { - "data_source": {}, + "data_source": { + "version": 1, + "record_locator": { + "url": "http://localhost:9200", + "index_name": "movies", + "document_id": "1" + } + }, "filetype": "text/plain" }, "text": "Wallace McCutcheon and Ediwin S. Porter" @@ -21,7 +35,14 @@ "type": "NarrativeText", "element_id": "d30707943c5b8e45088e21b0a9ba6f1a", "metadata": { - "data_source": {}, + "data_source": { + "version": 1, + "record_locator": { + "url": "http://localhost:9200", + "index_name": "movies", + "document_id": "1" + } + }, "filetype": "text/plain" }, "text": "Boone's daughter befriends an Indian maiden as Boone and his companion start out on a hunting expedition. While he is away, Boone's cabin is attacked by the Indians, who set it on fire and abduct Boone's daughter. Boone returns, swears vengeance, then heads out on the trail to the Indian camp. His daughter escapes but is chased. The Indians encounter Boone, which sets off a huge fight on the edge of a cliff. A burning arrow gets shot into the Indian camp. Boone gets tied to the stake and tortured. The burning arrow sets the Indian camp on fire, causing panic. 
Boone is rescued by his horse, and Boone has a knife fight in which he kills the Indian chief. [2]" diff --git a/test_unstructured_ingest/expected-structured-output/elasticsearch/movies/2-31261b6f.json b/test_unstructured_ingest/expected-structured-output/elasticsearch/movies/2-31261b6f.json index 0a4290633..b0970e064 100644 --- a/test_unstructured_ingest/expected-structured-output/elasticsearch/movies/2-31261b6f.json +++ b/test_unstructured_ingest/expected-structured-output/elasticsearch/movies/2-31261b6f.json @@ -3,7 +3,14 @@ "type": "Title", "element_id": "0baaf6f25d1c5da82d9e4a3229cd45bd", "metadata": { - "data_source": {}, + "data_source": { + "version": 1, + "record_locator": { + "url": "http://localhost:9200", + "index_name": "movies", + "document_id": "2" + } + }, "filetype": "text/plain" }, "text": "American" @@ -12,7 +19,14 @@ "type": "Title", "element_id": "b764cdc0eab7137467211272fa539f12", "metadata": { - "data_source": {}, + "data_source": { + "version": 1, + "record_locator": { + "url": "http://localhost:9200", + "index_name": "movies", + "document_id": "2" + } + }, "filetype": "text/plain" }, "text": "Unknown" @@ -21,7 +35,14 @@ "type": "NarrativeText", "element_id": "2659129a67b301911027d0ea747109e4", "metadata": { - "data_source": {}, + "data_source": { + "version": 1, + "record_locator": { + "url": "http://localhost:9200", + "index_name": "movies", + "document_id": "2" + } + }, "filetype": "text/plain" }, "text": "Before heading out to a baseball game at a nearby ballpark, sports fan Mr. Brown drinks several highball cocktails. He arrives at the ballpark to watch the game, but has become so inebriated that the game appears to him in reverse, with the players running the bases backwards and the baseball flying back into the pitcher's hand. After the game is over, Mr. Brown is escorted home by one of his friends. 
When they arrive at Brown's house, they encounter his wife who becomes furious with the friend and proceeds to physically assault him, believing he is responsible for her husband's severe intoxication. [1]" diff --git a/test_unstructured_ingest/expected-structured-output/elasticsearch/movies/3-31261b6f.json b/test_unstructured_ingest/expected-structured-output/elasticsearch/movies/3-31261b6f.json index 67d0dd477..cf0aa9098 100644 --- a/test_unstructured_ingest/expected-structured-output/elasticsearch/movies/3-31261b6f.json +++ b/test_unstructured_ingest/expected-structured-output/elasticsearch/movies/3-31261b6f.json @@ -3,7 +3,14 @@ "type": "Title", "element_id": "0baaf6f25d1c5da82d9e4a3229cd45bd", "metadata": { - "data_source": {}, + "data_source": { + "version": 1, + "record_locator": { + "url": "http://localhost:9200", + "index_name": "movies", + "document_id": "3" + } + }, "filetype": "text/plain" }, "text": "American" @@ -12,7 +19,14 @@ "type": "Title", "element_id": "9496aba3ea633310e2d669820269ad00", "metadata": { - "data_source": {}, + "data_source": { + "version": 1, + "record_locator": { + "url": "http://localhost:9200", + "index_name": "movies", + "document_id": "3" + } + }, "filetype": "text/plain" }, "text": "Edwin Stanton Porter" @@ -21,7 +35,14 @@ "type": "NarrativeText", "element_id": "6ead4aca7b509813147c41699dd1a7d4", "metadata": { - "data_source": {}, + "data_source": { + "version": 1, + "record_locator": { + "url": "http://localhost:9200", + "index_name": "movies", + "document_id": "3" + } + }, "filetype": "text/plain" }, "text": "The plot is that of a black woman going to the dentist for a toothache and being given laughing gas. On her way walking home, and in other situations, she can't stop laughing, and everyone she meets \"catches\" the laughter from her, including a vendor and police officers." 
diff --git a/test_unstructured_ingest/expected-structured-output/elasticsearch/movies/4-31261b6f.json b/test_unstructured_ingest/expected-structured-output/elasticsearch/movies/4-31261b6f.json index 14d3459d2..e017d7165 100644 --- a/test_unstructured_ingest/expected-structured-output/elasticsearch/movies/4-31261b6f.json +++ b/test_unstructured_ingest/expected-structured-output/elasticsearch/movies/4-31261b6f.json @@ -3,7 +3,14 @@ "type": "Title", "element_id": "0baaf6f25d1c5da82d9e4a3229cd45bd", "metadata": { - "data_source": {}, + "data_source": { + "version": 1, + "record_locator": { + "url": "http://localhost:9200", + "index_name": "movies", + "document_id": "4" + } + }, "filetype": "text/plain" }, "text": "American" @@ -12,7 +19,14 @@ "type": "Title", "element_id": "9abdc842ab4bacbaa4da45cec2ef7e0d", "metadata": { - "data_source": {}, + "data_source": { + "version": 1, + "record_locator": { + "url": "http://localhost:9200", + "index_name": "movies", + "document_id": "4" + } + }, "filetype": "text/plain" }, "text": "D. W. Griffith" @@ -21,7 +35,14 @@ "type": "NarrativeText", "element_id": "b8004022d0994669c5fdb4ec8a5088a9", "metadata": { - "data_source": {}, + "data_source": { + "version": 1, + "record_locator": { + "url": "http://localhost:9200", + "index_name": "movies", + "document_id": "4" + } + }, "filetype": "text/plain" }, "text": "On a beautiful summer day a father and mother take their daughter Dollie on an outing to the river. The mother refuses to buy a gypsy's wares. The gypsy tries to rob the mother, but the father drives him off. The gypsy returns to the camp and devises a plan. They return and kidnap Dollie while her parents are distracted. A rescue crew is organized, but the gypsy takes Dollie to his camp. They gag Dollie and hide her in a barrel before the rescue party gets to the camp. Once they leave the gypsies and escapes in their wagon. As the wagon crosses the river, the barrel falls into the water. 
Still sealed in the barrel, Dollie is swept downstream in dangerous currents. A boy who is fishing in the river finds the barrel, and Dollie is reunited safely with her parents." diff --git a/test_unstructured_ingest/expected-structured-output/elasticsearch/movies/5-31261b6f.json b/test_unstructured_ingest/expected-structured-output/elasticsearch/movies/5-31261b6f.json index 0fda54d1e..a550de72e 100644 --- a/test_unstructured_ingest/expected-structured-output/elasticsearch/movies/5-31261b6f.json +++ b/test_unstructured_ingest/expected-structured-output/elasticsearch/movies/5-31261b6f.json @@ -3,7 +3,14 @@ "type": "Title", "element_id": "0baaf6f25d1c5da82d9e4a3229cd45bd", "metadata": { - "data_source": {}, + "data_source": { + "version": 1, + "record_locator": { + "url": "http://localhost:9200", + "index_name": "movies", + "document_id": "5" + } + }, "filetype": "text/plain" }, "text": "American" @@ -12,7 +19,14 @@ "type": "Title", "element_id": "9abdc842ab4bacbaa4da45cec2ef7e0d", "metadata": { - "data_source": {}, + "data_source": { + "version": 1, + "record_locator": { + "url": "http://localhost:9200", + "index_name": "movies", + "document_id": "5" + } + }, "filetype": "text/plain" }, "text": "D. W. Griffith" @@ -21,7 +35,14 @@ "type": "NarrativeText", "element_id": "8e16a508f3df737af12e84d9cba2c7d0", "metadata": { - "data_source": {}, + "data_source": { + "version": 1, + "record_locator": { + "url": "http://localhost:9200", + "index_name": "movies", + "document_id": "5" + } + }, "filetype": "text/plain" }, "text": "A thug accosts a girl as she leaves her workplace but a man rescues her. The thug vows revenge and, with the help of two friends, attacks the girl and her rescuer again as they're going for a walk. This time they succeed in kidnapping the rescuer. He is bound and gagged and taken away in a cart. The girl runs home and gets help from several neighbors. 
They track the ruffians down to a cabin in the mountains where the gang has trapped their victim and set the cabin on fire. A thug and Rescuer fight on the roof of the house." diff --git a/test_unstructured_ingest/expected-structured-output/elasticsearch/movies/6-31261b6f.json b/test_unstructured_ingest/expected-structured-output/elasticsearch/movies/6-31261b6f.json index 7a3e20a01..b00d877f3 100644 --- a/test_unstructured_ingest/expected-structured-output/elasticsearch/movies/6-31261b6f.json +++ b/test_unstructured_ingest/expected-structured-output/elasticsearch/movies/6-31261b6f.json @@ -3,7 +3,14 @@ "type": "Title", "element_id": "0baaf6f25d1c5da82d9e4a3229cd45bd", "metadata": { - "data_source": {}, + "data_source": { + "version": 1, + "record_locator": { + "url": "http://localhost:9200", + "index_name": "movies", + "document_id": "6" + } + }, "filetype": "text/plain" }, "text": "American" @@ -12,7 +19,14 @@ "type": "Title", "element_id": "2ac7798b427181278fb2b450e28f4902", "metadata": { - "data_source": {}, + "data_source": { + "version": 1, + "record_locator": { + "url": "http://localhost:9200", + "index_name": "movies", + "document_id": "6" + } + }, "filetype": "text/plain" }, "text": "D.W. Griffith" @@ -21,7 +35,14 @@ "type": "NarrativeText", "element_id": "9e92dee6e0d6ef246f51d7f8f4eb8c01", "metadata": { - "data_source": {}, + "data_source": { + "version": 1, + "record_locator": { + "url": "http://localhost:9200", + "index_name": "movies", + "document_id": "6" + } + }, "filetype": "text/plain" }, "text": "A young couple decides to elope after being caught in the midst of a romantic moment by the woman's angry father. They make plans to leave, but a thief discovers their plans and hides in their trunk and waits for the right moment to steal their belongings." 
diff --git a/test_unstructured_ingest/expected-structured-output/elasticsearch/movies/7-31261b6f.json b/test_unstructured_ingest/expected-structured-output/elasticsearch/movies/7-31261b6f.json index 56b1fb181..ee77cdb96 100644 --- a/test_unstructured_ingest/expected-structured-output/elasticsearch/movies/7-31261b6f.json +++ b/test_unstructured_ingest/expected-structured-output/elasticsearch/movies/7-31261b6f.json @@ -3,7 +3,14 @@ "type": "Title", "element_id": "0baaf6f25d1c5da82d9e4a3229cd45bd", "metadata": { - "data_source": {}, + "data_source": { + "version": 1, + "record_locator": { + "url": "http://localhost:9200", + "index_name": "movies", + "document_id": "7" + } + }, "filetype": "text/plain" }, "text": "American" @@ -12,7 +19,14 @@ "type": "Title", "element_id": "9abdc842ab4bacbaa4da45cec2ef7e0d", "metadata": { - "data_source": {}, + "data_source": { + "version": 1, + "record_locator": { + "url": "http://localhost:9200", + "index_name": "movies", + "document_id": "7" + } + }, "filetype": "text/plain" }, "text": "D. W. Griffith" @@ -21,7 +35,14 @@ "type": "NarrativeText", "element_id": "d366dfc3239f22e3c03ee629f6567a68", "metadata": { - "data_source": {}, + "data_source": { + "version": 1, + "record_locator": { + "url": "http://localhost:9200", + "index_name": "movies", + "document_id": "7" + } + }, "filetype": "text/plain" }, "text": "A white girl (Florence Lawrence) rejects a proposal from an Indian brave (Charles Inslee) in this early one-reel Western melodrama. Despite the rejection, the Indian still comes to the girl's defense when she is abducted by his warring tribe. In her first year in films, Florence Lawrence was already the most popular among the Biograph Company's anonymous stock company players. 
By 1909, she was known the world over as \"The Biograph Girl.\"" diff --git a/test_unstructured_ingest/expected-structured-output/elasticsearch/movies/8-31261b6f.json b/test_unstructured_ingest/expected-structured-output/elasticsearch/movies/8-31261b6f.json index 0688d6ae3..e054e8e41 100644 --- a/test_unstructured_ingest/expected-structured-output/elasticsearch/movies/8-31261b6f.json +++ b/test_unstructured_ingest/expected-structured-output/elasticsearch/movies/8-31261b6f.json @@ -3,7 +3,14 @@ "type": "Title", "element_id": "0baaf6f25d1c5da82d9e4a3229cd45bd", "metadata": { - "data_source": {}, + "data_source": { + "version": 1, + "record_locator": { + "url": "http://localhost:9200", + "index_name": "movies", + "document_id": "8" + } + }, "filetype": "text/plain" }, "text": "American" @@ -12,7 +19,14 @@ "type": "Title", "element_id": "b764cdc0eab7137467211272fa539f12", "metadata": { - "data_source": {}, + "data_source": { + "version": 1, + "record_locator": { + "url": "http://localhost:9200", + "index_name": "movies", + "document_id": "8" + } + }, "filetype": "text/plain" }, "text": "Unknown" @@ -21,7 +35,14 @@ "type": "NarrativeText", "element_id": "7ddfc82896f749f2c5b5c5baac5a93bf", "metadata": { - "data_source": {}, + "data_source": { + "version": 1, + "record_locator": { + "url": "http://localhost:9200", + "index_name": "movies", + "document_id": "8" + } + }, "filetype": "text/plain" }, "text": "No prints of the first American film adaptation of A Christmas Carol are known to exist,[1] but The Moving Picture World magazine provided a scene-by-scene description before the film's release. [2] Scrooge goes into his office and begins working. His nephew, along with three women who wish for Scrooge to donate enter. However, Scrooge dismisses them. On the night of Christmas Eve, his long-dead partner Jacob Marley comes as a ghost, warning him of a horrible fate if he does not change his ways. 
Scrooge meets three spirits that show Scrooge the real meaning of Christmas, along with his grave, the result of his parsimonious ways. The next morning, he wakes and realizes the error of his ways. Scrooge was then euphoric and generous for the rest of his life." diff --git a/test_unstructured_ingest/expected-structured-output/elasticsearch/movies/9-31261b6f.json b/test_unstructured_ingest/expected-structured-output/elasticsearch/movies/9-31261b6f.json index da810758b..efb7c85cb 100644 --- a/test_unstructured_ingest/expected-structured-output/elasticsearch/movies/9-31261b6f.json +++ b/test_unstructured_ingest/expected-structured-output/elasticsearch/movies/9-31261b6f.json @@ -3,7 +3,14 @@ "type": "Title", "element_id": "0baaf6f25d1c5da82d9e4a3229cd45bd", "metadata": { - "data_source": {}, + "data_source": { + "version": 1, + "record_locator": { + "url": "http://localhost:9200", + "index_name": "movies", + "document_id": "9" + } + }, "filetype": "text/plain" }, "text": "American" @@ -12,7 +19,14 @@ "type": "Title", "element_id": "9abdc842ab4bacbaa4da45cec2ef7e0d", "metadata": { - "data_source": {}, + "data_source": { + "version": 1, + "record_locator": { + "url": "http://localhost:9200", + "index_name": "movies", + "document_id": "9" + } + }, "filetype": "text/plain" }, "text": "D. W. Griffith" @@ -21,7 +35,14 @@ "type": "NarrativeText", "element_id": "b87d0bbbe5c735bca621fc172fc44605", "metadata": { - "data_source": {}, + "data_source": { + "version": 1, + "record_locator": { + "url": "http://localhost:9200", + "index_name": "movies", + "document_id": "9" + } + }, "filetype": "text/plain" }, "text": "The film opens in a town on the Mexican border. A poker game is going on in the local saloon. One of the players cheats and is shot dead by another of the players, a Mexican named Pedro. In the uproar that follows Pedro is wounded as he escapes from the saloon. The sheriff is called, who tracks Pedro to his home but Pedro kills the sherriff too. 
While Pedro hides, his wife Juanita, is arrested on suspicion of murdering the sheriff. Pedro rescues her from the town jail and the two head for the Mexican border. Caught by the posse before they reach the border, Juanita is killed and the film ends with Pedro being arrested and taken back to town." diff --git a/test_unstructured_ingest/expected-structured-output/google-drive/117qrVqiCoR5EjYMsDHGdy3UMkEtKr9Q8-test-drive-doc.docx.json b/test_unstructured_ingest/expected-structured-output/google-drive/117qrVqiCoR5EjYMsDHGdy3UMkEtKr9Q8-test-drive-doc.docx.json index e8b68547c..96a24d2b9 100644 --- a/test_unstructured_ingest/expected-structured-output/google-drive/117qrVqiCoR5EjYMsDHGdy3UMkEtKr9Q8-test-drive-doc.docx.json +++ b/test_unstructured_ingest/expected-structured-output/google-drive/117qrVqiCoR5EjYMsDHGdy3UMkEtKr9Q8-test-drive-doc.docx.json @@ -3,7 +3,16 @@ "type": "Title", "element_id": "7e8cd2056da73a7fefb6cd91f4e5d199", "metadata": { - "data_source": {}, + "data_source": { + "url": "https://drive.google.com/uc?id=117qrVqiCoR5EjYMsDHGdy3UMkEtKr9Q8&export=download", + "version": "15", + "record_locator": { + "drive_id": "1OQZ66OHBE30rNsNa7dweGLfRmXvkT_jr", + "file_id": "117qrVqiCoR5EjYMsDHGdy3UMkEtKr9Q8" + }, + "date_created": "2023-06-15T06:15:58.931000", + "date_modified": "2023-06-15T06:15:44" + }, "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "emphasized_text_contents": [ "Title" @@ -18,7 +27,16 @@ "type": "NarrativeText", "element_id": "9870998df89c1da4e01378d0fd085106", "metadata": { - "data_source": {}, + "data_source": { + "url": "https://drive.google.com/uc?id=117qrVqiCoR5EjYMsDHGdy3UMkEtKr9Q8&export=download", + "version": "15", + "record_locator": { + "drive_id": "1OQZ66OHBE30rNsNa7dweGLfRmXvkT_jr", + "file_id": "117qrVqiCoR5EjYMsDHGdy3UMkEtKr9Q8" + }, + "date_created": "2023-06-15T06:15:58.931000", + "date_modified": "2023-06-15T06:15:44" + }, "filetype": 
"application/vnd.openxmlformats-officedocument.wordprocessingml.document", "emphasized_text_contents": [ "This is a good reason to continue" diff --git a/test_unstructured_ingest/expected-structured-output/google-drive/1SpQuE7jHz9nMt5hfQXsiok1SgIdRYX5o-fake.docx.json b/test_unstructured_ingest/expected-structured-output/google-drive/1SpQuE7jHz9nMt5hfQXsiok1SgIdRYX5o-fake.docx.json index c5fd8c471..96ec57344 100644 --- a/test_unstructured_ingest/expected-structured-output/google-drive/1SpQuE7jHz9nMt5hfQXsiok1SgIdRYX5o-fake.docx.json +++ b/test_unstructured_ingest/expected-structured-output/google-drive/1SpQuE7jHz9nMt5hfQXsiok1SgIdRYX5o-fake.docx.json @@ -3,7 +3,16 @@ "type": "Title", "element_id": "dd14cbbf0e74909aac7f248a85d190af", "metadata": { - "data_source": {}, + "data_source": { + "url": "https://drive.google.com/uc?id=1SpQuE7jHz9nMt5hfQXsiok1SgIdRYX5o&export=download", + "version": "17", + "record_locator": { + "drive_id": "1OQZ66OHBE30rNsNa7dweGLfRmXvkT_jr", + "file_id": "1SpQuE7jHz9nMt5hfQXsiok1SgIdRYX5o" + }, + "date_created": "2023-06-15T06:15:59.687000", + "date_modified": "2023-06-15T06:15:43" + }, "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document" }, "text": "Lorem ipsum dolor sit amet." 
diff --git a/test_unstructured_ingest/expected-structured-output/google-drive/1cTKXAreuj-wYmL38nFnqKvz3X8UKcaMC-foo.txt.json b/test_unstructured_ingest/expected-structured-output/google-drive/1cTKXAreuj-wYmL38nFnqKvz3X8UKcaMC-foo.txt.json index ce6319064..482b9220e 100644 --- a/test_unstructured_ingest/expected-structured-output/google-drive/1cTKXAreuj-wYmL38nFnqKvz3X8UKcaMC-foo.txt.json +++ b/test_unstructured_ingest/expected-structured-output/google-drive/1cTKXAreuj-wYmL38nFnqKvz3X8UKcaMC-foo.txt.json @@ -3,7 +3,16 @@ "type": "Title", "element_id": "8b5b9db0c13db24256c829aa364aa90c", "metadata": { - "data_source": {}, + "data_source": { + "url": "https://drive.google.com/uc?id=1cTKXAreuj-wYmL38nFnqKvz3X8UKcaMC&export=download", + "version": "9", + "record_locator": { + "drive_id": "1OQZ66OHBE30rNsNa7dweGLfRmXvkT_jr", + "file_id": "1cTKXAreuj-wYmL38nFnqKvz3X8UKcaMC" + }, + "date_created": "2023-06-15T06:15:59.687000", + "date_modified": "2023-06-15T06:15:39" + }, "filetype": "text/plain" }, "text": "three" diff --git a/unstructured/__version__.py b/unstructured/__version__.py index a2d17fc41..b6ecb5cd2 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.10.16-dev1" # pragma: no cover +__version__ = "0.10.16-dev2" # pragma: no cover diff --git a/unstructured/ingest/connector/elasticsearch.py b/unstructured/ingest/connector/elasticsearch.py index d80152686..58e8aeb05 100644 --- a/unstructured/ingest/connector/elasticsearch.py +++ b/unstructured/ingest/connector/elasticsearch.py @@ -12,6 +12,7 @@ from unstructured.ingest.interfaces import ( BaseSourceConnector, IngestDocCleanupMixin, SourceConnectorCleanupMixin, + SourceMetadata, ) from unstructured.ingest.logger import logger from unstructured.utils import requires_dependencies @@ -33,7 +34,7 @@ class SimpleElasticsearchConfig(BaseConnectorConfig): @dataclass -class ElasticsearchFileMeta: +class ElasticsearchDocumentMeta: """Metadata specifying: name of the 
elasticsearch index that is being reached to, and the id of document that is being reached to, @@ -53,7 +54,7 @@ class ElasticsearchIngestDoc(IngestDocCleanupMixin, BaseIngestDoc): """ connector_config: SimpleElasticsearchConfig - file_meta: ElasticsearchFileMeta + document_meta: ElasticsearchDocumentMeta registry_name: str = "elasticsearch" # TODO: remove one of filename or _tmp_download_file, using a wrapper @@ -61,8 +62,8 @@ class ElasticsearchIngestDoc(IngestDocCleanupMixin, BaseIngestDoc): def filename(self): return ( Path(self.read_config.download_dir) - / self.file_meta.index_name - / f"{self.file_meta.document_id}.txt" + / self.document_meta.index_name + / f"{self.document_meta.document_id}.txt" ).resolve() @property @@ -71,7 +72,7 @@ class ElasticsearchIngestDoc(IngestDocCleanupMixin, BaseIngestDoc): the output file.""" # Generate SHA256 hash and take the first 8 characters query_hash = hashlib.sha256((self.connector_config.jq_query or "").encode()).hexdigest()[:8] - output_file = f"{self.file_meta.document_id}-{query_hash}.json" + output_file = f"{self.document_meta.document_id}-{query_hash}.json" return ( Path(self.partition_config.output_dir) / self.connector_config.index_name / output_file ) @@ -104,21 +105,50 @@ class ElasticsearchIngestDoc(IngestDocCleanupMixin, BaseIngestDoc): concatenated_values = seperator.join(values) return concatenated_values + @requires_dependencies(["elasticsearch"], extras="elasticsearch") + def _get_document(self): + from elasticsearch import Elasticsearch, NotFoundError + + try: + # TODO: instead of having a separate client for each doc, + # have a separate client for each process + es = Elasticsearch(self.connector_config.url) + document = es.get( + index=self.connector_config.index_name, + id=self.document_meta.document_id, + ) + except NotFoundError: + logger.error("Couldn't find document with ID: %s", self.document_meta.document_id) + return None + return document + + def update_source_metadata(self, **kwargs): + 
document = kwargs["document"] if "document" in kwargs else self._get_document() + if document is None: + self.source_metadata = SourceMetadata( + exists=False, + ) + return + self.source_metadata = SourceMetadata( + version=document["_version"], + exists=document["found"], + ) + @SourceConnectionError.wrap - @requires_dependencies(["elasticsearch", "jq"], extras="elasticsearch") + @requires_dependencies(["jq"], extras="elasticsearch") @BaseIngestDoc.skip_if_file_exists def get_file(self): import jq - from elasticsearch import Elasticsearch logger.debug(f"Fetching {self} - PID: {os.getpid()}") - # TODO: instead of having a separate client for each doc, - # have a separate client for each process - es = Elasticsearch(self.connector_config.url) - document_dict = es.get( - index=self.connector_config.index_name, - id=self.file_meta.document_id, - ).body["_source"] + document = self._get_document() + self.update_source_metadata(document=document) + if document is None: + raise ValueError( + f"Failed to get document {self.document_meta.document_id}", + ) + + document_dict = document.body["_source"] if self.connector_config.jq_query: document_dict = json.loads( jq.compile(self.connector_config.jq_query).input(document_dict).text(), @@ -128,6 +158,26 @@ class ElasticsearchIngestDoc(IngestDocCleanupMixin, BaseIngestDoc): with open(self.filename, "w", encoding="utf8") as f: f.write(self.document) + @property + def date_created(self) -> t.Optional[str]: + return None + + @property + def date_modified(self) -> t.Optional[str]: + return None + + @property + def source_url(self) -> t.Optional[str]: + return None + + @property + def record_locator(self) -> t.Optional[t.Dict[str, t.Any]]: + return { + "url": self.connector_config.url, + "index_name": self.connector_config.index_name, + "document_id": self.document_meta.document_id, + } + @dataclass class ElasticsearchSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector): @@ -166,7 +216,7 @@ class 
ElasticsearchSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnec connector_config=self.connector_config, partition_config=self.partition_config, read_config=self.read_config, - file_meta=ElasticsearchFileMeta(self.connector_config.index_name, id), + document_meta=ElasticsearchDocumentMeta(self.connector_config.index_name, id), ) for id in ids ] diff --git a/unstructured/ingest/connector/google_drive.py b/unstructured/ingest/connector/google_drive.py index 3de62e2f0..3144677e6 100644 --- a/unstructured/ingest/connector/google_drive.py +++ b/unstructured/ingest/connector/google_drive.py @@ -3,6 +3,7 @@ import json import os import typing as t from dataclasses import dataclass +from datetime import datetime from mimetypes import guess_extension from pathlib import Path @@ -18,6 +19,7 @@ from unstructured.ingest.interfaces import ( IngestDocCleanupMixin, IngestDocSessionHandleMixin, SourceConnectorCleanupMixin, + SourceMetadata, ) from unstructured.ingest.logger import logger from unstructured.utils import requires_dependencies @@ -103,57 +105,109 @@ class SimpleGoogleDriveConfig(ConfigSessionHandleMixin, BaseConnectorConfig): @dataclass class GoogleDriveIngestDoc(IngestDocSessionHandleMixin, IngestDocCleanupMixin, BaseIngestDoc): connector_config: SimpleGoogleDriveConfig - file_meta: t.Dict[str, str] + meta: t.Dict[str, str] registry_name: str = "google_drive" @property def filename(self): - return Path(self.file_meta.get("download_filepath")).resolve() # type: ignore + return Path(self.meta.get("download_filepath")).resolve() # type: ignore @property def _output_filename(self): - return Path(f"{self.file_meta.get('output_filepath')}.json").resolve() + return Path(f"{self.meta.get('output_filepath')}.json").resolve() + @property + def record_locator(self) -> t.Optional[t.Dict[str, t.Any]]: + return { + "drive_id": self.connector_config.drive_id, + "file_id": self.meta["id"], + } + + @requires_dependencies(["googleapiclient"], extras="google-drive") + def 
update_source_metadata(self): + from googleapiclient.errors import HttpError + + try: + file_obj = ( + self.session_handle.service.files() + .get( + fileId=self.meta["id"], + fields="id, createdTime, modifiedTime, version, webContentLink", + ) + .execute() + ) + except HttpError as e: + if e.status_code == 404: + logger.error(f"File {self.meta['name']} not found") + self.source_metadata = SourceMetadata( + exists=True, + ) + return + raise + + date_created = None + if dc := file_obj.get("createdTime", ""): + date_created = datetime.strptime( + dc, + "%Y-%m-%dT%H:%M:%S.%fZ", + ).isoformat() + + date_modified = None + if dm := file_obj.get("modifiedTime", ""): + date_modified = datetime.strptime( + dm, + "%Y-%m-%dT%H:%M:%S.%fZ", + ).isoformat() + + self.source_metadata = SourceMetadata( + date_created=date_created, + date_modified=date_modified, + version=file_obj.get("version", ""), + source_url=file_obj.get("webContentLink", ""), + exists=True, + ) + + @requires_dependencies(["googleapiclient"], extras="google-drive") @SourceConnectionError.wrap @BaseIngestDoc.skip_if_file_exists - @requires_dependencies(["googleapiclient"], extras="google-drive") def get_file(self): from googleapiclient.errors import HttpError from googleapiclient.http import MediaIoBaseDownload - if self.file_meta.get("mimeType", "").startswith("application/vnd.google-apps"): + if self.meta.get("mimeType", "").startswith("application/vnd.google-apps"): export_mime = GOOGLE_DRIVE_EXPORT_TYPES.get( - self.file_meta.get("mimeType"), # type: ignore + self.meta.get("mimeType"), # type: ignore ) if not export_mime: logger.info( - f"File not supported. Name: {self.file_meta.get('name')} " - f"ID: {self.file_meta.get('id')} " - f"MimeType: {self.file_meta.get('mimeType')}", + f"File not supported. 
Name: {self.meta.get('name')} " + f"ID: {self.meta.get('id')} " + f"MimeType: {self.meta.get('mimeType')}", ) return request = self.session_handle.service.files().export_media( - fileId=self.file_meta.get("id"), + fileId=self.meta.get("id"), mimeType=export_mime, ) else: - request = self.session_handle.service.files().get_media(fileId=self.file_meta.get("id")) + request = self.session_handle.service.files().get_media(fileId=self.meta.get("id")) file = io.BytesIO() downloader = MediaIoBaseDownload(file, request) + self.update_source_metadata() downloaded = False try: while downloaded is False: - status, downloaded = downloader.next_chunk() + _, downloaded = downloader.next_chunk() except HttpError: pass saved = False if downloaded and file: - dir_ = Path(self.file_meta["download_dir"]) + dir_ = Path(self.meta["download_dir"]) if dir_: if not dir_.is_dir(): - logger.debug(f"Creating directory: {self.file_meta.get('download_dir')}") + logger.debug(f"Creating directory: {self.meta.get('download_dir')}") if dir_: dir_.mkdir(parents=True, exist_ok=True) @@ -162,7 +216,6 @@ class GoogleDriveIngestDoc(IngestDocSessionHandleMixin, IngestDocCleanupMixin, B handler.write(file.getbuffer()) saved = True logger.debug(f"File downloaded: {self.filename}.") - if not saved: logger.error(f"Error while downloading and saving file: {self.filename}.") @@ -267,7 +320,7 @@ class GoogleDriveSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnecto connector_config=self.connector_config, partition_config=self.partition_config, read_config=self.read_config, - file_meta=file, + meta=file, ) for file in files ] diff --git a/unstructured/ingest/connector/wikipedia.py b/unstructured/ingest/connector/wikipedia.py index fe35607b8..d07415e7e 100644 --- a/unstructured/ingest/connector/wikipedia.py +++ b/unstructured/ingest/connector/wikipedia.py @@ -10,6 +10,7 @@ from unstructured.ingest.interfaces import ( BaseSourceConnector, IngestDocCleanupMixin, SourceConnectorCleanupMixin, + 
SourceMetadata, ) from unstructured.ingest.logger import logger from unstructured.utils import requires_dependencies @@ -50,15 +51,49 @@ class WikipediaIngestDoc(IngestDocCleanupMixin, BaseIngestDoc): def _output_filename(self): raise NotImplementedError() + @property + def date_created(self) -> t.Optional[str]: + return None + + @property + def date_modified(self) -> t.Optional[str]: + return None + + @property + def record_locator(self) -> t.Optional[t.Dict[str, t.Any]]: + return { + "page_title": self.connector_config.title, + "page_url": self.source_metadata.source_url, # type: ignore + } + def _create_full_tmp_dir_path(self): self.filename.parent.mkdir(parents=True, exist_ok=True) + @requires_dependencies(["wikipedia"], extras="wikipedia") + def update_source_metadata(self): + from wikipedia.exceptions import PageError + + try: + page = self.page + except PageError: + self.source_metadata = SourceMetadata( + exists=False, + ) + return + + self.source_metadata = SourceMetadata( + version=page.revision_id, + source_url=page.url, + exists=True, + ) + @SourceConnectionError.wrap @BaseIngestDoc.skip_if_file_exists def get_file(self): """Fetches the "remote" doc and stores it locally on the filesystem.""" self._create_full_tmp_dir_path() logger.debug(f"Fetching {self} - PID: {os.getpid()}") + self.update_source_metadata() with open(self.filename, "w", encoding="utf8") as f: f.write(self.text)