John 9500d04791
detect document language across all partitioners (#1627)
### Summary
Closes #1534 and #1535
Detects document language using the `langdetect` package.
Creates new kwargs for the user to set the document language (`languages`)
or to detect the language at the element level instead of the default
document level (`detect_language_per_element`).

---------

Co-authored-by: shreyanid <42684285+shreyanid@users.noreply.github.com>
Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: Coniferish <Coniferish@users.noreply.github.com>
Co-authored-by: cragwolfe <crag@unstructured.io>
Co-authored-by: Austin Walker <austin@unstructured.io>
2023-10-11 01:47:56 +00:00

32 lines
1.2 KiB
JSON

[
{
"type": "NarrativeText",
"element_id": "007ec3bff83ee17497e490b86a36e0dd",
"metadata": {
"data_source": {
"url": "https://outlook.office365.com/owa/?ItemID=AAMkAGE2MmEwNzFlLWVjYzAtNDNhZS04ZGM1LTFjYmMzZDhiMmI0MABGAAAAAADc1MfJYetSQ6QZntYrI9k4BwDZYn%2FlfnvLSqIcW%2FYsN8ebAAATzq5sAADZYn%2FlfnvLSqIcW%2FYsN8ebAAATzrolAAA%3D&exvsurl=1&viewmodel=ReadMessageItem",
"version": "CQAAABYAAADZYn/lfnvLSqIcW/YsN8ebAAATxicu",
"record_locator": {
"message_id": "AAMkAGE2MmEwNzFlLWVjYzAtNDNhZS04ZGM1LTFjYmMzZDhiMmI0MABGAAAAAADc1MfJYetSQ6QZntYrI9k4BwDZYn-lfnvLSqIcW-YsN8ebAAATzq5sAADZYn-lfnvLSqIcW-YsN8ebAAATzrolAAA=",
"user_email": "devops@unstructuredio.onmicrosoft.com"
},
"date_created": "2023-07-10T03:39:04",
"date_modified": "2023-07-15T22:36:12"
},
"filename": "4a16a411f162ebbb.eml",
"last_modified": "2023-07-09T20:38:47-07:00",
"filetype": "message/rfc822",
"languages": [
"eng"
],
"sent_from": [
"David Potter <potterdavidm@gmail.com>"
],
"sent_to": [
"devops@unstructuredio.onmicrosoft.com"
],
"subject": "message for subfolder"
},
"text": "this is a message for the subfolder"
}
]