From f048695a5583c84c1bc9988063c0eb8f311d4af6 Mon Sep 17 00:00:00 2001 From: Filip Knefel <158048836+ds-filipknefel@users.noreply.github.com> Date: Wed, 14 Feb 2024 18:48:38 +0100 Subject: [PATCH] feat: include text from shapes in docx (#2510) Reported bug: Text from docx shapes is not included in the `partition` output. Fix: Extend docx partition to search for text tags nested inside structures responsible for creating the shape. --------- Co-authored-by: Filip Knefel --- CHANGELOG.md | 2 ++ example-docs/docx-shapes.docx | Bin 0 -> 5950 bytes test_unstructured/partition/docx/test_docx.py | 14 ++++++++++++++ unstructured/partition/docx.py | 7 ++++++- 4 files changed, 22 insertions(+), 1 deletion(-) create mode 100644 example-docs/docx-shapes.docx diff --git a/CHANGELOG.md b/CHANGELOG.md index b34f618d0..9964879c6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -28,6 +28,7 @@ * **Add .heic file partitioning** .heic image files were previously unsupported and are now supported though partition_image() * **Add the ability to specify an alternate OCR** implementation by implementing an `OCRAgent` interface and specify it using `OCR_AGENT` environment variable. * **Add Vectara destination connector** Adds support for writing partitioned documents into a Vectara index. +* **Add ability to detect text in .docx inline shapes** extensions of docx partition, extracts text from inline shapes and includes them in paragraph's text ### Fixes @@ -41,6 +42,7 @@ * **Add title to Vectara upload - was not separated out from initial connector ** * **Fix change OpenSearch port to fix potential conflict with Elasticsearch in ingest test ** + ## 0.12.3 ### Enhancements diff --git a/example-docs/docx-shapes.docx b/example-docs/docx-shapes.docx new file mode 100644 index 0000000000000000000000000000000000000000..81ff7f2d4f7d9ef9950390bc3ec8e00bd01befe9 GIT binary patch literal 5950 zcmaJ_1z3}9+orp_yQI58N}ACKgGq-7jBccr5@}@8I64(j8I80cC?SlN5`l@7^gsMU zUjOfXuN}{h-N$x5d+zg!`?^j89drx|6dW8Jln`|dbCer~kNj-y?dawsBzXN^lLpsD z7a@6sIHR!Lh0U0erj8t`$1KE8VAG_{pi5$N8P|ZpJPCpsrbXFWmp%Qukdbl3B%c?P zy>NY{s{w_Zul**WJc23fb^U$n5%L|w1P)^Rp~^rNN#VE`ay73Q2kdL@iVg0y3@||% z-cIB0;2*u1_6sXHamH#)*KP8{3jAhP=#PlMOk;$3G_Dpa1mby@s)UgF@|3$f0J#KN z`PtV#M$VPbZ#zh+?3FjgtwbuG>NqFpY+ST>l+Kq{v@5fb9*u!Ja-s z_F!*E!9aJnCq^LsXkm)55CX!VClr4A ze^4mfcqoz2T={V(`eQ2ks*CC!z%WoLD!Fi6zAZk7quT6y zu`BM7=UB1?8ALdzutp@!rh0~ow^E^sBx`P>vbayh)JAKDS^(v7`|Tb|8i=u~*MrTW zand|~A4%X)9=CQJWwEBH>gf_@R_T=g^_#K(I!37NzZt& z)gU3GY1v%3ddtXp2pJ|vf}lDbLCX-{QR_v>*G4|s?&b$=XFoBVPE*xw?0fWZ!-8?H z&ifDOuf62qG?VCzih=_FM=z1w^pdTo=e3jaQinZySV-<1U*O?8(%Wy?YLIemseF2VpV4^6)PFkDm5KDJ!$wN(fGc%%dAW0K3tUfMNPe89>-(q zkq-hqPo1^WY~KVMpuL+QP?{j0roOT+A8Umbz_X>u12ZzlAdi9yF|IP9N7c6+@x}3X zlv6b9=>~I%>5D8+9G7<#(KhzvN59NC@J*fspIbThyOBOY>;E1DUSj%?KKRK((oU8= zkl@?E*~3Y3P@k6caVPlcwmD{UDaJ9Tj0<1XDDE!b4=b_ZH^e~@K`M-=2Lp1N)*bYH za#NzrW$FMLCkw}%3dRvwd^DcjPyAE#Yr;7U-=2#h31|IB!pUzEZtv&g3wHm-`Dg=P zw=hNQ@_ZT24BP}e^-EcpprsLaXUmc)e)k;5Y5=K`2D#J%L9Go1Q*U&~V{5!uM6 z-a}r0pY9CEZv(*I4nqGbi^$CPb9eOcz24iuN@S`Ya)TtX$N6mU7fMY@QcFG5Joye$ zMDm)uByk6Mf~fIKVRYv*AI5g}{qvt-a1w)>L^eyzwN!{(2rq@FylKXFzkYPI(R zjI3~L{3Cnm$uSz$JbHr+1dc>Gri$XzXHV}L@rU)~nQ+gr6hOa5o#|sypmV`Z(avj! zA~K~OchtLs9>mgDjVjR5r7lH!Fh0`EXjdltUYF!RUfWq4QB;mEu5 zy4y;{&(=1m^YhALwMMa*sRFL)I1v)=>_u*t{2$$XUHyL1ag&H0poI@ylH?(xuCrb& zh__djT2Ev2g;$;fm!5d@^JyZCC^owk8zlpc@Gxo=Ek?zazI<`ODmE#LKQ%0I{%L`( zLU_ZF4A8(r#85Z4a6>MRN{yrXnSO4E;D@;Z<;?CoQEz!R-&uSb3iA3fyziKcW3IpAMSOb076VMT(4> zaHh|P%3R5ipd!Rmc`X^UC2(fLOV=8K-P!QeeKU`s0#LRJ7nCB#jFN+G!+(+CRk0ZP z&tn{O_*e0vv+Z2X*11{l=~QFbn+9``*v#8;bScO-EuW3h_v3Al!)ihyPP{XN zEUvt=tULx)9=S_-j^@-U5c0G_JR84hokHLX<~sRC8-OuDo_&ODYTn z{4jw)&QQwe?ZCRjj7g7ecfw;`)#Q8LcN`-r!#MFU^}rtOLl_*Cd7nN=MDMVKjhzBv z_i1RdMEHE4@z`dzOjwANEW*ZN9=^32>an#4RB_cP=jY4P48)9TQ{>zX$t9g!_%RKA#K)%0Wkc?f=aJeo5D_Y&M z6wuIJp0zE)qTG)Ly_e~)v>+r{NFy&-rA^<>oB6gq>Q19*yt!0-r)&)fa8|Uq<|{9X zwxABn$PCuIlYC&aqN&*!k0-QXI;82LUQ>Oq4#Rnn_RE~^YUojdjstHn0~3_Py?KW8 z3np+jy+fztHVxt4QcHUo;WK+T9~<=+<(fSoNcVW%HqkvF4IyY@N0W<~gR84G6!u`E z@KP=^7uB1ltl583wFy$AtJB@|?HSBqRaXThnU=W|ib}w;W+zjFUksGtd=Shx<%SXy zLCgD^#DSK)D|VaWeZk?mzspH++M^U_D@Hg|fJO4g!Hg9nA|law<>)AF*&T|jPUl>t zbpkJbsW(StBKNEb-6};_DV>~CZ=7OTZq_xBLp4DV4UYyWJp_A*)xaR$;Xb zFZ%KCqL#loQd`O8#4S^A^y`eJGfT2z>H#E=<|N}X=$A*Wt`*nvEGJpz1WvSIZ0H4` zOMmTDjg!|_f4G=2&(=I%B6*9Dw@^QdQg5>}AVF<^Eyji=;8wj+Ri#2+yz1SB`Am2a z-nlt*Pze9}z=&e=j~c>Hcng5&K&c0Ag+KQ z;HTDAr#{j7YNtLEElv;m?S>`qC2s78KQQ1w&Q`efw{NQv+i5o^jwPwQ8?0bq6l=i- zt56Q>0Lr>mIH=#*#4xQim7}Uz)5eW`uVDRxZXz;$L5eZ~xqF6x+UI-zE}kF%8~-jT zuDL7hOs_12>kiQ&p?NwrfN!5wkFOFCCDJ+L;u7KS^EwZxD(7^2H%9<|Rkma4AKf+3 z@#%Yin2HRIT1m12)GePw+W&mz+ z%1{Mr%wk;QBNMkMzdK)7TZ#rfXb7I3K5+*<%C>tj&S-5p{$o6w8X*oB2!^$Lc!-nk zihSK8N>{)At;!6f$CxQOc_6URllcPM%Y;rmg@7&y9QeWQm@X$w(8bK#tXw^h+9wSo0hKBa*r1!n@8Ed z$aibo1{!cl8N2YXO@2nzhrg#F>8EHH`y7+KpBQd~$;7Yh`68V2O;QnQwBf{4VC3zF z(aK#cfeB_8+WUhJa(UBu-J&Vzrxe`t7^jZxS$GYk=)6$zE3w28GsL&^k1ddO8^xj+ zKNIU$@f8#xKj^c&dL*YVfSgf<125R<+S+?Tba0S>MU)#;;|#^LRg*PNsV*twEASi} zZ5H*>b8SmScc!)@>ez+#*Dc&vXx9fQ*MXc88a4{bcfNn>Cy4)Zfcp3bxjFh=_Yu9O zAn+`5cv!59I|rcS_))q7{Y=W;D4(D1p(pvMw#qg;%RvvO5wjfpoNCbd#&)GpkKMzz z4Flrz|UM2y;$VUAG|2ZsY;R&GI3;zmIfMr_3;fm9h`c=p4WviSO0UV21|b9%bCj$21O zfU9LL{Etk$SCj_feoY5;drv$jR8B#=`;TI{?|upRd431_4KOW_vKhp_{8VgKk|r6V zPD5}8Hp|n7vj$VnOfoD-bAWMzUTO@acZ}N)FW0g#C50ZCYHcMt=u|9`d1!|#V27LG z_@ACU^JQr5a$#OXFm6FCr|Ebi&YXhom-Okq);|`^T!3%IxFS-`UQ1lkorG9fTR)lB zJZwBqN3C5XXYm1 z-%4p*q?Qa6`U2i=2*qy#reSLek8bZ~Z?ro!)@7`_g77C(&Qg;u*Xtb?|28;_r^UxjkeppIg z!Qn|8?9TAg+>VTS0o4z;C9*ScNG77jAr%`DUC923vQL{!)v_Q^JVy}#-LmSU4QZW- z8LcJrq&%4^nlR74+iF+J)wstABcG9^Iz9zkrfk|HiIf-?k{}bu| zZP9%keSPnHIRC0Kjt0Yk9wma-eai8Lucex;#if+l3Fx^%#m;%@9qpMz*B$YUx_+&4 zr4^?TCwZni$$}!j(PuNp)g#`95VVq#zJ7am#UnVJ+qhoIt*|7n=`G9iky9rLlO(pG z6%{G9+DEgwYDSI%FmkLVOwOc)rVRAB<`Wm}A-$p16boq4s{qFn8}y$pQ!PIdc_>sf zh2@fpH!`3N>|-|gVca3o+c9ZIqhpin4WaqAo3ZzVzQ&NM`B-@kDvV-iutuJn!A|@+ z)o~!M`KQr{R;z^yq<#{(kbio@gyqk%5aG_pD8VJRN;N*VVc$ z1IeWDwTh51V43lzUI~YDC#s*Uh3A38ww7xg^UL{teouABG%P4L6obGm(LbbzB*)lkfI8WYMZ5YAdPB+PV8gytc%$z+e5_x z4G{&k#aVmAW~gm+PwQ?;$wm;}k7II3TT~s)^3!CR3Y_yc1f;>5g^KaIt7|x&QHRCN z?$?bf?;PXY1#Z94fjQ9fK4xz`sZ)dr;9R zP=1S*H#NFzvGT8WL%RGEcvCjI&XC{6j{K|tC+PhNzbS%TYfHav5LpHP3;(y$^e6hJ z)NmbEzfBh72Ku*b^5+e14(Hd3#BU=*lJrmd#GiM%*`@rI2Ue{AG1~tLzghRMTbbWx zNBD2}@5bg&@Xh*leKhaSiPKra7B XsvGEFA~``p!9%{nkZ!J`xL*AqL text is written inside inline shapes -- + partitioned_doc = partition_docx(example_doc_path("docx-shapes.docx")) + assert [element.text for element in partitioned_doc] == [ + "Paragraph with single within.", + "Paragraph with and within.", + # -- text "" in floating shape is ignored -- + "Paragraph with floating shape attached.", + ] + + # -- module-level fixtures ----------------------------------------------------------------------- diff --git a/unstructured/partition/docx.py b/unstructured/partition/docx.py index cda12787a..d24c6f311 100644 --- a/unstructured/partition/docx.py +++ b/unstructured/partition/docx.py @@ -330,7 +330,12 @@ class _DocxPartitioner: does not contribute to the document-element stream and will not cause an element to be emitted. """ - text = paragraph.text + text = "".join( + e.text + for e in paragraph._p.xpath( + "w:r | w:hyperlink | w:r/descendant::wp:inline[ancestor::w:drawing][1]//w:r" + ) + ) # NOTE(scanny) - blank paragraphs are commonly used for spacing between paragraphs and # do not contribute to the document-element stream.