From 6173362620ebfecf01bbddf3bde62aa49f085922 Mon Sep 17 00:00:00 2001 From: John <43506685+Coniferish@users.noreply.github.com> Date: Mon, 10 Jul 2023 10:29:08 -0500 Subject: [PATCH] fix: detect list items in MS Word documents (#909) * fix merge conflict * update changelog and version --- CHANGELOG.md | 3 ++- example-docs/example-list-items-multiple.docx | Bin 0 -> 18793 bytes test_unstructured/partition/test_docx.py | 15 +++++++++++++ unstructured/__version__.py | 2 +- unstructured/partition/docx.py | 20 ++++++++++++------ 5 files changed, 32 insertions(+), 8 deletions(-) create mode 100644 example-docs/example-list-items-multiple.docx diff --git a/CHANGELOG.md b/CHANGELOG.md index 3cc3131b8..7c110cdda 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.8.1-dev0 +## 0.8.1-dev1 ### Enhancements @@ -7,6 +7,7 @@ ### Fixes * Fixed `auto` strategy detected scanned document as having extractable text and using `fast` strategy, resulting in no output. +* Fix list detection in MS Word documents. ## 0.8.0 diff --git a/example-docs/example-list-items-multiple.docx b/example-docs/example-list-items-multiple.docx new file mode 100644 index 0000000000000000000000000000000000000000..4e054cdebd1a73e90e3faf6a7c39f83f046c68d8 GIT binary patch literal 18793 zcmeIabyOeAvIa^B!QCNfaCevB?(XjH?(Xgug1fszaCe8`1b4UhOU}6`d*|%;)_Q;6 zJ8M<1k^X9?XR52KyQ)S;{5=RV5GW8h5D*YPP^PazRpvV&plUE6AS57gV0C_LD+fa> z2Wh#Q{ujT2xCs>SW@R(YCW%hJ*3*+nF>A2s+a8k~y7xjhgRThobp(e!frMkHg(+gRwj;ps zzxP4k8+Lm4zyG1&iUE50hB@3V)3hHG<`LMm?3Quo1#oVdv$ zMLm3bKtj>}{ElQVCJ2wh0Fq2yiTWDZalOo=q|`N(6_StP0> z?BOBz&b$q2gVdiR*yDYsW18&4VxBIC7om7}xPOqTGJ;i;H+mx|LP7kxxwwKYr+C*R z%c8R_GEMRYuH+_<=j43vD3Ie3&@JEI-UG?}tykm3Vm6)uL{$oK6$;R+we1Wo>}jZf zmjBnA{}*@RzcjrpuG{PzEi})W|BL@ryXgwi~D7-=A?z3zwYVSHvj!&j}--n9Ohp~-s%7~B54f0ROJ=~jFI0wTu* z0zv@16lZHY0~!NseMd{c(D!r9J5Dl>&Sgdp>Qp}TuDa)bpJJXfRz}=NKJA5mV&(1g zUMkza+B{w3@yOhMBrubTu-0r2RglMG{M*q2H<u z1X3T?1_weAgxyifd@C@BLc~2- z>5jYMC$Lg=(Lk($iV^r)!|gzGJ>*LdeOA+X#1w;f)`TAWH=b8Ss>C6!2-q~`Ts{`p zW{5cZ$_LrcHGj>(D}KE!7lrqxTJ#5IAe?bb%fBp2N&2oMlh(POEh1PCveoMk;VWtq z!9Zd~>Y#_riMwzCXW5p-U9oi}{?WJAt5UJ@W9mH~1Ae;7`amnWhZb7WG*kSx#Dj=^ zY+hM8?2b~}p#0ShyMk5j@u zyBr^h1j;vN!mVreMm1WEFHC`|-k%e!Ug=NK_N3_w6&}xL)ETQWWtb^3l5k;|-R;u(4lRrrBpYWm5fR99uiilT2Q4BFsc_qWHZam>rA`VZvjm)?YcWGb_%`O#p!iigjZv|a{{IX)X zsmH8x!Q(0&KUVo{PF=}2=3%89$+#A#BDgtrC61>1S}*vPNq?;s#(|Ft2A5?69!PNB zlld>@Wu9@!642dhwZ8=?mbr4#gll_Zb_8b?YRIcr&VTh54* z73nYi6j)3fMljozc9Rd)`y|sfM=Iaf4U$*kj4cqN@Vp3!sW^ey@(v=Hr21I-NYLtV zj^)z>7kP5T2nT&g+I?e;hQfcisP2Svvw3UXcv~Eu%h45W0L)YP(RBSnGbowdgN9;R&X_LUJu)oWT8oZt6}_Ia|}imxNNF zylG>6l0*+SQk1=s1P=*Wa3P-(FaP2VX?$d%H?UvU$YdAyQXAD1j29@!NmaQ|1b~L zK6h$0`t+3pfpO2+qHfTPf+T5Jb=qrSwO!|~hWd7>YcLfi$ZdTYEV|}1#6^7i5a+a8oFRJzeRs1?MzG9D3r zfyZdE;&aj>Gy|a$l&i*}#A%Fa)bag;VMG0|Sy zJn2)99zEvFvoo?OMN*xxAKu$69Ksw63er7R&nvHvudhxHuwGPpj~|(3wWnz)Y3?Ne zFK~Lt2>~?4^^`HsGSysY+=&WYb{59;o^o975Idpuej4|FMSN0p>P< z+W)(^vV;+-Z?y1&cOuXHJ6yJM*_^X^9|LBoKtjvoC2&495V!;HoMF_9rSTb7pW9XJ zx)|*79L|@RGXsgp>}R-geW`Z^oXOK- zklxVvu9-Bw3bfjLXB8DjM3;ZfKPO)7BUy}?{FN3mZVqTiSPm15I9yVe3ur67u932w z7xV2ouC=D32r0$RKMCf+N1)zizp4h<#z4s5#=3gFA2cj@p_i@dXpED<1 zWb&t*KvIy=dWK2-Fb!aX8jc2O#i>IUNd)LrW-l|RjY4QSOWFiu#`9I4@`~&Fwe?ep z(0a?ILudxI-^CVGb*Y4mF9~pq#+>%V>T2T%?2A~nb9H^(vf8P8Fu)`iFY$+bEj%>(ZX4-U?ni| zdI!PIzGt$$20gu22XC)8>}gL~)hdLn@qW}VNKCKWhfj^coaF$s1Sr>q2On9cRh0(OLr>O!Q$`BE~-0lc>Pb%e9%D(4@<-!prK5)63N32b9! zVB7eKgw^K{1R<B z215|pn)g5}N0RBn(w(0JOx!4EZ?UO#20!qqUnvBnv{>VuMCT4{L|aaq5*qz3tgDpx z+}4gkH>gR%gjtF_Oykj=YHQkC;fdqmVyW}Fl*?q^T2;l$_s5h`zfwnw^zrzqx#(>M zq`Cl%X&Nt9J}sHNqw@y8DQ&7!%(r?VEEgrN^c^=kki?xuw&PW4?FrZT_}Xi?&|sOzwT-=G2SvQ z7np`Nb7Zb|kwcYe?(CWCt;>~V%iAGZJMo!4XZe02g@dDCrEuzsj3GU(Y|0zxIlW!B z=jzw4k5bgaS&0V@_zG!mk$>C0gXn;a3P)~})oW$-zJJ5?<@xdvc%Q{MFgnR`UiHDg z!lqx`tH+=Bnloxoy53Z>(dEn1d_=A}%dB|r+}eVe z=ua7jV=TbfPJmJ^Yy09=>9o+%`ZM7x#?pGh*COCAvD@> z9xH5aRfcIUX)WrlJE5I4Wr@x8WONdyoKG#76zy1u?eQdiPAR^8Th|peDGrbyNklHS znFo_uP#+T4R2Q@{Q;^LAGY^6b69}DDCGo1>SGlCY?<-Is>I6Vw!J4J*wP^T*!Poem`jk`xES*(zPOURlv}+cgI;x&+X0RhT7jWS{oI zqbEr08_Ig``i7eWbTu8^hq*+f$!HVKN0EE?3AIokXe$;RP7ni94``3tOdQ0Bv>_us?W-Y?GPFDWK8MN!;#^u|Uk z5>Ii(YLb{OZ$xcT3b{Qs^WE#tN;NXRvMt-MAVR&_+I=V`FS`2F_yy@I4gn@pZgR(! zA`!omjnwZF9PgX|CCD>)=oZax0=8J|=cBGW^g6T762DKj!Z08sjC8$Z{#`p)_Q;8M zrstW2UMCn4wS_}^kBnJ4(~xJB`{j*u6RkaHT95XjJ+!aPQTuR_ren+o$*r|0(kxyk z4gIAY8p}GfyDG(Pgel|)O{ z$jO~03nQ7z^N>?RG=olg4C{NBUezBvL#tMDfslZ06C@H_w!m}R46C>?)5Y6gT=0X(Xx`pp zPw6^-)Xj*N<~tSLk%lszTa=XezyTd%Eg=^WH5gy%nQ#d4UPG~P(?c>lq|#U9}_VYR32>JUeXz@RNPt3C-xDNT;v z2B@2s?{HNcwLV7<&Ek-u7~9R#g*O)9ybuCorj3xluYU|;*>rBmZ+UJ8nNX-5tqyhN*Rl(t%)13GX`8(jhqChY zqCB%R3ely;66zTz*O2;a8FN8<~1R;9|5#(`mVSg)NHf z&kRR56nq4W*X^E-h^OCUT7Dy&?z-W5B55(22Qe(8xm*tI;zm`0_l}BF z3XgC~J!cHk`8GG3ERLQE`J#!d;?)ItdNKtYf;^N7lmpk*&<}dQjVh}j0<41&=I#BD z`6n5j?`WB#kDOXFt*G{mffnz&w2;yLS6YL32 zdNzZ>B_rcDauP-z1pE-&uHVG-^xZYmF zJ8`|jUfrJ8uE4aLFKR;GUJ56?o^HfhWKeu3I=rq2;)N$V9-b#~aacFhf#Jf=xg$db zSO)2q_vz60;CVo|=13nA3IxN-2@_bm)B1f+ONs(I==z~KY%Jw&H!R;#b$w`KZ&8=K z$3rAY>}v&C4c$VtNw%Ng>oJ{)N>aS;A%$4$=_?9Z8Mq%LYGpwBeo7jsQ9zxP8Imx& zst<366&FC7cXyAE4_m{Lko4Rl?%7R*OYRVy!(1She0`9eASs_?4$u}6nvi^L;vGxv zk;@s)i`MbE^oS+E3~1s#VpeTo!jJ<}k+!2@gGVZY)I@OoJv3|@I%%Xz&&*C3jQmo@t={pxA&`HS^<_~FB| z`ehwdI>r4?j^$!6l=T|_@S-m!gPmsDD^+(}HdwfkUdW}m7iKptl5$TVa5^2Lh9{q* z1Nk$m==LAi2DID^5k#p2=$CqLkx(vqpMqgdCGr`rS zpp#G<><-2P<|u7MO__3++v-*g3K_5BU6Bq!bzjK8HmEh`;LFky_};b}(n5@=L(3Y~ zGZFT@v!z5kF2!)pYGnxzwQE4!-`;`nFOGLy8dE?LA7*e^y_`?KDG2t+XlTPfl%BKsyI+kAppEi#z0og^EpEHykdAckKR`6N1h=3C-&O~2Q0O7M0E@}JDW zLqKLq3HrrwB`&#w)~d~;)D91#8+sBeD;L~4^F)Ck9UW@U-MW*eImC}5fUpXlnaNi$y3;d(g=2(1>P|Ysr=vr(F)nsl;oG-*bJ0 z#IV)t`+u4li|z{uask%c;-UXCsjxS6a4@wp{xvbKQ(m#!W`_5qne?W;)(}h{Oa?!q zv>33e326;?MD;8XSpM`W4p?;Q0`=t%mTam9$vM^{Jvk+V!QO~91A`owo4dNEkw{Lq zdqB$p!HuiO-k0oqM7uQ{l=sd}f)G}&oDKc$t$-_TrT5G3H)=%za~XNA>6#H-$G{in_3=a@=)fx=)yM|=y^>(MdSWfS9YUxmGS{gg zwQW~3WA1e>(#|7zi3Ft*{Gjymp<*t_Z7=o8IbylKS%O?Z`SD6+ zt@qQJu>n_c?tQ(tn`vFOt`k#$ssH=agg z%_xP3v=Y;7_cT$$^ehYUpxuMDAgIt9#2(-2gYJ_%3x_=#!l~y)BG($bc(5}}qh4ox z73*XrfxSsmTAiLrC}^J{%(;9oo+YtxNQDM)KdgSL3^b7(4W#B08CWJxBn@#{b}q_S zcsEc60lrWcRC9brv^0;{0S;Q)zD?mmvGPTBsH_#lC~b?ZiwrUBu~1!x&6c^6)FhoA z1b@aXiLU%E+HogSs)M7m74$_5ffr^|V{C|&(>T3LoeJyxJ0evcdgW`5RGa5wr8MB& zO#e50Kim&IO&8A?kf*xZu^)I_g3R4Xj2q}tiOQ1{#PH8pgIe5Kj2+sE5wT?Qh7gBS z#j0yvJBYx)2pAM-$o2^)W# zadMzymSsKDJT_b>63v9l&`U2WYi){i-(_NOX)D(JjR9Cb0gIrN-9Gz_#YeA*63s>x zwu(%Lte~U?-@KJNS;UHcU5P&{vFG{OVS~OlaqxS7Q7Pz}r~*^|nL-YsDHYkmT>@Kk zNo`{77VTYhcG~uYy6M$yzHzb{DG>+XX)Kj!sd&?yfo||7$D~z_kE-xg@?$At8FU#Hd|2G#M{2b%0lh9u${lm`Z$BuCCQzVR>- zWj<^~;={OKeOIRk7jh{3qBd4wz46WBvLO{>)Au6%FA*cNJ+Bq4)8j~7U?2)GxhpBc zFx=>)^*L$x%AG=QbLL4}KpBST$owB{Y|7pBEX1ExDfdps$P%{ACTdPa+f9IJiv4Wm zyN^l(9Tu%o!{d;{pT{|GMG2yVmE*hT@(wzSynUWvo)J6D7p-Xt{u{_lTSg-b+qSY!aK?Ct;v35XO(-_hQ|+ET^Rg2vRr z(DIl0i}b>g5m9*13q&ys?BxNAzIVvR zl}`?J3e%7vs`{&z??k+i6kV2d?rP7b6cjr2|r$(gN)f z#QktM@@DFnCRU1)4MNAPFdCS%fl;6PkvHGgbK+xDm{PBbm*%A!L7^-skL|+S$BRfB zPwg3DZ0}e8r_XhMLVH3NE$W#Jq7u2sfjP~lG$7w+BolNrHOg%c8i(B8&Qq|1c0$xq zK}a)dT-`gv$8*zH?6UGMnA>|FT5yrxS~@+OKUz3Rn?)`yO_!!SC{v-@hT2y$>EEHb zE$+U02;W2g)3*pTw;kR9yyFDGw?O}|Zy{r6ZDarc?}NnfnghJofoJ}&V7czwdD!|1 z?ClA>mHPggk-3DR>X1ncp!#o~u$o{ey0re5S-mrNldtCp{%!+=SwlSXbJ;Oi&RL-E zD`}*Ugk|EW9GPjqzSCo^D{b{8X&pDBNJYw`7gB`mcrMKKo!HwiOCP&^-D_T@k@+BL z7=ey%pXH<${*g|t*oBXLEK9zwd63cM)cx#y&?-tq)%x&}x3zg=TOE3m+I9TY_;^>1 zdTMXSo6Pv5J-&v|`z!b=J1nLjoojESDYT0kP87Lc3{uOjbooJ+o_5Ffr7_vX_ zR320|(K71Kf%Cf(o%IRqB6QdYCWHR{;Iq!BI_L|mNPLv{9Dcs4O`|!! zblaRzlJSBJAwaFdq5=>ENM5)I2601#q1nxZjLg_04VTs!LOpJhTdAH5MgCYYF)sWA6}hYZq~8-t%(5#`MT$ydIk~2Nldo<> zq%vn`2$~(`vV}{qNfbVB5c;b)NP!Fbgnw}``^58oZz&VV*oS1BQ1V`lma4PI2c*-K z6tR{WWT!t8f(t1Yy}(LCcCaf&9xLhUjQJ}6x++DRo0&nUq@hfu!ny}V8l!Sg4rK*p z23M3(82)P2j5x5kilEyf$sm2@-NCfLewiO-6o!uPwAm_sEsEwzZs|$&Vi1*#pjWdq zDOCb4$#PPUl~}J)P*}hVV4YbcW8EiRt4 zMX`^}Nzg8?T-UOX~T(EXVu6%X= zu40kBZQ0&X5u9DEvoK}bn4N$0^}9x8<(|!+)_q(;ZuQuFsAU7!VGI9$G5iW=!;QDw zr7%}xL}cr&Vx;A^Yg*5=%9+K}#Fy`B%eMk1>OplLoCu?-jGw|0V+B2lG90vkj53p% zFyj+MAweSGRDrpFWWfXyef|-?_%(`8lzOub&wbX!n~7<;_}=YANj~y=pzL3q!vE2cS+Dor@9#5YYqWrisy zu4@IvpL`^PR{)iZ&u>)Ab>&XcF-4nMj#tKWIVeYOKa;W!X>!$!?C)jBSA1)FdNxGrbh|j0jK^i2Nu$ZC@P^(x^Q^aUit>>xWmIgCt1T(hDYsbBH z23&ByISVmA@_x6+uGG%RY~N_SOEQWorn>_8untswS70~?sH}?oCl5cs@~-*wom5z} zc@~}z`%4%nt8#{$yqwegqkc=iIKqg41p;yn1_FWu)cEBuvv+W{_?NS6HdEbxg&Ec7 zyR*^TdtvETxQa8UxZ)MtjJ%6^r`6#lZhk^STS-K6nvUxQ8ml}WY`>j%Q=@rsk%AW< zAmcEsc!JOOSK?_P(u_2ENt&au-mEDf7@<^=jZ4Fq7J{YSt&MnP<0wBtjQr?3PDHK! z;{CY)*~`OAIiV7Q3K(`U%-Ph1PCa>dZ|&6!p~6%w40kce=y`-_L|*PP`oZio(Sv^d z5|nBSzF}KPe3Qr8XLeTBbW;iLpq``=);wGlm3&5u{AVvIBq;;c>L?aLNp~E6cO%!j zAytuZKWBDHWEvvdc^z13AkxTh87LO2uU#Cu>V&cy&*wa^iNxw%WUw|MK*so<-E`|F z{oKinY_|)=X7fK^` z4PtE&8{N8K&_t^4Kc98p3%`nL@7PWSE=hBuenR!i7MCM3#OL0-Ykn>IJmk$?T}J7* z@0L=vYl8YOi| zpqztIt`9M!bVHYR;+ff~K7*y!i(vh3=x#5W&qn)B>c#xpb;d)^_=GeBi~?@0c^L0m zy`@celOS@B=xs~S36HuD&a1s+LO(w5HsNeMS&hVd0k7-1{|sn!u|>hnC~liH+- z`*Wi-{q&7*j3UO{)BUpqCKXvuG!6cmNF-P1h{!OFCkY6mtrLnh4RzmJnQpL}vF4?@ zn7}18tTRH*18Hw;3~9u=6GLfc$4IRV8vCmf8v6r)UI6q2pf@Enx2tk$LOSpLz%9xx zSG6oSn)gNgSmKO1Y)Lx&_LS8Ddn$Y%#;OH?)AVf}7zefnOrrWE5i;TenQ0g^UKY1b zURF2hdLlVE37c$#g@IU3ZF-!yVfUQ@JM4fRV{5=pO~3t|0!7oFZc<`=uEmf_YIPha zk!PwD0fupLa~)b1`nAbvemo8>dF_F(3q2L#=uk8Vc%eQ2r{ zhREo3Pghkc4T@>5lk*;Z1~abGwcR0>V*QAc z@Cro(Wc7ny;$ z`&>7vpj)bOIPaw-j04#`4LU3-3Rx^h(r}boRfo*47^nFukWjfFMZbN~W?|enT-r96 zCP^YZP8E6$xuQ27UKow}97l|Sa)~=LmWEtimZ%nT!SDHU&0{0Z9M9qKC5H7%zcJdK zEHu9BJM`8SV7NK&V$K;w4!d~oF)tgz{2gYi5)+yglM;+#Q9dCT*Osm&qNQBHVGpERwC%bYz-?O9)jK+85W?3)+uy zZEta$s7HJqeuv#qqbkglC4FxutXy+z)mYvryTPaN&{Gn`3P9;!sI$SUvKO{+lNPwKZ~*` zKi4#1bXsk%J;(byhvWgDhf4;Ct6gKny^#sR(c~}K-rzs%hDf?!gV5g}jO;__b@~!h zn6~J$4QX@XebZ)!OLluz)3u}N$e=72Uk7=I~pwsXAQB!`)Zn?EmO3P zP+C~5$2<#0+{$=}YUnC9?mW#}NNkwZQuMa(8(7?DC-xrKFK>)YS$z-VPi;jAr)?Hr z2${UAs<3>`F<5Io=Aa(IUI4MvHk zUAm3UAmN>jh+Y?sKB)K2ws;-|&D`TUYF>rPD@|s{+jGA5D^iRs-mtQ-Eh6MzMPbvi zzgt4crH@s_N{l;Bm}(x1LeI=Q26t%iBLJpW=XKR+@Vlg5f|S#!nF~<$!KQO#*AdQs zkMe2KmtD62unUY5!tsGT?_Z}N5q|IlBGGDO^SS<{WS&6epX89u2f{9c4~$&|?+dFC z!UI;ay}C|*id+=q_v?^cAp5=II2s8EdBVU4@&soH1rb{S@q+!l@bgx`_=5jrLoVan zB%jR(`J3I1JpUa};AInlsC^;HeiGmzepUWmDTL<(IlMFEUwM*Sqe_7*kVZo(ke(qH z#cToS_tX5osX{ieZQ<`WGLl=g`8V5Fmf5FJ$dF|HzFtV+O; zIREDo6CrCVrp~G)jO=(p5GjkiKRJe_nMhX}CSp}V5D1C<`-oP+jh1HN=Zwvo!=~w( zhI@iotq_gCw?GZ};gd69c!SM`Vw}p7p#AjgK`a$lu#~A%g=&%d)h49S&4oDL8A?-t zTB~qCTIzBDp0~-Yc;w$ri7ZVjE9h4gZ~sr>B)?sfPyMgr-wguPU-$|7Sqxd3Y1Gde z{w@wf-`Ai3S`0XA4k!#*m@3o`>Hk*Lq+u#k-TbFmk_6nfMdEM89}m%}L3n!(S>!5q zNq>IV(KL~#l5Fre0?}#2Mx}BoklSUgY%SBi>TPA|O|)o7!R6&O|@70 zzRa!@0nJj(hfUPyX~qc!2VyPYC0mb=E;o(r^(jeH>>haz_5*Bca1P%*!^n}44~08( zMz*+26bRWK-05m4O~qs3mFXm$r7xVJd9wB@r`Epd zG>03B0z#EM8XAcyI60vfG@P0-QgywS5>m9)a}1p7kfBC2M137_BsQv@aMMQsKNrW)Xp9G4MVsKXa3rVl z5~pY~AL66!?58Bl{7wrKW^YL}iFL0PRH>6~H;RK>UfEO(;Wj7f_Um?6SdYf!!=n@X zt%h~-5z2}H&scp;>FvYdTxWGf+=2V*MKqp{Wz&RC*s(SXoaM$;)}bAHfNPKSvxUMY zwtFa4GxosUXHBcw_s?xhenuiIH*tcwEo#bQ_Ieik<($dk+FRew602B?#f(Z2SI);X zS|yt}PMF$(-18~gzigfM9eSb$U(amX-&{tCM4Gh{srw~F*lXRU&dTJ!yfXsc~h zMm1w$o99Y|Uf4b+mK#U5Cd>~_>DHF^QqEvjQ)am3c8jaNm=1(1IIj3tj`D2M zvIS+Ck+o_{yxwbViJj|JdSn_FPcn+QA9s|u%R)u>Nd~1YhUNG*TEMHaDlf_xGjD)} z%l}@Od){p;qXt|N2P{A%|IftL*-%gZFCMLosWGb_T4?kIQPjrCUmP5u2e_zw{LvhP;RlIU* zb5VlvQHZFucD$fAWB)-;BjrBCPtlO6Er;6;yf``;0V^E6r~yW^{uZIU&m=7bGo%UU z%!{kiqKU3HnPX5LJGM$po_a4%R@(NS@=SUCno?AW@(vvJ16R|>0^tfN7X>E!qfbO!vQfQ*1}{`t_BYuwH<7~zP+&bd>OTG zBAI9;C((A!dJc~wKrBH?vR?UcZL))JStVja{UOsuy`+64{O)X=Wm^F<7hx35ZFH!2 zB!aBJqf%6eZGNC88yHl+n3eflk;*e_?o82EfRLt27b=ON_9hn(-3>*EEAZ1y0)iXW zP?)^#s)zOzR)`!RAE2ecGK+6TBoSE*v4m-M^eS$E4v9ko{mhq@K+G&PO`o)2c<`%^ z8+sE(7K;z>XP!DRbVRp>Xu1+~?okVSmFZ}PRa=$mEvzI7sBENogLYEc@>1)4J%5I> zTISeWakHv#md$e^@fZ8~F_1KNc)SY;jIeA~p23eN!L51D)K=o>)Z`S|V=6kZ4A%Lv zCP9HhwN%=J8m;O!R#v*`r)t|CTbi30RLEBz976PEjP)Qas33-{eHpte>|x46Nf&+? zadVpi1ulsnIku8RkBP1$sfz4wc^FDmmJ=s3Y)hu7*0R6E>0f+uP2^jOn4C+lG;!KR zq^OEX3mS_){p@B?m}VBhXkVXmWyuQ{FKaZKZQjxkOhb2FQCp7H5=OzLWljW_zLr>a zvqH-X0m=(q30|@FE$)S&ndyhoiz2LPLx69h76{TA=xm5pfqZ2|KsJ$pl(2uaBH=Qb zi;|!erle&6Yt$2F64n;0WjU_=4j2}6zWR47kl+fr~ z+x-%LKrZM1WiJ5KZa|!-_&0zOIQ#I%KjfL7cN0G8lwA|P$zzk_f)-axwU~z=dUSgx zd8CvH+Xj7RkF%bUmto-;Z_(vCmj*c%kqr`Il22a>$wgCe03+tHA#_>5B=Z&9g|T>I zLQFT+1e>#E-VX}W15vV{L6@QqwwpepUB!QoJ3JtCv&Jzk*u{auqS+<4r<0aACKjIdCV< zWXC?A{#WGAaont+(t_dZRz#meug@@dx;VHtUkgr#C}zc9k7UpxGz~<<~BI|)94IwRkpD#WDg(vb1bS* zZ22t3Glv6}2>reqRg~2Aa*%vhs|(FU>xo3iRHZ@?x|a%259Ti4?`Tx9y|9DmZ1phgg{SD@3`#1QHneCq({+zA-=8(ny zZw`N(yZwp(yS@7x4H#{?fPntX`u!9BchlvsaCn}-!2fN?{E7biVDeXV8{c2Q;6Dv5 YGU6Wq-Tmhj6%i;F5U0~4@bl>Z0paTWPyhe` literal 0 HcmV?d00001 diff --git a/test_unstructured/partition/test_docx.py b/test_unstructured/partition/test_docx.py index e2edd66fd..efe90b6fd 100644 --- a/test_unstructured/partition/test_docx.py +++ b/test_unstructured/partition/test_docx.py @@ -171,6 +171,21 @@ def test_partition_docx_includes_page_breaks(filename="example-docs/handbook-1p. assert element.metadata.filename == "handbook-1p.docx" +def test_partition_docx_detects_lists(filename="example-docs/example-list-items-multiple.docx"): + elements = partition_docx(filename=filename) + list_elements = [] + narrative_elements = [] + for element in elements: + if isinstance(element, ListItem): + list_elements.append(element) + else: + narrative_elements.append(element) + assert elements[-1] == ListItem( + "This is simply dummy text of the printing and typesetting industry.", + ) + assert len(list_elements) == 10 + + def test_partition_docx_from_filename_exclude_metadata(filename="example-docs/handbook-1p.docx"): elements = partition_docx(filename=filename, include_metadata=False) assert elements[0].metadata.filetype is None diff --git a/unstructured/__version__.py b/unstructured/__version__.py index b6b9dad42..22c7b2404 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.8.1-dev0" # pragma: no cover +__version__ = "0.8.1-dev1" # pragma: no cover diff --git a/unstructured/partition/docx.py b/unstructured/partition/docx.py index 6553b6604..500dd27aa 100644 --- a/unstructured/partition/docx.py +++ b/unstructured/partition/docx.py @@ -148,8 +148,8 @@ def partition_docx( document_contains_pagebreaks = _element_contains_pagebreak(document._element) page_number = 1 if document_contains_pagebreaks else None - section = 0 + is_list = False for element_item in document.element.body: if element_item.tag.endswith("tbl"): table = document.tables[table_index] @@ -165,14 +165,17 @@ def partition_docx( elements.append(element) table_index += 1 elif element_item.tag.endswith("p"): + if "" in element_item.xml: + is_list = True paragraph = docx.text.paragraph.Paragraph(element_item, document) - para_element: Optional[Text] = _paragraph_to_element(paragraph) + para_element: Optional[Text] = _paragraph_to_element(paragraph, is_list) if para_element is not None: para_element.metadata = ElementMetadata( filename=metadata_filename, page_number=page_number, ) elements.append(para_element) + is_list = False elif element_item.tag.endswith("sectPr"): if len(headers_and_footers) > section: footers = headers_and_footers[section][1] @@ -191,7 +194,10 @@ def partition_docx( return elements -def _paragraph_to_element(paragraph: docx.text.paragraph.Paragraph) -> Optional[Text]: +def _paragraph_to_element( + paragraph: docx.text.paragraph.Paragraph, + is_list=False, +) -> Optional[Text]: """Converts a docx Paragraph object into the appropriate unstructured document element. If the paragraph style is "Normal" or unknown, we try to predict the element type from the raw text.""" @@ -205,7 +211,9 @@ def _paragraph_to_element(paragraph: docx.text.paragraph.Paragraph) -> Optional[ # NOTE(robinson) - The "Normal" style name will return None since it's in the mapping. # Unknown style names will also return None - if element_class is None: + if is_list: + return _text_to_element(text, is_list) + elif element_class is None: return _text_to_element(text) else: return element_class(text) @@ -227,9 +235,9 @@ def _element_contains_pagebreak(element) -> bool: return False -def _text_to_element(text: str) -> Optional[Text]: +def _text_to_element(text: str, is_list=False) -> Optional[Text]: """Converts raw text into an unstructured Text element.""" - if is_bulleted_text(text): + if is_bulleted_text(text) or is_list: clean_text = clean_bullets(text).strip() return ListItem(text=clean_bullets(text)) if clean_text else None