From d06bcc41bb27ae00a76530b34c26d7f1dd5c9358 Mon Sep 17 00:00:00 2001 From: Steve Canny Date: Thu, 9 Nov 2023 12:34:30 -0800 Subject: [PATCH] fix(docx): improve page-break detection (#2036) Page breaks are reliably indicated by `w:lastRenderedPageBreak` elements present in the document XML. Page breaks are NOT reliably indicated by "hard" page-breaks inserted by the author and when present are redundant to a `w:lastRenderedPageBreak` element so cause over-counting if used. Use rendered page-breaks only. --- CHANGELOG.md | 1 + .../handbook-1p-no-rendered-page-breaks.docx | Bin 0 -> 10182 bytes example-docs/handbook-1p.docx | Bin 10308 -> 10203 bytes test_unstructured/partition/docx/test_docx.py | 44 ++++++++++++------ .../box/handbook-1p.docx.json | 36 +++++--------- .../dropbox/handbook-1p.docx.json | 36 +++++--------- unstructured/partition/docx.py | 40 +++++++--------- 7 files changed, 74 insertions(+), 83 deletions(-) create mode 100644 example-docs/handbook-1p-no-rendered-page-breaks.docx diff --git a/CHANGELOG.md b/CHANGELOG.md index da504fa79..f190dcee2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,7 @@ * **Fix ingest partition parameters not being passed to the api.** When using the --partition-by-api flag via unstructured-ingest, none of the partition arguments are forwarded, meaning that these options are disregarded. With this change, we now pass through all of the relevant partition arguments to the api. This allows a user to specify all of the same partition arguments they would locally and have them respected when specifying --partition-by-api. * **Support tables in section-less DOCX.** Generalize solution for MS Chat Transcripts exported as DOCX by including tables in the partitioned output when present. +* **Improve DOCX page-break detection.** DOCX page breaks are reliably indicated by `w:lastRenderedPageBreak` elements present in the document XML. Page breaks are NOT reliably indicated by "hard" page-breaks inserted by the author and when present are redundant to a `w:lastRenderedPageBreak` element so cause over-counting if used. Use rendered page-breaks only. ## 0.10.29 diff --git a/example-docs/handbook-1p-no-rendered-page-breaks.docx b/example-docs/handbook-1p-no-rendered-page-breaks.docx new file mode 100644 index 0000000000000000000000000000000000000000..fec961ac4345e84b3461998c0bc59b4c46673caa GIT binary patch literal 10182 zcmZ`T zGvE2lA8RvbpS}02v({0TgMq~X01y!Y@dBfoQhl=;`%nPD7(4)g2><}x>>SP5t;~(h z%pEz|+--n)@!-f54$O}knMb}}8GxPo;_;Za`9g#u{0TBLBNLL!wb(xR&#$}{_-tZk zLY3JnPof^}`Yw8wgzuuM3JG+|HDuS!C7|;QM;nLKPHs-Te@4L0`VEO#^{P6WCA=&Z zUY?VQHeMYWqKnDTwI8K1wtCx1GoQyRf4ypu1H0v8LMHn~{4Jik98Ba<zR3r#^`*|(OynC<&?wOO<_n1T%(R$>LY!8k#4)MzBb0L}WTu%Rme;wi?G)xDj)YA-=_nk)+#ipxZ~dBiOj}k)vf8c@ z`V4JhN?#P)e>p~M){9PB*#F~r)zlU2xOo=o=pw-De@`S2qw7{rch1O2i^ zs``1rl<=&l$M3>Ru6@{Zklce;E0`jGkTAm(o6w|ktUj1!1-K= zs0E>Bc&TAV06t5ZDa+N4u()6uM#NyZ0jXVPh@CB4b1eCS+`1eDURSC5+wLVOreK6!}m zJ|oJS&9L@$i1WR; z<`_D!GqeM-j`O&fi}RrRGnyc!VVk{;vcoAri(8!gy7X6RjOsJXIHx-e-54vc#x$py zRlKE5b}R^`assywab^bvZ_qev_f%RpItC#{x@td~dwSF|H(cD0qJXYWD-B4J zRySOMg_>g3+i$g^qGj>!@>bWXqW#O&sf+IjVEzU?2CJ~fKC;h9tMd5iRjuV3mwO|s z(7_w@czZiDbUu{6%MhqQ@&(N0B^NsMdGvX-i;Tp%)vy&wntduk!Kk47eS55qt(%CM zkv9D|-;`WU5Jym-hggr%R>2o`D57!=S6Z00Ol^W%8T18bT>M8LcMcwd>YrAG47RwB z6{I4B-~a%uf6oeA7aJ3E$G5hYzo$j4f?_uZW{}bsQmP{`4K4-F^8iOh zI-%Klne79IlnzyzxQ`ke;VYMY>s#j+a)X(hR&d*KJ;NYaJ??=(Hr12spVjVEelF^k z>i(OI{El+*3*x&S=qk($N7i3)8c^S^ZGk?Jl;><{furr8Y<(^PG#Nz#iH=&{TP43;5H1or2Pszfs z1edq`wPE-K!b|Yl=f3u*Xn%KVWKDN295ev13+dEn|K1lTb7yDBt~mW`h>zHUIZ*pE zjE*q3)30)5cnMrQw5tv((O|cSj7E6>x_ky&MKSop+ zJ9!M4IClr-e>R<+!(B^37jj&hP{-!r_wB zTy&lA2chP#>>GVaUD1MaJ3F;$wW!Iz`nZ~-XPxiYir`1g!3MYzf-OIejya-G;y~jr zvWz%I3*jh41M%fvs67G`zJ(9BLn%;~JqRx<<>-+1dwXLXj1VW;yh0d&G1}6i$il5% zYU?sSiZgKTMY_ZB8BvQn+cX~To%%4J)X5H}@QI_iOL3~hG{nhjD|!z6;NU&9!9@S! z_7$kzcUF8+JS`qn)9mnO%dxJ&EZCC1-1LolQv5e(_& z$ba^-g`KUly0Hn+{CCmB4gcz8$qrv1P^(=ml?cPTML0y>efNEYewormGV6Vwvm=MM zWO)>jUdB;6Y+C;rk+_FZmR&|cKVb&sRMMtF?brLvWm>_GQKO5;k3okJBxMGqC(*57 zOHk0s{KTf8GCA8328B*^P}Vz0OtF~GeET#enQHiKYg!&S+MUtS)O%k~qdp+v0Ony~Gfi4Asa77!eJiuF*5| zF>y{J29CY}voM?tB12D+7!#GTwkdNDOJntE=pEjde+l)|e2BV`)&^4^YN=Vh_E}xK zdHKa0LA&nKa5@*>8Sya(u_o@z8hQK+QU0VACU(`}VV=_d1M}t^*uS?eO>*}H1_l7Q zg7i7Ye_Gx7J<$Ajqt7JT*e!Bk_6v8m!wlW>Qs&}FqG^y%+fGg^SYJX#_F$s)*8>rT zUfu7%6ctNS9iSN^K2AIG*~%CTX=?IT8ZZh9>u7eX?unwGbeK~FW!uGOk6|r!0nh981K!DKT{M} zhu4cp%zY^g=SKNvG3` z^mRr3CFOAyCR`11GU{i26zYs3kt!4{dfD4h+sMThk$7o~O<6d`+KAHi3QWSaHR1&m zY!#gLwHmsq`4=qOLk$Oqq&f9Y(7{^d*!FUMMIQiCEM5{$(jhSx3?D*Gw8QszB9?hh zHyJW&t?Mu2YUmG)4ocAG`ojY_1bfa;^XqP>%i{xVf4qo^T=QC23^v^0v$2kN`DecQM0HVgf_QqXEeU$3x4; zRbk$X_1PM`pc=BM=|g1EhqRdk61;5#3hi&y-e$#)sY}fKf%(OC7*0WFDOTG87D6>Z zN2B+#KG>@=lM+NSa1@5Q6*pP|%Y2_ub zKH;9>U@I8?bDLs)!qQ~_V!Yp0j=aVDtjg_EmD{)QHUZNLggyh5Ur#LrLPKw+K3k0N zWxQX{egwiOVO-M!gg!+S(=D7nJMA60k#8=vHR;cn9_@1)ZFg4pVg;_Arxj^_iC-@z zB=^k1R0?N`B{l;&7zt6|yCDaJpqtx(M2ZDNvMj=n3YS8L-gQUB2ZnQ~EcPvI#i5v7 z%PBrW|J^UB>*l_*kUoinj5-3yyQzzlvz?8$f;{_MXLB26M>~6`-^ZM^Po3|&IZ!`B zej$yV?NXgh!&M6sxwh+ol_d^wJx02ns(nNAEn``G#{Byney}m)1%`{{uS}GWk`~ms z#?_xQ$VUofz{6CqkEPHnv+%m`DvQD3oAK4U-A1~l;SP%tD*3D6Dl$Fi$nu!d?-`KHQt+}pJsA==j4odM6 zZP{*Oduck>F%|T}^3l+kFd+Vfl0eNBnpIrP-LwvNpoj(UbkWc3W%+5WmLq?xT5_w8W639QrO@R?k{5GbZ z;cyOPK`X%=zh&QZ)bl3Qm7`SB_#kDxLR8)_XpS|}1@bN55DT6_@_#06}+d6Pd8O0IlceH$d?o=W&|WVv`R#IpM_Sx6E_K_j4R z+v?bX%YFxAkCXwA7@XjivfZvn$*Cbs^9wan1ej|+p3IxqT}nWr*GI=GOV5qse3)ums+F&hh8fk~ zs<(|`heELv7?k8Q(QTK{*gd)^{k>}2Ah^VF$ZFp}R{e{An%SBD?ksjwJICKt!sZi% zN{FETrVSt6Cj-4oG7oP$l?U_GQyL6P43-yd!Z{U)caM5pKf$lyO;`jQjO@V%JXx8 z*y{@h1xhib71N7C^RLu2xfmpbyf&T)>A0*+U6R@bGgcqs5;VLf2oZPU}&j`7KL^HB7{y+_l!p|OSAdT+Z4hGV^SJHBvtH!vvEZ&a5A8eRUIeghBz zR-u+!d0lujbjScepysdTll)6Uf#kr&#@yEV7n9OZM*_g$tjU`Hdbzlx0H9#cA*uSM z!T=8R-Z&ETW8GgeV&272o7u+DKJQlzW|6gVX{3ddROs>Yhr>ZRC>BHWrAj%SaZ9S; z?_4sCoTN{d&*Wvxt_yRF>o&%(m!OIsKrn;8J3A#;k*3poOS#@HRj^=-N#+?7;_fxy zNSN-;XN75F_0@{O6v9Crmane2mkZuT$*@Pb0}+)$F6<}h#D1JI6*H3tU#N>yxW04Z z+^9-Ui>OBEKGRs4+z6o02_)jg%HpWeY*_wgZDzJ%Os(O-PW_V-D?LW%20rC1_Xs7* z^6>}jwEm5Y!VTeYeSg9l&*+049@0x{-C~xX{e&HK=!x` zPkbn?tcRZUz(Grwze#~N0u)t9cCWs6(h>I-O*R$VK&B%LCew|r*h=y^OOYc*9*eU( zzLegC7N@HENJ3L4#wW`q!GUKE_nco~hXPZ*Tn;nb5;MXoTBv(DNbT*cOrSmTl- zj&es);*v5=C393|&f6^Nrwo)6_k8JEb9q0I)EUOf$Eq&7k&Bg8Lkp50z0OR*=JR-s zsNETDR+Hm)Y@;T#x1BJy2L91}#2jA$ym54d52(#UzPRa&kd{j*5ptu3|- zKJi2lb5Ol+Vd+?wx8Uv602cytb-6mNQ&kSvf#awdkf%0x5qqujr&~Ee1mon%nO=;V zO=MlBQ*x(6DRY?)%Jl2&NOqf=*&Sd8rkAYmmbi8Ob#Bw;Zb#Wa1`I!p3?AQ|tg$k$ z66Jv@%7{V{M+uV^aYIydLQCZ{^ln_^cMi)nl5u;LGP!bRV|A1cVSpPI!jB5?#MPL` zg>bhU`3o4znMmsnCogs_>|Ne_-vqq-3?jtDXtF~mzSSSjWRK%DgllGAL;)8&&OMKz zo6i)Q%aL>zq-G{;*Xgv+xaC0`x%?=$D*a8aFng{m8og>n40KGYEj0BNEGKSlJ)vbB{@j8 z`qJ6wQ>Kwpd7A_x@$WeAUt;0Y$NZ$DT!wL+YPC)08GL+@dquWy6Y)))jq4sxfJo-r zCIzuf%}V+?)H%1bVt#1lG$?y3<}34J3;8oYBTe4d*}~IZu;wdtBzRy3907nrLZz=* zabVQD2o$<#9U}tMbLgSOpYbu<)3mw^{hrXQl^*pX8#s&M7J+Q6K$F~(f!B@I4Iu@5 z*oW9gSP8i0FLEhOu27hX=a=i@S2bPn@Ys#9hD_dqNZodC80WlPdp!OW>7v=bI9NWHF za0Tk#6!R@q_a3Rd^*%2&Qq-SguN_sKAC9-kGO&M8nTBH!)dEl#_1!B)WZRMGb&)6| zAWRx{s~cuP1ZA86%NX<7w8gB~aLS0*vW&~3QbTON)kahW z<*S)P4e(c@2iZQ{bzW$4x4+qQxGdkRGKf%WA$l!6{1t2Z4WHl(9cJ~skFiHZNBxPa zD>w4z4A}ZeEVDP>828_F3s#6rYV7p3Z}$S*MeA3CLD1eGdV~Tyf6S>lzL)xFs5m&% zdwpE1R36;lJ!Q>BvZRNhv~q?PG^Y=PkkSnQtx?aXSg0+fhPmlpN4z)o#p{ZbGUGiAey8t-Uoi#9YYcDUIBg7tQT#0$C{ZtCyke7;gtc#905`;8A= zg6SylaXHoS32;Mk9n`jXEUu8F@Smxtc~YG^?cr2UJl}J`)%{q+*j52HBSK{Va+V?;!$q)8`zaH@n*4~rFp$$M9#Y^C zbf}Dv@bxOlSl3@U-Yq>hNl*}WpkwURNYQK zbAWt z-}2%4&S?-3D7g!L)7V>1d6I?sUiFsR&|v#s;k@?_^e_zmH{vp1yk%?iHwhzg5KU4DK6PaLc+dJ`D&SC53h?pM+ z-+T}#avyK>ep$5{X+d+49MmWBqU#g|+ar$Yu9sA--`@B78+OPLoWe6DGrmD`#5gi} zB$xdK6sxq`61y!#&kpAvA;JYq-HS5FOmS@ww%Gb(`vuB{=0xRIwlE z45fOEBSWqTdtlmi>*>eUO2|G`S_g3i9of!#%h4Jc!rvh8W)*4!Z;4-N9dy1Q96;1L za060PV@vM~Adj|hFcPn`#@f~fN2Cc^e%Pnbb8OEE<00xI$V9iQqPD`k zz=lX{11L*5ZX1fS%ye=dpc4~aE_Z*4i&RC5n2o|}*^|cak-7Y!<+6cLY8|o3Bd01W zBBxg0ym?mR-nOE=ro64{ECo|Q&DT73U!frF-LD?aR28_?Qwn!x8=@_C3j#2Yu;Fb-o3Vt+8DSk)WO2E%B?w;eHl>tZ2 z07M(ukx3t6`Dq-tY@yw2X79U=F*ibhPak-UiJiqogyOJ|_nPw!C8$3M(?j`n<%bpS z9`!h1y-`riE4GUX&3xQ3!uXxoY9vs=`EcD6AsOpd z-er9BKH$~et!0G2*KX?fqhkdkEz(1d zaJc`jDrYNm8*}#G&)?_PAKG$u>%5rlms*&uM?)U`j+mBJt}JKcc#><`mO9zAHVm>9 z$$ZzBq$=z~)mTS;VR(fYyfWUWBVi(GSZ z2xKJP<+VF^usT#jI+h+B#UZ}2Q>aMluyu7$k?M~+^FhRfkUPD&xb7NnO2=#?CM;&W z4L~h_P3i*B_6N;6fkO<;9_ex{eg1g;T2BM(?Z-}ht!f=mk~^q&$PWbHALV*ZZTRCe z(wY1`RzQ9uA8;qE3-qjLwG(UX3_-Wyg(iK^bYLkZ{;m@B9AZyYWCHeJId-kMorBBV zPr>Jp1Pu$oS>l%UDFBw_YOzq4hp|zGjv|aYM*q@Yvhe9g_kb1B@AxG%Lx!lwwo#W- zS4O;-?E)~ZbL;n2_azm3=U%>-o_Ah>7!`O$kjO^~1Y9)_hJ%VIa>_lQi(MZeUk zjs%^CKhm%YRhfGf-;Wk7f3pl1D%6tliCv_7Q^|sEtfEw_j=#wjWSnE?D+fDaN`rjv zmOmLB7nj*TON6zcOpg^G6u7Y_W2;ZaRlfVYLul5QZugR^JW>C&(T&F=L0T(iBK3&M#&)<|iMk%K0+5-BdK@IH+tYkyu=e5Gi48Tk~F1$xq| zW$tdgR`TcK;I8|mdZ!q(;#0cmYy5#pU>vQr7YQNx=r~mA2`PCFC{C_57z?_)753F2 zb~vKkYI~IW`cO;o_5AlK@MOAzz)NKnX-X`-}2!g(9YVWQA?gag{H@gUFOx+3Cs z>Os9XSAb7D(Cm!@OICv4cK>7|thYBvdO{{45aKKT5)5_4A^s3V(=b$jZ*T6T|4Z7a z(-^dcNLcs&58@lvlBh!J8j`EyKc<6V;ys>^!j^i(3MpURD&l=YmQaq@qY|{cOBkdf zjGU9AXGT=vPpJ}Jv_#V0aWvl|yiIy_d8m!0GOuT4Noh(90={e}QhY&XA{K1(bsluaqpuE0~tad|HOL_N- zcpT|=zUZorw+co!HA|=5qh-K$%ltV48JO*2I<~l|xm)PKKGpAO0@&*4J~8%^Su`7p zY-pr19r^>__E;vl`9x__BDoql0?(jU0x5M2G*4JR^HM75&KE=vgGRmD+wFsl;J2{T zMkketg^<(S5C~0TML7`KdecS|5S0S=@N)%PZdY(|6yGUzJha5=#>{w34%YI z4%Gil2&+T>qAK9g+~oF97C&xGz)~I{6_zEPv9y3se}Co-vjX{f8+n@XFr8MH8cOfuMqz8<^N%AKE*$E)&Afs zkpBJe|KqVe1wXYI{(x6d|9`{bX_Tkp{GTX$Xiua3{w01I;HmWdCx9ZvDE#Al{5K)` zX{@J$?w?o^kUL`kN8bGfCxiK=X8t!p^q1`d06@kFAlFYE!YzL9=0AC=hC1B8rEClWg8U!?CT@F_?714M?%{l8cClrcRG@pP#E e2|+^jUq`I692{i+{<3>90NW6LHbDLB)BgZAwCK11 literal 0 HcmV?d00001 diff --git a/example-docs/handbook-1p.docx b/example-docs/handbook-1p.docx index a0cc4568700a5f858fbbdf4fd5cef319096de952..0b2cb1fa5d401df60954f34c01aa506f92117f32 100644 GIT binary patch delta 3612 zcmZu!c|4R|8-B(ZYql`PIx`GP_OfKlzD;G{BNKzNhAfrFHkOKFh)9T%vXrq5q6L$E z8`)}*t;m*8wtUn3^}c<+?>oQWd4A_S*L~mTx}WF#ab3;k@6BN5XfQnw05CBD>1y4! zhU7t;1rPxAFg%CR(?S0eQbZgXnWjN43`AYJ9ne0^gj)~iMZ+{9M0fh={a7-wsSrX3 z0P_sQ0}LYjYr@)X^y6uH$q=GBQ~v(E9E15)Fc<)Sa1r@f1i+BLxDdw0`3TNzEBj8L z4tNSE(yGjWh(YY9_E+(CJl0zW0DvTjFv~HqFU=1R$>^m)4j{q;=N#^@`T!H~HN?b@ zu&BhY&H?~MwgA8d2mzt~f$nnd{%)6iv3|j_VZJ^#*31BSFa(ADH7|v+0wDTz8iQY7 z5?DO*KJdNj!H6EaytQfT<8>p?HAjc7lCfqQ>Cc@jUf66irQ{_jm;`{#1r{kgOL9fjf;bHzp8F=1Lqr8Ik#x3*Q!3O6Rr)6(EWM_ZigvB>w*#(9T{1+3G2yjUYdSUa`(kI? z7m`&hKS4}oi`-0_bWPqhcvMqd%rm@m$t8Kp*MzEa3ymjrsiAP)(n>n@+-0M@Y%bL; zZ6^EIDFL2F@QL?@lY@Q2xtldzRpWMwIP~4Ma`w9$5r7JBwXDKt%#y5vKU_iV<#m(X}1j1J7qbyEJWp9#QWMj z|02M=Xy?_l?!`szYBh~3x0V0kDRPQ#H&hQ9fLd;`?hjJw(e&0B;jw8A)>!QRo_?ye z*K=B8GOKpl(W;Awzg_=4nJLNb^~DJu2~%-}fln;$NmiLgZV!A7oRB=)`ncAIqwFm> zLMt{^RV(S^Nj)`n{V@F?Vfp^K%w=)e)3F^{vB4azr@qyN_oMc<30Jj--wQ;nl9x#PDLl;Ct+m!23%wX_Nq8U?NXcV*4Gqm){^_OoCrY__-r{(pWYK6$H&h8%NIm2YaYeNmM}?c@l`@`V9nNH+$EoM_e`WK{TycFw zGTn3#xXiSNJasEhV}fTO(;IF~@1rX=Z5tWh#A!+Hw-cbYqXxS>Gs}Df$*)$lYhqBY zk8F#{ylPcceZ>mw^%D;UJW!nM$6&ncD5zuN1L2m7;3ii^)@%}G(MQg~gb>uP;mzKE zZ*~m2OR9>g{&sl;cNGkWZJ|-ZG4z;0O)mO#EIyV-LGdZbTbNYmOwfQ9-zTT-GrSa4 z(_^L}d;N2&aQ0RP$4K&R&X|NeW#X*Um`~&U=_izHbvf1zBN6ux&ktl%pSaCK4rydH zif^aWF~hD_I=S+f3cJ5N4oHz3<&d$G242VuaU%@yLkK$@%To>duX65a;11~CM0~KCPG(@Va?x_lat018y za4n-h;_SS35>t&lGy%P8ancRCR-wxOp5c@zS^8O%i7vDD7uUF4Sc9@Xv%AN$TE^-$ z>dliMNuWyHnWa~)&aU$?wWb`s=w5K|A_!)XOfBu~X*}E!qK?+qVywvti3dJi->t80 zDBBj86S-hmuhef7aXrMv@A?>>T4`prz_Z+Tb(?C+CWedsfh(5zl5-GX7&++%ceZhB z2feftQwDl31k-D;e5UW~%04kXcAosSN+eXp`PMq_!qw4ts=AJsp#|M!qP);i8{sr9sgO z4E3Ga%E7)5iYO0{%1ppj3*s~;P0?BS3tdSG63AU%jE$iBOn;Z&)4}qENj=Rwj1D%l zC7sFDR;`B1rA_Ycut#+Py?%t^9$~m%3AgM)9B*euy)aBp=Il^Fhkm|;i1SST$G2ga zVpckNztbgM*lQcMhjXn1@6;uvBuTHR^Mtu(d{(H+f0K3=R*hsD@b#)jJ!ft`o+$Lq zz9%8t=ms(0LtCrzAnc-CN)>qH;-fDlWoH`|XNPsi*YF1uXT2fib$n_6CeQlV$ujp4i%&`lr4{WAg6t7OkrPBU!{qi9jlrJ$ zyQP;8-e>Ngb<^TK;F4{!&z@y54l>n zVbu7dciU#Y<%JP$x=m~+<@41S{4VT24YpL8zEQl_PKgwtS0CW6eWjQdO&{DH6{5Of z?+wm=p254E7<;nw==@9rdR!xFg_Ny&FDv_&_};X%dQO^#SLcgz`=F$wJhPp;VtInN z$<3CT3u(qX79zTD10px;?RXxl-_yuSa7}GM9@5C9o6jWj^{RchJv;GnmSsNSxacvt z`pyPRm|2Ki<=PX7nKe4uVOUA_W#EVlH3O3qA5cAw5SrT1=a?JKkU1jqROV&#F#f}i zveX929>X|JdgnC6m{q#O-I(^c%Qgazo4%Arx`wtyiZ@Qx@X;+6B{xA{;+ZoI1pG(% zJrMvR2iHkjL1C2W`O6KZ!xIoLmF)n#YuFN7%g#}JPKXNY_SaS}MQDHIyT=1E7Lf&a zkBDDka|{#teuFRizOiU%F=2P3400`9CB8A59)4npvO49DT9(IGrezJE+m zzxL$I%z{tQ)Sky3(};}Hg^Tvxs~@OFKNo~8h6G~!l`QOQbw8&svJckWcsulX?8X)jDER{UzILQIJ|x1`j7P8Hw%67d*I&+oggpC zM?55?wO`*dH9k$H&_riFh;UPQcAvg=i<)^kP7~0QT*OR~9q>MEsLVbUXk}yAnK6njnDi5QP*Lph@<>WUq(6e=s&s;g{+)NB3>J~UGhDL z4LtAXxj#ls9Eod&{Ei921_#q3g7$^{HXVOl0011M_3~%@VEl)KU$8aioDcT@<^ORC z0KoAlnYI!tXa4*A;BX)8{?ln*0!cT1fZ)U?KoF8(qV+8;~!eItAV6z6C_9oi7@|pd07$(6B7o2vw>KjAUWAz&p#kh5($D6 a-bo^8oo>+93Iy08QI^lN>p~U#Md)8phWr`; delta 3733 zcmaKvc{r47AIBeK8(U148Dkw`BKvkomV~m^$dYAntYvFtMp>#EDq9K-+1D(ga2h&x zY7%4V4B0gzvgIHugnA#n=Q_)Ky~q8>b3ONUeeUo5``yodUC;MBe&z)Rfw5p_5dff2 zD3E&el|At1_yxfNk!6RPz8tb!Vgi8Q*Z|<)P>LF4hY4iWB#(^FfNcAItT9W}9&5&b zf_MRB^;q}OeZwlgSHoQjwTKl00E_$S@<9vsbP-*4#xbDo-TibsIFQWp?9d#O#WiMT z0N8~6dkp}90LYOtNU|W$2q@1z?M)g2&nN}l;$M%0`GD14DKkM8fEviX--jjp@J@oL zzmo>N0hC4Whhq5_i0sgdw^-BYAnLde3IJe7z!hAum$DbmGbGUaB0(uMFyMErOYn3Z zO#k>Ysh?chbtrCaWMYU2^A`dzq{x%5L{X>je|1^Sh@{Zow=b`rS?_UR4uN zD?p0BuOVZ$||#+Hm|h z<=N01!dTtjEbDr1jYM%Lg>$*1@l?fQ!<89tNcibC-TbyV zsL*Cca+m}{1L0EB(B5iqt>N`Cd}=qm??R_hW=A-cm=#fPJ1@4OF27)fxgTztvE=$L zrYSAw3Ql)$UCTu4aMK$XRNvKkM0wBQc;8W_9+90IvFq$5^O_|=#V7B`VHYkj1Pr!M zyGvyMM1J_WMP$B6m42ByM~iLjVm{@i;W!WBn<5WE@$BP#Av^s!_~TYJVS0*(uA8Es zTKOl#OT-3S1kYJJe{Vug9aHER$6cymHo=8f?4%gsS2BAIpRiPs@HOaer?idbD~At^ z_c70QH~ox{Jt}9UEQ?&8(5QTqdLYxghN{+3FP69*otQSA)i~blYPQn)%G`;mD;;gA zlb*sWc&e)qBK0MVm9Kd^#IWuOoH=SqlT%4VK!F%>Zf(5kyEKoPz%449IGgy5%S_Ch z$I&^8nky0~H?HuWi@1|nanvsqX-G}KCAs+Me9=v;{BE*mc2m&w;1_Qx!hq<#2F3^Y z_-bO!=(bTeK~!}uJ-8RLmS5%*pWt1o;d#?1V^W)wv&1%x?Q?z9#;j3dsZC*^i8Bf& zY1~Py9n){m3qz`y?Zo<2_sLGcV5mN9M?z@sR!d2HG1MU|X+kF~z@SEXe8%9rEPAe# zoqP@xZ%!oLl{LD0R2o|*iacuq@FCmdu=gZ~o0jSiuD3hPHln>$CWnzb8(ddNbd)*&lc4~0aaJ@Cr%I(B_hAfI_dVelDdc-1%1u{`=#IL?;djsh%-eC%z z$nCIHx3yHqv%4i|l&l-@SH{H~i?f&P-b}`h*0LU=p*e*D4+Nf2aqcP6kR4%4C0_UP zqAs;_wc8c^Hb&>)G|WI*h4#ozCVqe#zX&k38L$U@CyN_w7WAwPXvbcEd(QV9Rqs=s zT`+Dfu5MFdKP_Ih6@^s`{@JzFs6z3S!qp2ZWs7E4eHG)EZcek^ij0*n(OC?O@qgoK zt~5UUyR~({zFvOSfxAh`;zMSzPi;i6SZu$X1D}tcz$Jy*eA&;M8^Xft+_E|(VZ%?X z+Ha4(E6;psR@ujyFU3jST79NPhwz;^XjuV_D2#14U0Q4CeYrphoYxd7O^dka4Dc## z*4jDK(R9&D`$^`;ucONPm@@ZTO;_&XQwple)g#qXtSw5}9B10(9EndN1$cbvB;VM9suK;F(z8>DG#h-Rw!Foc)PQvnG<<<%oTzaV&-JcO#PS!z>4 zp5aFtq8^d9Rn~wMq~ZB_Wjs+D8{~k(1T8vCG#}fZ%~}8GzwA$=g}Nk?X5)98uhl)? zZl2w6rLN=!QXjnSj((d|;2+I!%TsqP%nVNBMMEVyjc&AMW0O3$G@g^UQ-XtzXXP9S zn)iufKh3qk&2bc~pRAzXcLlOMUE0<>OwY}SHh1#r2Ym}-EJ=5-=NAehzaf2&p z``pggeotXT$Ikcm-v9V~CGQ$Vvtc{bWVw>INfiuhY8H0mp;1FsTEZo#`=+UzepM?? zP3u)RN)4jqCNyB_jLJ$s^b$;+{c9u+GB%M#!D}a=JzfS(HVFaWba+cl=&Wx^ZQRr4 zv+ypJT+-Anl2Q$cC{bN;I!!Dpt#y7e`-xcEHzn6<1}w-g3OR+j^bfl$2MbK#;Sv|M*Ux6PG^v z#9rt|PyT7#iD<$HrE}jQEb;K`NAF4HjXBiCjSjqyU zwG1Y)L0D&q=!gheTzc8L6nYyD;cyYC^(Hp!T%oNV!{lG}O0aXdi%3>oV@<7%MG z68SN@<0E4p=e4QbQXk$H^a6Om_bWAPT;kk2zHPB@#74iN39WUFg<4&Ofjgtz$!4Wo zVM)}C(HZ^$2p`w4Wffw{8MMLid!oru%HRXJPOG4Cc#tMd<{OR6sI+=z+f6vEwg;N~ z%J>+5>w;aZk)asHo7M5BsaaiJM&I6te2r5gvl{k z+iKhq?<=Q_8IFdm^3(?&K?n;S zLbtEqNk;B%!}S5U8h<~1v1mxqx@~irG#5NsL8L!-OofYaY9lK{f`{~;Dw(JD7F20> z6zNz_;W_B4XSgZ;?(BZ}a`&QLTz5zO*4R)Y;ud-3p+>?I-a%|lR-0NGU~c;JaGHt5 z;hPg9lRI&2qo0`E!QXuodEHYzO6Tgz=cOy!{x_)Xq0a)$yShyc+*__mT&6flV|IJg z-D_(*C#OcVYL5)+W%x%f_8nL%3?cZzpIDFU3+cU?r6C4xr2(pCx8GW<*bXh!wT2It zeS=IkeurQz*r96}C!=`|05HYuV_v63jQ7l|$lNqS6112NEdS1ub;V{u8x3u#xc3_L zn~nQAQbB+mh=x?Sp>KYTA9Xj3et4Rd7N5GF5 zzyuKhz}!NfJ*3A3C8Hp4vaU2IQ67v5KYXtb4oC0~R^79`7>hsHKQhkWqsKw8?!Sg5 z*zAwh7efL)yn}r&p8tRA{~ky7eD+`ClNgvfehA>b2?TIC{vYefQ_?62ZRnpBUL@GM zdjxp@lR?R#M8AU{>Ay0BumJD>5pk74K@51nOd!ns3&JaV;)4IV bool: - """True when there is at least one page-break detected in the document.""" - return self._element_contains_pagebreak(self._document.element) + """True when there is at least one page-break detected in the document. + + Only `w:lastRenderedPageBreak` elements reliably indicate a page-break. These are reliably + inserted by Microsoft Word, but probably don't appear in documents converted into .docx + format from for example .odt format. + """ + xpath = ( + # NOTE(scanny) - w:lastRenderedPageBreak (lrpb) is run (w:r) inner content. `w:r` can + # appear in a paragraph (w:p). w:r can also appear in a hyperlink (w:hyperlink), which + # is w:p inner-content and both of these can occur inside a table-cell as well as the + # document body + "./w:body/w:p/w:r/w:lastRenderedPageBreak" + " | ./w:body/w:p/w:hyperlink/w:r/w:lastRenderedPageBreak" + " | ./w:body/w:tbl/w:tr/w:tc/w:p/w:r/w:lastRenderedPageBreak" + " | ./w:body/w:tbl/w:tr/w:tc/w:p/w:hyperlink/w:r/w:lastRenderedPageBreak" + ) + + return bool(self._document.element.xpath(xpath)) @lazyproperty def _document_contains_sections(self) -> bool: @@ -424,24 +439,6 @@ class _DocxPartitioner: """ return bool(self._document.sections) - def _element_contains_pagebreak(self, element: BaseOxmlElement) -> bool: - """True when `element` contains a page break. - - Checks for both "hard" page breaks (page breaks explicitly inserted by the user) - and "soft" page breaks, which are sometimes inserted by the MS Word renderer. - Note that soft page breaks aren't always present. Whether or not pages are - tracked may depend on your Word renderer. - """ - page_break_indicators = [ - ["w:br", 'type="page"'], # "Hard" page break inserted by user - ["lastRenderedPageBreak"], # "Soft" page break inserted by renderer - ] - if hasattr(element, "xml"): - for indicators in page_break_indicators: - if all(indicator in element.xml for indicator in indicators): - return True - return False - def _increment_page_number(self) -> Iterator[PageBreak]: """Increment page-number by 1 and generate a PageBreak element if enabled.""" self._page_counter += 1 @@ -509,7 +506,6 @@ class _DocxPartitioner: def has_page_break_implementation_we_have_so_far() -> bool: """Needs to become more sophisticated.""" page_break_indicators = [ - ["w:br", 'type="page"'], # "Hard" page break inserted by user ["lastRenderedPageBreak"], # "Soft" page break inserted by renderer ] for indicators in page_break_indicators: