From 349728162e59196191b939c14f9d6973447dcdf3 Mon Sep 17 00:00:00 2001 From: Sri Sudarsan Date: Fri, 21 Mar 2025 21:57:13 +0530 Subject: [PATCH 01/40] Matches prefix to verify presence of DOCX,PPTX,XLSX files instead of standard file names (#3959) Instead of looking for presence of `word/document.xml` , `ppt/presentation.xml` and `xl/workbook.xml` to identify DOCX,PPTX and XLSX files, we look for prefix `word/document*.xml`, `ppt/presentation*.xml` and `xl/workbook*.xml` as certain files generated from office365 has files with different names. Fixes https://github.com/Unstructured-IO/unstructured/issues/3937 --------- Co-authored-by: Yao You --- CHANGELOG.md | 9 +++++++++ test_unstructured/file_utils/test_filetype.py | 10 ++++++++++ .../file_type/test_document_from_office365.docx | Bin 0 -> 18752 bytes unstructured/__version__.py | 2 +- unstructured/file_utils/filetype.py | 6 +++--- 5 files changed, 23 insertions(+), 4 deletions(-) create mode 100644 test_unstructured/testfiles/file_type/test_document_from_office365.docx diff --git a/CHANGELOG.md b/CHANGELOG.md index aa47187bdc..2fb45d5385 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,12 @@ +## 0.17.3-dev0 + +### Enhancements + +### Features + +### Fixes +- **Fixes wrong detection of office files** certain office files wrongly identified as .ZIP when office(.docx,.xlsx and .pptx) files containing files other than word/document.xml, xl/workbook.xml and ppt/presentation.xml respectively will now be identified correctly by looking for word/document\*.xml, xl/workbook\*.xml and ppt/presentation\*.xml + ## 0.17.2 * Fix Image in a
tag is "UncategorizedText" with no .text diff --git a/test_unstructured/file_utils/test_filetype.py b/test_unstructured/file_utils/test_filetype.py index 8376e4440a..ec6c805f34 100644 --- a/test_unstructured/file_utils/test_filetype.py +++ b/test_unstructured/file_utils/test_filetype.py @@ -15,6 +15,7 @@ LogCaptureFixture, Mock, example_doc_path, + input_path, patch, property_mock, ) @@ -30,6 +31,7 @@ is_in_docker = os.path.exists("/.dockerenv") + # ================================================================================================ # STRATEGY #1 - DIRECT DETECTION OF CFB/ZIP-BASED BINARY FILE TYPES (8 TYPES) # ================================================================================================ @@ -987,3 +989,11 @@ def test_json_content_type_is_disambiguated_for_ndjson(): file_buffer.name = "filename.pdf" predicted_type = detect_filetype(file=file_buffer, content_type="application/json") assert predicted_type == FileType.NDJSON + + +def test_office_files_when_document_archive_has_non_standard_prefix(): + + predicted_type = detect_filetype( + file_path=input_path("file_type/test_document_from_office365.docx") + ) + assert predicted_type == FileType.DOCX diff --git a/test_unstructured/testfiles/file_type/test_document_from_office365.docx b/test_unstructured/testfiles/file_type/test_document_from_office365.docx new file mode 100644 index 0000000000000000000000000000000000000000..fd9ca065eb8f6491cf8b1269a8558f3f7e9a2c31 GIT binary patch literal 18752 zcmeHvby!?W)92vsPSD^QEV#Qn!5xCTTW|>;JXnIeCAfQVcXxLUzLVVCH}c+<@7euh zpWXLRJguZM>LV`&1`Z4O>mUX^0S|c)0N`6g2EM52Xs_>Js{Miic+eZ^ zJAQlr$A|y(@Be@VNLW9#rj-`1Ai)ylFY zW>6NC5j{{hXLo6L#Sgn3p0V6EqZ_=_$?BLP;z7zoU~5+Et9Y~DArbuE(2Vx*jr)<@ z?dcV7+F=Aa7qkM0=BR>#BIqmjf`i;lO-uSx%C|iw8*}RUrcm#O_NcC7gIBo0m60X- z$}2MIGCwe2iogYZc|Oa^5djxuSRaJoAd>}7n>vGA-b3GSKcE+-N7e)94VxX@*^Bsu zOP$v9&~;gOE<|wsG;7onXmMPHiZ_0{$3^BBRw{6|;YUgps|UA%k)C$|mHZ(WRtkCk zV|h`3I=d=U1-e!Tw@UG|Qj&{-kH!W^;oH?D2dy>SKGIa%_Y(b`G8$qdea)S2{fCen 
zI5+cmxVBzj2`){w;&$h%?(qAsC?GRv9NDZYXTw+m4rd>%!7V(C#4RasUZy8@sS zoLAPcW)*+?{?zmR%>&7sr~AvVjpLz~fna8XN#tossQR_q5}ygiBZiNH;(SbAcg4Ll zkiG`pIUhIV=qo;}wabzgfpb)6%E;ZfvVFt1!VWfw^ZHm3cC9hKq#Y zppe!(NBDd+v8p1H4ugP5Blcs1xG}OKp}CWoc9B>UI@ee@I_CS#!Lr0Sk`5O8TflQO5qDg9*c151dsIn znI*JsWE(|V+mD5+^=wyt*;jq{X{uviA2$W#z7o0`E4soNk!Q$TdW*zi1zh;i?#c+a z@i9*5xq73L4U9FVQ9hMm>x0O71Af=M>(#9uY;XYJB|HFt^0($W7&|(eS(`X~Q@pBI z+Sc>z$jzZ2x1DHj#OwmvXDaSgGsgxo_C!<@>JTf!E#L_?l_%8tHbFMJRy%806I zDttk&+0C=p2fieNz^8+lI?Ixk?4`ekEqI_YR<tjp|CUss+j9W~U$_o4yn@3N^eQ+Xk74MU230bvuQHqx@Vu{W<0VQ|Bm( zK|ebAxeWFRcatVufqeH0sP8g}JWDw^OtMr}i*4yu8A`9xij=)zo8bIZvsACYwgyUk z$9zyEwt(1yW`qLeGIumxRf(eiAf~_>gh4VpoHVR5^Lfap_;>L&wFIsKjlw?qAsQ-_ zP;-Mh43bPXA)6n@1~Oh!6Xm&>aT-&U%$35?gWBa^WXIBz1t~k5mF(Q);ohE{ekI*H z7e@^W$V}pQ3NIc=e+QA>lDXU~=G!tgC+}m4yZ0jdWE?)EJflS*#dSf|NFcgkHT#Wf zY#7)V;Z*dTaW|vxu48e9vKt$<_r%5p{OEi zA~0h`&~>A&G)S+_*lpuAw^5zs>NqaFSQ@fl@aUXGhK7YDq zHPD-9WfoFk!h~$UufPRXg-)SSh8x#SmwJF_Qzv8J(&+?^VJXtFs&%=#kmu7;_D8U` zc($@5f|nC!;cs`m(%q~Xt3EXnNhzz)HGOj8M!?ZdgmeWUm$#pwOwoS2Y~6*7upZe< zhUl|$(>n~XMk-DnIYf9^DxdcsJpi6I;{wmy0v!AnX;#f>M7to9jzk)C#k)5mS6-w& zb|r8@XlI5lR5xMQlFuVdr#h9(^rEpi`xW@g&4(I9>hX7v5}DnnaR_;+r=S@YeV&@6 z3c$AvR&`!mWO!6(b(Y*f&##+1FW!GprFRj1c@YG=74WW)=E*evl(LJd<+G6G#av4G zm%#wWC};V$L%wK(AeJ_r-Lx+y=TUN}q&Q3D=Syqk1lK zKd3LpEar~)`{l9IgAXW9!{mWNmhMl;k83?i3RIJB=R;u8g~8@^LTuf!Koj@$-!r& zQqsq}uxt`i%4{!U51z+2( zAq}pNR(!3JG>FQj>ObHnzCIY{2c>U<)N0H`X<(0vCF`Bhx98G|*`U|OXZ(uM>!*$k zIZFb@=PZ?!4JWDa>3pU?k)==gR$ovk2wQCTxmpg{oEKK*;1`V`h@}{C1&GfdN)lFa zTX&)KSOE)hSN^u7oeW-DW5Kx5SYIEa7f|2k6t7-@y=VE*lZAAr53k-=Z+t(Xdm+dq z_Ts^)IPyC7;O_?nm`+AXFnwQc2$8NA@$`?cz$ z*nKvV=#BW<7j8V;o7)N(*=63?7J zBaMiB!%?#B&h%9>^3l0VZV>yT*N0SW+D4==!?StyN~x?mN_dK9jS=60WPrJEt+&UR zIA)XJ#MCr0Z?i2im09qNmfiscx zVTTH8Ck&h|h5xQ{dSh zq6BFh#cshC9ZGxDq%q92)4`BnVGHJ2)hIn@MDu8Lge!LEH4YIfV}^RVF#opfPAa)2 zQJ5ePUPxa1cOM@4A?9%i1@7t)ismZR+jwTd8cn^ zEii1OmO5U-oM(>0ArqwZ@Jp^f7gtOrnzKVGy?@K*#!(P0UQ18$(bR$gjxf6%A}%S! 
z{)|dlUhmCQZ6?fq$KtM#cBek}vJxS*TN;_&s)N%*%4?U07R}NbQs-z^26q(xkb@*8GwrGqmYs>&wS|p^axA6 zra5I5NuYpqkNL8abYuurcuqH6Y$?=69W^5-^n|8%J!i?kaOro-0 zdyICYZqQ90II)oG7{EQ01SoGV`1mveWC`n8($?p)39x`>xe8w#mD^Mk19&BtV34L< z1a5vf12Ci!cjVMh)c#ydYi?{OfZ+yTD37Jj(dACz_Cc61&>m~3$d5g&@C=CSM(M%) zBnctxG{DB(t=}B3xFZC|zrhU0|CNMx3T-9+ZCqJY%us z%va-Z++@vZK`V>hbV&*wR^RZK?v5zImYkxOYFr*O(_K;|^KuM(yw2`83-nCu?g}yl z6Lw#pQOc3gaos%II`jRDEpvlb2MP_y0T`{T7 zZfQHWj)t7&g^>-pLrM0M|AkC}zWZA#gmGW0*COYV7Qv!%vph%@4IPmr%)_lKB;GgcMk{tW zM!hl*Khl?q80CBOzW%g-B0`s({LTBMuQ}tA@`jE!^!bdp{ZrsQuky$;x_2lv(MUAW zh<=ciYIQ=XC0+P;^0B1*g{}`_D94W}G~tMTkfv&Nb*ZIQ_;>QLwEC4~X6LYH=Kw#5 zv;LElqw&6;(KiD8PNscRZ9P*Uf74`qNfC`E7Rh%a(!~0ORF$gfMXTxgzmZIMS~|6f zuu2(t_!YK@O0&+0VW>3bzSP6Z%Lsv4g(v%hHupv@JKQfWqWd&)36o%CNr3{ikj;e8 z*h4n4J`C@@pg6pqj44GT(Op7zsEhznXmA+o{uplMEHk%*J@;xlJEFV5%0*@_pXe~n zS1qJc3#+5BKy+_CvOp~K!urdFj8GI;uH~HD7^Cs{z<4ATzQ5SP;fAcLVcq23G-PK! zNv9fu{#BMVht)_L*Gpj&)Jf#DS!JP%ES0yPp<2v~ABBr5u8bBYOtgfm%|n*LQ;51{ zh}OxY?a;bvbB4c>Xy+@TgDZt`YS`c77+GGAfe%LO);SK!Vi*Lfi6tE+XROq~hoi*r;hx_UiKACiu<}BoSs7N*2 zTo*!C03Re~(-(_4T|Au2_DGLrn+iYInOqkN>;qqX_2&9OrwX;nsUoYkTLG&SZSUt6HMp~0ixYkE*Du-k}M?(av@J=e;t72NWE7LAn zXJJTnXH;V8Nmw}xP{oZB=<2dEecTZ8%*-}z_UC;z67lbQZMt5S5yb0;F21YUJO7kP zX~7&suDFh2Ask&r%j&nFH_fuKg*oU(%L z!1^%vf^Wa>YUE7jOPX`yvCv*hW0_L_?Bgg-L*ENmw?0}5<(XHCZd_q4B7W_20mhFU zbN07%Y3Q90ql=gm;tvyKstv3Jq`E54kgT<4mt3QDEti201y;si7^AeB*Iy3JR?%r@O~QmBYV^ zsm#+?qP#NHkm`>42LcmM9>sWENvC*M;*mM&zMWG^4CwUBOOeg@v_j z+i;Khsxl5T?&f}3pObRwpuiO^*Y#X3+qGG^IK$OVQRy^S@swQcbWA0ksHEf;O4wIn znv&tRLllIEVh#tQI9;LDPXwgM1>F_jC@Hr3A(#SML{m~kQ%^*5IWTjO0{ubEHY9DY z(xY%Zs9>Bol@shH7zarxG8fh=2iB?__ApU-HJe^glTL8IPWUTm<1u>5xRNEJwB?YC zGKP6*zd~p~NLc?3L8o!c9ImXAw6z>#9Rp)sS@xEF?)`H0=*{ayi`ZpwlxIku`_JPx zspB{O;y0VKKONM6nrOGz$)0kD9C9hNTu?GLOzP`?p=^w=p<Fp}V5kI#xkiy6v1g9S<9kZv{j1;7^nV#Nrhck)Sjg#Whk!WaaoA2T1anvxT#K58~ zu&!@cDy}Sur^BUr1FqjMUEO%Q$Eh_eXW4MEf#p_+1YvbPe;*?+SNC$3S#vg)yD^yx z{uJak5=3I5h%jP0${Skxc7Z0CxK*_K?qk^z79UJnjt 
zl$9)FUgr&b2*cQIPxhF3H%)9%T08=EoxXk;xHOzLY5QO)SE7s|7D)IOB4oago#pD@Lr??FnF_7LkwW6k?r_;^`FPZgAn!UKU9Ao35P5+4gKow6dfqPM+o*47YqN@xf4W3G9>`NrJ z_GpYMCVq?cSu5IW)%NF|^3~2-)2gqBPpDv>v`=iM-uOrNl`b<0V^H;S8r_WEDpjVt zzJJdhvcrqr!sAGcpETNq641sB<7AxQe?T#(FvnZyLJ{85y7}aD3c|m)9~@SN>4%%&7SE)iy8i0d3;qd#eBEl8V_P0>%vl03h`Y0KoWL zXpW}FR>m*BKQewx10JetmyNSxKhnH9lHyIyhROW+rX3Orc{}{A6q_BILn?ddqJwN; zG~k@7qy{yj)^Z77=Z$$Ry|0dk)uxFO&lOnIS|-;}pom6fp@KVJ>c7E=40XS8%L>zRI7j5Owks?gGyXp0}-8cRNCDZkML2 z%kI1@wQvlQy2s5`eKpt4-f(yo2HRu_H{+~H3{y_@1?SkD5Vk`D;(Tov=|ws(WZrdp>eAiK z#pA=c2Va3{q8_e_*>KCz^T%GJ&Pf!7&n+h^SRo;pUQykS z=Eo?H+KX&_AGVPptdqJ}`gSSm9On!5#wL3K@iirxS8@@BvpS;cX%}q-ZuhW>$_fH> zR9puVs~}IZvB863&Zt1Rxnda_iWhbUx}E&Rs8E1%qbBzBeM1emRVy9*EpidQgAaw) zV0{1@%0~`MXso}hsG9&GN9Ev9E6}%Nl+Rz<=man$qE(qEMN?nEZd54lJaQwM zN1AAX^9Tm`DTH7v#guM#6;EgP%abx)!p@MKXCn@B@G1+I@kRIj#Y1xW=f)=ZAuR!PdtW_t5Tyi|h@m{c!q0EN# z9z3tC=kdv73QOW@mJMmS!gIO5!adCD^^NGtYiq(0?#^RTbHa9YcYKWbZLinfA7+MAo@u#PPAv4p8vy1|-MWTh%i;B6!tzms=2iJ|)xGb$4Z@;;2|EEI-Z0_vWa~Td z`KGH@XEB}rm%xDg5I4|s*v84Se{VJ|o$Zqcs8;TmSf&qgLu$7d_shReu=YGzFN%Q+ zm8rnhcvJx1UlS~F4c^Jh*xHfl+eAT>i4CCq_-%Ii_ai4>!g`(ooxe=b^n>`B7P0II zxy*_PNY-v4S8}HC^Hlz^gvBDKQIbp8hY39z1Y|Wvz1aKbPj_g+EiF$A3x_%UT;}-G z<~TXi5&`+oIkyY7L^XA_=$>zIzv95u7&UhMNMGO3RPaeFwj{fc1VEXn5=2X~)zrH- zgCa`iJRO5pltzF5a+GOGqzxv5#_HTPE?a_!=@r@@o?62f^I+|XwGGW=R^>LMjFL_Q zVa;yT`NWbs(E7G?o7$B}z=6?}K1-*b%GC4HNJNPEV!Ql9P6Ut0MbXWSL4GU7YGofJ z4UX{Wy%?wHMGt|4?42nU@-(ei{!n2wJF=|~3&YRIsBnjuDRO&pFknKmx;f|@ zt7qMwGpbT_5K79tsM~eK*1OVslAS%)jVtgX;6D>fVpGK z6|~54D~KabRx{YZCE9Tr3?&_j#3`fe%8cxPt6r;h&feHCF$1`DECoT@e z`WeF@d=DAgPKLE?G0Vt|5%B6*(%C}gxwQx%(Drviw&wD+3G&(#BZ4|Ysz|#8PQ+FI zg2Jm-vpe_5?n*Q!keZEtg0d~L+hI4`yE#}o`O*kij>llGyo6B%gn39+c3~^Wu8&?g z5zVwmv8R+~&1H0<9j(icdJDu~^*72{{4)K7)?c1YxH)>_v>lf>D0JlurYl@HREb7E zzOHMJS?UDA($K6SKfxz^gTNHBdH&)A@~D;N0bQqx+J7zYa~HPGCiQrN|9fMj%1cZK z!k3Tl&z>O)xFTG;aK3MiMfB8RQk#4G?64_lx*!&U&i!-vT57u#d+>+=ZRHhD#lo!y zct*8YrR~Wqj`YWa%gkG@jn&>DdlNsJ`mJ=I^L2G*gf?fp_HrtA`1*y#N3d^t|9d$E 
z3ec(i*8V7mVEj)x#J`X9@2kwei*FJ5sc-EM@iG5DFTTGlDCCa6-ly6NEuY!xN)k+z|$Raj7qy&#oBSm}s68TuGtweh6wbd^$ zL>vumaAZl4`(LRvFIlr)Ir2HJUSFJhy78pdu}yCiE^clbxHT;dS-(o0pKV{P50eYa zJy@Qbn@Xg^NP>Dh;90kdE@4hglRJz_b&$jozi61?lQz2#3Td1oh1q$Tn53mM@Y-mL zF4wgAxl|>4UeT_nowcTqChe4(^+8%(@0^`*P-v1c$d)vNt}H_s-qsyR?vN!XkIQpv z6aiEY45G?R=m>>4+w$~&pHS(WsyOZwMTtPK%cH1k4}qyVl?37Ycaj)r?}*?Lk=W`N zXu!glA)BSr#|}o3OvMBIB3+9(h#%W2khN@_VNl4&Cq3`8wABsFxy;bKVVpU;txNBXAIJ{fpXqR#qrRTo3T%~(4 z94(b#(tp!s>Ap(;YWs6M2u}!VcMe9P?KP^4@$CIAx+%&rbb)*G^& zGIZ)`?rEo;`xYTa_kp-)hDbYt`uUUQFH>)yX+j)D#sC(T9>LALR7qKk_>sLk*f){W z3Kq}SKc1M7ZWN&}D0QA5HBk8~yz)GM%u%y02;jqcp%KQO-KVgrOceXj0HSqO>LZnr z$fKGyrc1d`$_c8Cu|Shk-2QPmTugH>izOiQAwHAA$B)cd;Xw{VJoA&5+_WB-1U?K7 z;%E;XW7`8PXNXq5y*2Ef zRHQQ0V6?;~&UCU6;k3o%)tkg1n1X%9Eld*U^f6cpGR5j9g|rPQW{x=iedgHIbT7f8 z&3YesckoOzAc?W^SE4&P@+|<;P@CR_`J7N9FiBkSrc4`K%?x+G@($3Fiikq`Rz8BSjjfHQrmO6X7MKOI@Rde%u$00JzcBim z_dd?Mz41k^oW(_WjXnAyLO4ho-VJR?pl~b5Wh?1stF$7)YjM;A^hHJXqfV#3s*zcm z^?K@PFGd=ia6ZF<)q>J50Ug{1A+NWki>)Wi=~SNr_2UTlLZyidupn)vH(AaL7}`Pf z0?*;N1KRjpa^Xxk9iY=&11LGnUeHu_*i?As7Fi2T6s|k=vNd>;joM(>^Dmt~}&4;iLND5SDar_9u zv0s3n5oKw!X-XaQF-i0ux;r@%0S%GBTv>cn*y%!%G-v#-3f?GxsA*D3Zt<{>{^q5K zRbsI85_l!Zkw*4x!Nv|`T6=aQI8H7R)D?x7+v`-8D1L-vjeO^1w9K!Jfm1x)nECFv zJzx73W}O2xt2fdyrnzjg!pHOY7vCdHyzXO)d_Ulx{3dG{{WfRvbYC9}dLJf`s>){t z3ii2vFMib;%Nb=&k`2rS_%U`(tg6(hO92qi1n|Ygsi~Pjt7%UDN;}@+D4TPqb6(tDesb#RpBK=<8LXf|+Btc)%45YrgE6+YNscKD-|yofrd^X-qScojWMW3*istuoLkxe7%j zmXK$Z!jq|rG3h||r5ba0)#@{}{OB|441yBLF84+TaF0d?VPFdYwoZ@Q6aRWe9G>jB z3%YyStaoN4CKDm|%#h($Dv|=MDnZlg)z=cdmyjZ_=Gf_*Lod?e1`b$UGKMmqmhe1H@(0{6$bJCxmH5J-`C_`xy?}$5_!~I$`C&wC&u4qd+ zF_8XTsrd%#-9((VpD$zgmsbTVq8F^iH@Q-A?UG*B4x!1nKc#*a|#f6dRZjO0RWXF!H1^Baq$+ZkD7@r6~`1np)I z%_*KfqbFP@fD-VHb5DOw;7r=+gf}UMFL{>MhGv4xoikgQI2g7?8T?vN#sgp%OF=^i zk*MxEF~#UB&3e~KOwH|lQe{@vrlviHec-RG%gEEfH95hd^?A@(g7}_@z(!!2g3u4^ zN$U%6iJHrL3HVd~z32C`(jY2;@B9yX{=f8V5hB_`fjuvB4fZJHdY9x_EJH1-U+i~> zlVl~XTQ$;1A%iY(@AA2R!DA);@DYdNw3Q=58nG}3(igp!aCdFx_?nb1B#!6uje#Ik 
zXZbQ+bG6sp1@B#)Tem|uWgkNVw+o_*8qp64oZVYd>2^?fvs$WxQP91S$VHqR2!9$aB!I&}@CXZq$A3g|ns zd}YV^dU#!gK#BPbszF{D%*ANN`2*o03KorV=V!TMSh7#P>ZD#4gVx=8b(sMbEF_zH zIjL@Az4f2d^^HMXDNchZyg7_fVSGe9Pnl^9!0#@XG2x$F~t6m%#n8_MO`12-v@ z7z>xSV7h#;;Jp=;FvxdGC*_`;kbduftptka|Fi!8J|6zl|G%jG&;EZ5W^hT_6EHH{ zGe&8A>)2)?o@X^fj>eR=)57MeA#lQeeA*mk=3K*2MDzLz9ig}o7hP1WDJsi%*}7%H z&!D2BhFOHgRxC)n;j6hUyCxM$x~G8(bSH1dl)80RLe2>vc#g^o}e`knGY~RkWbIiJD8Re6 zorF)R==)hedpOg*`-f254LF{^wLd~_mjCBN?O%HI|GKAt4b#8(G0mAhp(6pi=oDCc zkN!K;!VR> z4|1hb3j``QMyje&6rD;zTV;)gFPn~i`9HT0${vYJX&YzTHlBXd&|~H2LBvMYAEI9iA9Qe zV*(Z(jTMv|Wtc#9$4H;UBx@BNHP=)kAF{g44LPoB_60XJ#siptn8%yv#_Z>qT76Cp zH0j%2S;kMW;u7&1(|s+Ucs+wLl&}AoHn&cddZ2+?67yYCph5n%ukbJb=`T(BkFWI2 zs-}$$TKhA=16J;`n}p)HB4T3@k>x?mau%I;oW+PM#YqK7jZ1Qy?i3I6t``kvA1x`C zbuL0MUpBNQgN|e9WrtTT>P14l;W@|xC)w52z_gCARU zCH7iLG$fWHN3#aj42`KN&1qkVo$&49odC7|raH|!4A=~^oAc71>FxSSO(NZ6i;>Bu z71y-&iMkKgr8iGCda^1IrBe%G~CA>1c%r5Dd#(F)3^qh;sy zVqxAZTrAt1WS5tJaf`b4lK$KjtiH(bInPv2eexp%=bjfHLTbaqHXG3I_TW+y217EM zsU{5D3iuq)wAM@UDiC$Hl{AJ`<8umwS}CPI?I1sg8kGrw)2AZ?v5BI3o&EbhpR0%h z#3o-=PK(0S{hO0nb~C7qU(w^~tsi>-TQK*fEc~0d~h4uv_4O|NnBazK_9@7#S&G zAz09{-xKjb6W6?oaCSpe$Vf`hEm)Az6&M>Ksgl)GQw5Pmz6BD`>$9O)lc3@$F0KNX zNVAL5AY%nTgS;jjZQTxwyUJRq_<+=mUanI8VVJ4G%gMb!ikOACAV$)7D*8!=kN~L8 zY^druSi+gjlp!&xn5c>`{I?PA-G#Kr43Dq*9XgetDG4nkJ(EA=J>jricFNikE2~LJ zOLYZxqd0(15ecc|>r2X0pzBJ{fn2|Ipr^=24h@Iowo*>Bxwx)0Rqu&LHYapj+NNQo zsBGK_jYQ*PCFI2&kPNnendjgkf!`#vcF1?fVbmOdvSY&f;irUxL~S)#D>%HJfr?8B z_02~D-p@b?xbGOany;(uW^3%A{cYxJitqpCV~Ti_onew$hXo+-8G=}5plX*prAb(< zCLZkW&#RqC+=Ir(VFm~)O~lENnrtmO@PwvaPuDz}S~-3SK#uBP78EwcOl z`13Xte{uh;^853~2|u~t0P+51t$y4_;h(jqpX`5bPW;J^0W=AJvHxqU;-AcamX!WvPR04> z0@FVc{;aS3i7<-$1L4nV%ReE0FT42(002A?{ebu_YW$1;ch&i|eBmd;CCP8y_q~|m zpSXWrz5mJ0LiQW?_qF`Lul?`L{O=3iKY0NFK61bxdHU}u`ac1FPwoE-kU;k*z|Tbg zpD4fQ!+%A|qyH1-dw%RsfZy}XzXC+R_!Hp!W&9(`@9EiJQ9dyIit FileType | None: filenames = zip.namelist() - if "word/document.xml" in filenames: + if any(re.match(r"word/document.*\.xml$", filename) for 
filename in filenames): return FileType.DOCX - if "xl/workbook.xml" in filenames: + if any(re.match(r"xl/workbook.*\.xml$", filename) for filename in filenames): return FileType.XLSX - if "ppt/presentation.xml" in filenames: + if any(re.match(r"ppt/presentation.*\.xml$", filename) for filename in filenames): return FileType.PPTX # -- ODT and EPUB files place their MIME-type in `mimetype` in the archive root -- From 347a4e5d9ee42f32c1186f0f0dada93bf9910778 Mon Sep 17 00:00:00 2001 From: luke-kucing Date: Tue, 25 Mar 2025 15:38:47 -0400 Subject: [PATCH 02/40] =?UTF-8?q?manual=20trigger=20of=20workflows=20to=20?= =?UTF-8?q?publish=20new=20image=20and=20new=20vers=20tag=20in=20=E2=80=A6?= =?UTF-8?q?=20(#3965)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …quay There were some open CVEs in the base-image. Those are resolved so triggering a workflow with updated version tag --- CHANGELOG.md | 10 ++++++++++ unstructured/__version__.py | 2 +- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2fb45d5385..20c8650e10 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,12 @@ +## 0.17.3 + +### Enhancements + +### Features + +### Fixes +- Resolve open CVEs + ## 0.17.3-dev0 ### Enhancements @@ -102,6 +111,7 @@ ### Fixes - **Fix file type detection for NDJSON files** NDJSON files were being detected as JSON due to having the same mime-type. 
+- Base-image was updated to resolved CVEs, running pipline to manually build ## 0.16.20 diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 433383a01d..af66a65e41 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.17.3-dev0" # pragma: no cover +__version__ = "0.17.3" # pragma: no cover From 3f07840b80a1157ee64af23302f5bb48dfc5e404 Mon Sep 17 00:00:00 2001 From: qued <64741807+qued@users.noreply.github.com> Date: Wed, 26 Mar 2025 18:37:03 -0500 Subject: [PATCH 03/40] chore: deprecate stage_for_label_studio (#3968) This PR is to address [a CVE](https://github.com/advisories/GHSA-rgv9-w7jp-m23g) that appeared in a recent scan. The CVE has to do with the package `label_studio_sdk`. This relates to the tool Label Studio, a data labeling platform. We built a staging function that takes a list of elements and converts it to a format suitable for passing to the LabelStudio platform. We don't use the package with the vulnerability in the actual function, we only use it to test the output of the function against the Label Studio API schema. Even the test where we use it is sort of questionable in value, since it's really testing the schema against an old version of the LabelStudio API (we are testing against a recording of the Label Studio API's responses stored using `vcrpy`). Label Studio has fixed the vulnerability as of version 1.0.10 of their SDK, but we're stuck on 1.0.5 because 1.0.6 and above require `numpy<2.0.0`. This leaves us with several choices of resolution, some of which are: 1. Downgrade `numpy` to upgrade `label_studio_sdk` to >=1.0.10 to resolve the CVE 2. Drop `label_studio_sdk` by either removing or rewriting the test. 3. Drop test and dev dependencies from the `unstructured` image. We've decided to do 2. _and_ 3. This PR handles 2., with 3. to be a follow-on PR. Here we add a deprecation notice to `stage_for_label_studio` and remove the offending test. 
Normally good practice would be to add a warning of future deprecation to the function for a reasonable amount of time, but in order to address the CVE immediately, we're deprecating it right away. ### Testing Install the dependencies (`make install`) into a fresh environment, and `pip list | grep label` should have no results. The scan artifact in CI should contain no "high" or "critical" CVEs. --- CHANGELOG.md | 9 + requirements/base.txt | 2 +- requirements/dev.txt | 8 +- requirements/extra-csv.txt | 4 +- requirements/extra-docx.txt | 2 +- requirements/extra-odt.txt | 2 +- requirements/extra-paddleocr.txt | 6 +- requirements/extra-pdf-image.txt | 12 +- requirements/extra-pptx.txt | 2 +- requirements/extra-xlsx.txt | 4 +- requirements/huggingface.txt | 4 +- requirements/test.in | 2 - requirements/test.txt | 138 +----- .../staging/test_label_studio.py | 61 --- .../cassettes/label_studio_upload.yaml | 414 ------------------ unstructured/__version__.py | 2 +- unstructured/staging/label_studio.py | 6 + 17 files changed, 45 insertions(+), 633 deletions(-) delete mode 100644 test_unstructured/vcr_fixtures/cassettes/label_studio_upload.yaml diff --git a/CHANGELOG.md b/CHANGELOG.md index 20c8650e10..875f098612 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,12 @@ +## 0.17.4 + +### Enhancements + +### Features + +### Fixes +- **Deprecate `stage_for_label_studio` and drop `label_studio_sdk` dependency.** This resolves a CVE due to the dependency on `label_studio_sdk`. 
+ ## 0.17.3 ### Enhancements diff --git a/requirements/base.txt b/requirements/base.txt index 17a25c4d40..78fc8ce871 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -125,7 +125,7 @@ tqdm==4.67.1 # via # -r ./base.in # nltk -typing-extensions==4.12.2 +typing-extensions==4.13.0 # via # -r ./base.in # anyio diff --git a/requirements/dev.txt b/requirements/dev.txt index 0de6c4eb02..4b489656fb 100644 --- a/requirements/dev.txt +++ b/requirements/dev.txt @@ -32,20 +32,18 @@ packaging==24.2 # build pip-tools==7.4.1 # via -r ./dev.in -platformdirs==4.3.6 +platformdirs==4.3.7 # via # -c ./test.txt # virtualenv -pre-commit==4.1.0 +pre-commit==4.2.0 # via -r ./dev.in pyproject-hooks==1.2.0 # via # build # pip-tools pyyaml==6.0.2 - # via - # -c ./test.txt - # pre-commit + # via pre-commit tomli==2.2.1 # via # -c ./test.txt diff --git a/requirements/extra-csv.txt b/requirements/extra-csv.txt index a5779f0a87..51885ae7ad 100644 --- a/requirements/extra-csv.txt +++ b/requirements/extra-csv.txt @@ -14,11 +14,11 @@ python-dateutil==2.9.0.post0 # via # -c ./base.txt # pandas -pytz==2025.1 +pytz==2025.2 # via pandas six==1.17.0 # via # -c ./base.txt # python-dateutil -tzdata==2025.1 +tzdata==2025.2 # via pandas diff --git a/requirements/extra-docx.txt b/requirements/extra-docx.txt index 7cdf55c7a7..b6a9158f4f 100644 --- a/requirements/extra-docx.txt +++ b/requirements/extra-docx.txt @@ -10,7 +10,7 @@ lxml==5.3.1 # python-docx python-docx==1.1.2 # via -r ./extra-docx.in -typing-extensions==4.12.2 +typing-extensions==4.13.0 # via # -c ./base.txt # python-docx diff --git a/requirements/extra-odt.txt b/requirements/extra-odt.txt index a157708ebd..fa8e746301 100644 --- a/requirements/extra-odt.txt +++ b/requirements/extra-odt.txt @@ -12,7 +12,7 @@ pypandoc==1.15 # via -r ./extra-odt.in python-docx==1.1.2 # via -r ./extra-odt.in -typing-extensions==4.12.2 +typing-extensions==4.13.0 # via # -c ./base.txt # python-docx diff --git a/requirements/extra-paddleocr.txt 
b/requirements/extra-paddleocr.txt index a5264d7840..84afee5161 100644 --- a/requirements/extra-paddleocr.txt +++ b/requirements/extra-paddleocr.txt @@ -107,7 +107,7 @@ packaging==24.2 # -c ./base.txt # lazy-loader # scikit-image -paddlepaddle==3.0.0rc1 +paddlepaddle==3.0.0 # via -r ./extra-paddleocr.in pillow==11.1.0 # via @@ -115,7 +115,7 @@ pillow==11.1.0 # paddlepaddle # scikit-image # unstructured-paddleocr -protobuf==6.30.1 +protobuf==6.30.2 # via # -c ././deps/constraints.txt # paddlepaddle @@ -167,7 +167,7 @@ tqdm==4.67.1 # via # -c ./base.txt # unstructured-paddleocr -typing-extensions==4.12.2 +typing-extensions==4.13.0 # via # -c ./base.txt # albucore diff --git a/requirements/extra-pdf-image.txt b/requirements/extra-pdf-image.txt index 0226cee3e6..061fb6de3b 100644 --- a/requirements/extra-pdf-image.txt +++ b/requirements/extra-pdf-image.txt @@ -158,7 +158,7 @@ proto-plus==1.26.1 # via # google-api-core # google-cloud-vision -protobuf==6.30.1 +protobuf==6.30.2 # via # -c ././deps/constraints.txt # google-api-core @@ -180,7 +180,7 @@ pycparser==2.22 # via # -c ./base.txt # cffi -pyparsing==3.2.1 +pyparsing==3.2.3 # via matplotlib pypdf==5.4.0 # via @@ -195,7 +195,7 @@ python-dateutil==2.9.0.post0 # pandas python-multipart==0.0.20 # via unstructured-inference -pytz==2025.1 +pytz==2025.2 # via pandas pyyaml==6.0.2 # via @@ -256,15 +256,15 @@ tqdm==4.67.1 # -c ./base.txt # huggingface-hub # transformers -transformers==4.49.0 +transformers==4.50.1 # via unstructured-inference -typing-extensions==4.12.2 +typing-extensions==4.13.0 # via # -c ./base.txt # huggingface-hub # pypdf # torch -tzdata==2025.1 +tzdata==2025.2 # via pandas unstructured-inference==0.8.10 # via -r ./extra-pdf-image.in diff --git a/requirements/extra-pptx.txt b/requirements/extra-pptx.txt index 41b37f70f0..30e77d1ce7 100644 --- a/requirements/extra-pptx.txt +++ b/requirements/extra-pptx.txt @@ -10,7 +10,7 @@ pillow==11.1.0 # via python-pptx python-pptx==1.0.2 # via -r ./extra-pptx.in 
-typing-extensions==4.12.2 +typing-extensions==4.13.0 # via python-pptx xlsxwriter==3.2.2 # via python-pptx diff --git a/requirements/extra-xlsx.txt b/requirements/extra-xlsx.txt index 895935708c..937191502d 100644 --- a/requirements/extra-xlsx.txt +++ b/requirements/extra-xlsx.txt @@ -20,13 +20,13 @@ python-dateutil==2.9.0.post0 # via # -c ./base.txt # pandas -pytz==2025.1 +pytz==2025.2 # via pandas six==1.17.0 # via # -c ./base.txt # python-dateutil -tzdata==2025.1 +tzdata==2025.2 # via pandas xlrd==2.0.1 # via -r ./extra-xlsx.in diff --git a/requirements/huggingface.txt b/requirements/huggingface.txt index 829a0448d4..f9e62f5266 100644 --- a/requirements/huggingface.txt +++ b/requirements/huggingface.txt @@ -96,9 +96,9 @@ tqdm==4.67.1 # huggingface-hub # sacremoses # transformers -transformers==4.49.0 +transformers==4.50.1 # via -r ./huggingface.in -typing-extensions==4.12.2 +typing-extensions==4.13.0 # via # -c ./base.txt # huggingface-hub diff --git a/requirements/test.in b/requirements/test.in index ca9d2d5bfe..e9b8fadbf8 100644 --- a/requirements/test.in +++ b/requirements/test.in @@ -6,7 +6,6 @@ types-click flake8 flake8-print freezegun -label_studio_sdk mypy pydantic pytest-cov @@ -15,7 +14,6 @@ ruff types-Markdown types-requests types-tabulate -vcrpy grpcio autoflake liccheck diff --git a/requirements/test.txt b/requirements/test.txt index b64b5d52f5..1ebccc8953 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -6,41 +6,21 @@ # annotated-types==0.7.0 # via pydantic -anyio==4.9.0 - # via - # -c ./base.txt - # httpx -appdirs==1.4.4 - # via label-studio-sdk -attrs==25.3.0 - # via jsonschema autoflake==2.3.1 # via -r ./test.in black==25.1.0 # via -r ./test.in -certifi==2025.1.31 - # via - # -c ./base.txt - # httpcore - # httpx - # requests -charset-normalizer==3.4.1 - # via - # -c ./base.txt - # requests click==8.1.8 # via # -c ./base.txt # black - # nltk -coverage[toml]==7.7.0 +coverage[toml]==7.7.1 # via # -r ./test.in # pytest-cov 
exceptiongroup==1.2.2 # via # -c ./base.txt - # anyio # pytest flake8==7.1.2 # via @@ -54,47 +34,12 @@ grpcio==1.71.0 # via # -c ././deps/constraints.txt # -r ./test.in -h11==0.14.0 - # via - # -c ./base.txt - # httpcore -httpcore==1.0.7 - # via - # -c ./base.txt - # httpx -httpx==0.28.1 - # via - # -c ./base.txt - # label-studio-sdk -idna==3.10 - # via - # -c ./base.txt - # anyio - # httpx - # requests - # yarl -ijson==3.3.0 - # via label-studio-sdk -iniconfig==2.0.0 +iniconfig==2.1.0 # via pytest -joblib==1.4.2 - # via - # -c ./base.txt - # nltk -jsonschema==3.2.0 - # via label-studio-sdk -label-studio-sdk==1.0.5 - # via -r ./test.in liccheck==0.9.2 # via -r ./test.in -lxml==5.3.1 - # via - # -c ./base.txt - # label-studio-sdk mccabe==0.7.0 # via flake8 -multidict==6.2.0 - # via yarl mypy==1.15.0 # via -r ./test.in mypy-extensions==1.0.0 @@ -102,47 +47,29 @@ mypy-extensions==1.0.0 # -c ./base.txt # black # mypy -nltk==3.9.1 - # via - # -c ./base.txt - # label-studio-sdk -numpy==2.0.2 - # via - # -c ./base.txt - # pandas packaging==24.2 # via # -c ./base.txt # black # pytest -pandas==2.2.3 - # via label-studio-sdk pathspec==0.12.1 # via black -pillow==11.1.0 - # via label-studio-sdk -platformdirs==4.3.6 +platformdirs==4.3.7 # via black pluggy==1.5.0 # via pytest -propcache==0.3.0 - # via yarl pycodestyle==2.12.1 # via # flake8 # flake8-print pydantic==2.10.6 - # via - # -r ./test.in - # label-studio-sdk + # via -r ./test.in pydantic-core==2.27.2 # via pydantic pyflakes==3.2.0 # via # autoflake # flake8 -pyrsistent==0.20.0 - # via jsonschema pytest==8.3.5 # via # pytest-cov @@ -155,35 +82,14 @@ python-dateutil==2.9.0.post0 # via # -c ./base.txt # freezegun - # pandas -pytz==2025.1 - # via pandas -pyyaml==6.0.2 - # via vcrpy -regex==2024.11.6 - # via - # -c ./base.txt - # nltk -requests==2.32.3 - # via - # -c ./base.txt - # label-studio-sdk - # requests-mock -requests-mock==1.12.1 - # via label-studio-sdk -ruff==0.11.0 +ruff==0.11.2 # via -r ./test.in 
semantic-version==2.10.0 # via liccheck six==1.17.0 # via # -c ./base.txt - # jsonschema # python-dateutil -sniffio==1.3.1 - # via - # -c ./base.txt - # anyio toml==0.10.2 # via liccheck tomli==2.2.1 @@ -193,13 +99,9 @@ tomli==2.2.1 # coverage # mypy # pytest -tqdm==4.67.1 - # via - # -c ./base.txt - # nltk types-click==7.1.8 # via -r ./test.in -types-markdown==3.7.0.20241204 +types-markdown==3.7.0.20250322 # via -r ./test.in types-requests==2.31.0.6 # via -r ./test.in @@ -207,36 +109,10 @@ types-tabulate==0.9.0.20241207 # via -r ./test.in types-urllib3==1.26.25.14 # via types-requests -typing-extensions==4.12.2 +typing-extensions==4.13.0 # via # -c ./base.txt - # anyio # black - # label-studio-sdk - # multidict # mypy # pydantic # pydantic-core -tzdata==2025.1 - # via pandas -ujson==5.10.0 - # via label-studio-sdk -urllib3==1.26.20 - # via - # -c ././deps/constraints.txt - # -c ./base.txt - # requests - # vcrpy -vcrpy==7.0.0 - # via -r ./test.in -wrapt==1.17.2 - # via - # -c ./base.txt - # vcrpy -xmljson==0.2.1 - # via label-studio-sdk -yarl==1.18.3 - # via vcrpy - -# The following packages are considered to be unsafe in a requirements file: -# setuptools diff --git a/test_unstructured/staging/test_label_studio.py b/test_unstructured/staging/test_label_studio.py index 6d3be972b7..11ca79d064 100644 --- a/test_unstructured/staging/test_label_studio.py +++ b/test_unstructured/staging/test_label_studio.py @@ -1,11 +1,6 @@ from __future__ import annotations -import logging -import re - import pytest -import vcr -from label_studio_sdk import Client from test_unstructured.unit_utils import assign_hash_ids from unstructured.documents.elements import Element, NarrativeText, Title @@ -17,62 +12,6 @@ def elements(): return [Title(text="Title 1"), NarrativeText(text="Narrative 1")] -@vcr.use_cassette( - "test_unstructured/vcr_fixtures/cassettes/label_studio_upload.yaml", - allow_playback_repeats=True, -) -def test_upload_label_studio_data_with_sdk( - caplog: 
pytest.LogCaptureFixture, elements: list[Element] -): - """ - Testing Instructions - ==================== - 1. Remove file `test_unstructured/vcr_fixtures/cassettes/label_studio_upload.yaml`, - which will be recreated later. - 2. Install the label-studio package by running command `pip install -U label-studio`. - 3. Run command `label-studio`, and login or set up label studio account on pop-up website. - 4. Update `LABEL_STUDIO_URL` and `API_KEY` below, you can find your API_KEY by - clicking into your account profile. - 5. Run this test once, and VCR will record the HTTP request to the yaml file. - 6. Kill the label studio instance and run the test again, VCR will replay the response. - """ - log = logging.getLogger("urllib3") - log.setLevel(logging.DEBUG) - # Define the URL where Label Studio is accessible - LABEL_STUDIO_URL = "http://localhost:8080" - # API_KEY is a temporary key from local install not actually valid anywhere - # Update it if the vcr cassette is updated with the API key from your user account - API_KEY = "7b613506d5afa062fe33c9cd825f106c718b82a0" - # Connect to the Label Studio API and check the connection - ls = Client(url=LABEL_STUDIO_URL, api_key=API_KEY) - ls.check_connection() - ls.delete_all_projects() - # Create a sample project to classify types of texts - project = ls.start_project( - title="Text Type Classifications", - label_config=""" - - - -
- - - - - - - """, - ) - label_studio_data = label_studio.stage_for_label_studio(elements) - project.import_tasks(label_studio_data) - # Check success status code (201) for posting tasks job in logger info - success_posting_tasks_status = re.compile(r"POST /api/projects/.*/import.*201") - assert bool(success_posting_tasks_status.search(caplog.text)) - - def test_convert_to_label_studio_data(elements: list[Element]): label_studio_data = label_studio.stage_for_label_studio(elements) diff --git a/test_unstructured/vcr_fixtures/cassettes/label_studio_upload.yaml b/test_unstructured/vcr_fixtures/cassettes/label_studio_upload.yaml deleted file mode 100644 index bf4f22255c..0000000000 --- a/test_unstructured/vcr_fixtures/cassettes/label_studio_upload.yaml +++ /dev/null @@ -1,414 +0,0 @@ -interactions: -- request: - body: null - headers: - Accept: - - '*/*' - Accept-Encoding: - - gzip, deflate - Authorization: - - Token 7b613506d5afa062fe33c9cd825f106c718b82a0 - Connection: - - keep-alive - User-Agent: - - python-requests/2.28.0 - method: GET - uri: http://localhost:8080/api/version - response: - body: - string: '{"release": "1.7.3", "label-studio-os-package": {"version": "1.7.3", - "short_version": "1.7", "latest_version_from_pypi": "1.7.3", "latest_version_upload_time": - "2023-04-19T12:05:18", "current_version_is_outdated": false}, "label-studio-os-backend": - {"message": "Merge pull request #2612 from laggardkernel/bugfix/realpath-in-version - ...", "commit": "fcd7806529ea60cf5e56c782345ced04659d018d", "date": "2023/02/06 - 20:09:22", "branch": "master", "version": "2.3.12+10.gfcd78065"}, "label-studio-frontend": - {"message": "fix: LSDV-4692: Brush segmentation is not supported", "commit": - "f08871a3e70026b12cad502552251db1fba1619e", "branch": "master", "date": "2023/03/29 - 14:40:33"}, "dm2": {"message": "fix: LSDV-4746-1: Only include limited fields - for project when polling", "commit": "9aa96a97e9bcb4154838249dc721efbc724198b7", - "branch": "master", "date": 
"2023/03/13 15:43:21"}, "label-studio-converter": - {"version": "0.0.51"}}' - headers: - Content-Language: - - en-us - Content-Length: - - '924' - Content-Type: - - application/json - Date: - - Thu, 01 Jun 2023 21:17:59 GMT - Referrer-Policy: - - same-origin - Server: - - WSGIServer/0.2 CPython/3.8.15 - Set-Cookie: - - sessionid=eyJ1aWQiOiI0MzJmMWRjMC01MGNkLTQyMGEtYjgyYy0wM2JlMjEzOTNlMzYiLCJvcmdhbml6YXRpb25fcGsiOjF9:1q4pgF:YW6N1NblXlgyM_81UyYNBkcxIWjokDRdWetCeeQfDgA; - expires=Thu, 15 Jun 2023 21:17:59 GMT; HttpOnly; Max-Age=1209600; Path=/; - SameSite=Lax - Vary: - - Accept-Language, Cookie, Origin - X-Content-Type-Options: - - nosniff - status: - code: 200 - message: OK -- request: - body: null - headers: - Accept: - - '*/*' - Accept-Encoding: - - gzip, deflate - Authorization: - - Token 7b613506d5afa062fe33c9cd825f106c718b82a0 - Connection: - - keep-alive - Cookie: - - sessionid=eyJ1aWQiOiI0MzJmMWRjMC01MGNkLTQyMGEtYjgyYy0wM2JlMjEzOTNlMzYiLCJvcmdhbml6YXRpb25fcGsiOjF9:1q4pgF:YW6N1NblXlgyM_81UyYNBkcxIWjokDRdWetCeeQfDgA - User-Agent: - - python-requests/2.28.0 - method: GET - uri: http://localhost:8080/health - response: - body: - string: '{"status": "UP"}' - headers: - Content-Language: - - en-us - Content-Length: - - '16' - Content-Type: - - text/html; charset=utf-8 - Date: - - Thu, 01 Jun 2023 21:18:00 GMT - Referrer-Policy: - - same-origin - Server: - - WSGIServer/0.2 CPython/3.8.15 - Set-Cookie: - - sessionid=eyJ1aWQiOiI0MzJmMWRjMC01MGNkLTQyMGEtYjgyYy0wM2JlMjEzOTNlMzYiLCJvcmdhbml6YXRpb25fcGsiOjF9:1q4pgG:wG1DT2Iz8ZHJlPxwIMum_NVMweQyXE7bbbbiX0tNCuQ; - expires=Thu, 15 Jun 2023 21:18:00 GMT; HttpOnly; Max-Age=1209600; Path=/; - SameSite=Lax - Vary: - - Accept-Language, Cookie, Origin - X-Content-Type-Options: - - nosniff - status: - code: 200 - message: OK -- request: - body: null - headers: - Accept: - - '*/*' - Accept-Encoding: - - gzip, deflate - Authorization: - - Token 7b613506d5afa062fe33c9cd825f106c718b82a0 - Connection: - - keep-alive - Cookie: - - 
sessionid=eyJ1aWQiOiI0MzJmMWRjMC01MGNkLTQyMGEtYjgyYy0wM2JlMjEzOTNlMzYiLCJvcmdhbml6YXRpb25fcGsiOjF9:1q4pgG:wG1DT2Iz8ZHJlPxwIMum_NVMweQyXE7bbbbiX0tNCuQ - User-Agent: - - python-requests/2.28.0 - method: GET - uri: http://localhost:8080/api/projects?page_size=10000000 - response: - body: - string: '{"count":1,"next":null,"previous":null,"results":[{"id":23,"title":"Text - Type Classifications","description":"","label_config":"\n \n \n
\n \n \n \n \n \n ","expert_instruction":"","show_instruction":false,"show_skip_button":true,"enable_empty_annotation":true,"show_annotation_history":false,"organization":1,"color":"#FFFFFF","maximum_annotations":1,"is_published":false,"model_version":"","is_draft":false,"created_by":{"id":2,"first_name":"","last_name":"","email":"johnjennings.tutor@gmail.com","avatar":null},"created_at":"2023-06-01T18:31:12.795409Z","min_annotations_to_start_training":0,"start_training_on_annotation_update":false,"show_collab_predictions":true,"num_tasks_with_annotations":0,"task_number":2,"useful_annotation_number":0,"ground_truth_number":0,"skipped_annotations_number":0,"total_annotations_number":0,"total_predictions_number":0,"sampling":"Sequential - sampling","show_ground_truth_first":false,"show_overlap_first":false,"overlap_cohort_percentage":100,"task_data_login":null,"task_data_password":null,"control_weights":{"type":{"overall":1.0,"type":"Choices","labels":{"Title":1.0,"Narrative":1.0}}},"parsed_label_config":{"type":{"type":"Choices","to_name":["text"],"inputs":[{"type":"Text","value":"text"}],"labels":["Title","Narrative"],"labels_attrs":{"Title":{"value":"Title"},"Narrative":{"value":"Narrative"}}}},"evaluate_predictions_automatically":false,"config_has_control_tags":true,"skip_queue":"REQUEUE_FOR_OTHERS","reveal_preannotations_interactively":false,"pinned_at":null,"finished_task_number":0}]}' - headers: - Allow: - - GET, POST, HEAD, OPTIONS - Content-Language: - - en-us - Content-Length: - - '2033' - Content-Type: - - application/json - Date: - - Thu, 01 Jun 2023 21:18:01 GMT - Referrer-Policy: - - same-origin - Server: - - WSGIServer/0.2 CPython/3.8.15 - Set-Cookie: - - sessionid=eyJ1aWQiOiI0MzJmMWRjMC01MGNkLTQyMGEtYjgyYy0wM2JlMjEzOTNlMzYiLCJvcmdhbml6YXRpb25fcGsiOjF9:1q4pgH:WtvRIVQBmnyfp8bWanOG78K14WIsHWPSqq2yt6C8FYU; - expires=Thu, 15 Jun 2023 21:18:01 GMT; HttpOnly; Max-Age=1209600; Path=/; - SameSite=Lax - Vary: - - Accept-Language, Cookie, Origin - 
X-Content-Type-Options: - - nosniff - status: - code: 200 - message: OK -- request: - body: null - headers: - Accept: - - '*/*' - Accept-Encoding: - - gzip, deflate - Authorization: - - Token 7b613506d5afa062fe33c9cd825f106c718b82a0 - Connection: - - keep-alive - Cookie: - - sessionid=eyJ1aWQiOiI0MzJmMWRjMC01MGNkLTQyMGEtYjgyYy0wM2JlMjEzOTNlMzYiLCJvcmdhbml6YXRpb25fcGsiOjF9:1q4pgH:WtvRIVQBmnyfp8bWanOG78K14WIsHWPSqq2yt6C8FYU - User-Agent: - - python-requests/2.28.0 - method: GET - uri: http://localhost:8080/api/projects/23 - response: - body: - string: '{"id":23,"title":"Text Type Classifications","description":"","label_config":"\n \n \n
\n \n \n \n \n \n ","expert_instruction":"","show_instruction":false,"show_skip_button":true,"enable_empty_annotation":true,"show_annotation_history":false,"organization":1,"color":"#FFFFFF","maximum_annotations":1,"is_published":false,"model_version":"","is_draft":false,"created_by":{"id":2,"first_name":"","last_name":"","email":"johnjennings.tutor@gmail.com","avatar":null},"created_at":"2023-06-01T18:31:12.795409Z","min_annotations_to_start_training":0,"start_training_on_annotation_update":false,"show_collab_predictions":true,"num_tasks_with_annotations":0,"task_number":2,"useful_annotation_number":0,"ground_truth_number":0,"skipped_annotations_number":0,"total_annotations_number":0,"total_predictions_number":0,"sampling":"Sequential - sampling","show_ground_truth_first":false,"show_overlap_first":false,"overlap_cohort_percentage":100,"task_data_login":null,"task_data_password":null,"control_weights":{"type":{"overall":1.0,"type":"Choices","labels":{"Title":1.0,"Narrative":1.0}}},"parsed_label_config":{"type":{"type":"Choices","to_name":["text"],"inputs":[{"type":"Text","value":"text"}],"labels":["Title","Narrative"],"labels_attrs":{"Title":{"value":"Title"},"Narrative":{"value":"Narrative"}}}},"evaluate_predictions_automatically":false,"config_has_control_tags":true,"skip_queue":"REQUEUE_FOR_OTHERS","reveal_preannotations_interactively":false,"pinned_at":null,"finished_task_number":0}' - headers: - Allow: - - GET, PUT, PATCH, DELETE, HEAD, OPTIONS - Content-Language: - - en-us - Content-Length: - - '1981' - Content-Type: - - application/json - Date: - - Thu, 01 Jun 2023 21:18:01 GMT - Referrer-Policy: - - same-origin - Server: - - WSGIServer/0.2 CPython/3.8.15 - Set-Cookie: - - sessionid=eyJ1aWQiOiI0MzJmMWRjMC01MGNkLTQyMGEtYjgyYy0wM2JlMjEzOTNlMzYiLCJvcmdhbml6YXRpb25fcGsiOjF9:1q4pgH:WtvRIVQBmnyfp8bWanOG78K14WIsHWPSqq2yt6C8FYU; - expires=Thu, 15 Jun 2023 21:18:01 GMT; HttpOnly; Max-Age=1209600; Path=/; - SameSite=Lax - Vary: - - Accept-Language, Cookie, Origin - 
X-Content-Type-Options: - - nosniff - status: - code: 200 - message: OK -- request: - body: null - headers: - Accept: - - '*/*' - Accept-Encoding: - - gzip, deflate - Authorization: - - Token 7b613506d5afa062fe33c9cd825f106c718b82a0 - Connection: - - keep-alive - Cookie: - - sessionid=eyJ1aWQiOiI0MzJmMWRjMC01MGNkLTQyMGEtYjgyYy0wM2JlMjEzOTNlMzYiLCJvcmdhbml6YXRpb25fcGsiOjF9:1q4pgH:WtvRIVQBmnyfp8bWanOG78K14WIsHWPSqq2yt6C8FYU - User-Agent: - - python-requests/2.28.0 - method: GET - uri: http://localhost:8080/api/projects/23 - response: - body: - string: '{"id":23,"title":"Text Type Classifications","description":"","label_config":"\n \n \n
\n \n \n \n \n \n ","expert_instruction":"","show_instruction":false,"show_skip_button":true,"enable_empty_annotation":true,"show_annotation_history":false,"organization":1,"color":"#FFFFFF","maximum_annotations":1,"is_published":false,"model_version":"","is_draft":false,"created_by":{"id":2,"first_name":"","last_name":"","email":"johnjennings.tutor@gmail.com","avatar":null},"created_at":"2023-06-01T18:31:12.795409Z","min_annotations_to_start_training":0,"start_training_on_annotation_update":false,"show_collab_predictions":true,"num_tasks_with_annotations":0,"task_number":2,"useful_annotation_number":0,"ground_truth_number":0,"skipped_annotations_number":0,"total_annotations_number":0,"total_predictions_number":0,"sampling":"Sequential - sampling","show_ground_truth_first":false,"show_overlap_first":false,"overlap_cohort_percentage":100,"task_data_login":null,"task_data_password":null,"control_weights":{"type":{"overall":1.0,"type":"Choices","labels":{"Title":1.0,"Narrative":1.0}}},"parsed_label_config":{"type":{"type":"Choices","to_name":["text"],"inputs":[{"type":"Text","value":"text"}],"labels":["Title","Narrative"],"labels_attrs":{"Title":{"value":"Title"},"Narrative":{"value":"Narrative"}}}},"evaluate_predictions_automatically":false,"config_has_control_tags":true,"skip_queue":"REQUEUE_FOR_OTHERS","reveal_preannotations_interactively":false,"pinned_at":null,"finished_task_number":0}' - headers: - Allow: - - GET, PUT, PATCH, DELETE, HEAD, OPTIONS - Content-Language: - - en-us - Content-Length: - - '1981' - Content-Type: - - application/json - Date: - - Thu, 01 Jun 2023 21:18:01 GMT - Referrer-Policy: - - same-origin - Server: - - WSGIServer/0.2 CPython/3.8.15 - Set-Cookie: - - sessionid=eyJ1aWQiOiI0MzJmMWRjMC01MGNkLTQyMGEtYjgyYy0wM2JlMjEzOTNlMzYiLCJvcmdhbml6YXRpb25fcGsiOjF9:1q4pgH:WtvRIVQBmnyfp8bWanOG78K14WIsHWPSqq2yt6C8FYU; - expires=Thu, 15 Jun 2023 21:18:01 GMT; HttpOnly; Max-Age=1209600; Path=/; - SameSite=Lax - Vary: - - Accept-Language, Cookie, Origin - 
X-Content-Type-Options: - - nosniff - status: - code: 200 - message: OK -- request: - body: null - headers: - Accept: - - '*/*' - Accept-Encoding: - - gzip, deflate - Authorization: - - Token 7b613506d5afa062fe33c9cd825f106c718b82a0 - Connection: - - keep-alive - Content-Length: - - '0' - Cookie: - - sessionid=eyJ1aWQiOiI0MzJmMWRjMC01MGNkLTQyMGEtYjgyYy0wM2JlMjEzOTNlMzYiLCJvcmdhbml6YXRpb25fcGsiOjF9:1q4pgH:WtvRIVQBmnyfp8bWanOG78K14WIsHWPSqq2yt6C8FYU - User-Agent: - - python-requests/2.28.0 - method: DELETE - uri: http://localhost:8080/api/projects/23/ - response: - body: - string: '' - headers: - Allow: - - GET, PUT, PATCH, DELETE, HEAD, OPTIONS - Content-Language: - - en-us - Content-Length: - - '0' - Date: - - Thu, 01 Jun 2023 21:18:01 GMT - Referrer-Policy: - - same-origin - Server: - - WSGIServer/0.2 CPython/3.8.15 - Set-Cookie: - - sessionid=eyJ1aWQiOiI0MzJmMWRjMC01MGNkLTQyMGEtYjgyYy0wM2JlMjEzOTNlMzYiLCJvcmdhbml6YXRpb25fcGsiOjF9:1q4pgH:WtvRIVQBmnyfp8bWanOG78K14WIsHWPSqq2yt6C8FYU; - expires=Thu, 15 Jun 2023 21:18:01 GMT; HttpOnly; Max-Age=1209600; Path=/; - SameSite=Lax - Vary: - - Accept-Language, Cookie, Origin - X-Content-Type-Options: - - nosniff - status: - code: 204 - message: No Content -- request: - body: '{"title": "Text Type Classifications", "label_config": "\n \n \n \n
\n \n \n \n \n \n \n "}' - headers: - Accept: - - '*/*' - Accept-Encoding: - - gzip, deflate - Authorization: - - Token 7b613506d5afa062fe33c9cd825f106c718b82a0 - Connection: - - keep-alive - Content-Length: - - '591' - Content-Type: - - application/json - Cookie: - - sessionid=eyJ1aWQiOiI0MzJmMWRjMC01MGNkLTQyMGEtYjgyYy0wM2JlMjEzOTNlMzYiLCJvcmdhbml6YXRpb25fcGsiOjF9:1q4pgH:WtvRIVQBmnyfp8bWanOG78K14WIsHWPSqq2yt6C8FYU - User-Agent: - - python-requests/2.28.0 - method: POST - uri: http://localhost:8080/api/projects - response: - body: - string: '{"id":24,"title":"Text Type Classifications","description":"","label_config":"\n \n \n
\n \n \n \n \n \n ","expert_instruction":"","show_instruction":false,"show_skip_button":true,"enable_empty_annotation":true,"show_annotation_history":false,"organization":1,"color":"#FFFFFF","maximum_annotations":1,"is_published":false,"model_version":"","is_draft":false,"created_by":{"id":2,"first_name":"","last_name":"","email":"johnjennings.tutor@gmail.com","avatar":null},"created_at":"2023-06-01T21:18:01.964955Z","min_annotations_to_start_training":0,"start_training_on_annotation_update":false,"show_collab_predictions":true,"num_tasks_with_annotations":null,"task_number":null,"useful_annotation_number":null,"ground_truth_number":null,"skipped_annotations_number":null,"total_annotations_number":null,"total_predictions_number":null,"sampling":"Sequential - sampling","show_ground_truth_first":false,"show_overlap_first":false,"overlap_cohort_percentage":100,"task_data_login":null,"task_data_password":null,"control_weights":{"type":{"overall":1.0,"type":"Choices","labels":{"Title":1.0,"Narrative":1.0}}},"parsed_label_config":{"type":{"type":"Choices","to_name":["text"],"inputs":[{"type":"Text","value":"text"}],"labels":["Title","Narrative"],"labels_attrs":{"Title":{"value":"Title"},"Narrative":{"value":"Narrative"}}}},"evaluate_predictions_automatically":false,"config_has_control_tags":true,"skip_queue":"REQUEUE_FOR_OTHERS","reveal_preannotations_interactively":false,"pinned_at":null,"finished_task_number":null}' - headers: - Allow: - - GET, POST, HEAD, OPTIONS - Content-Language: - - en-us - Content-Length: - - '2005' - Content-Type: - - application/json - Date: - - Thu, 01 Jun 2023 21:18:02 GMT - Referrer-Policy: - - same-origin - Server: - - WSGIServer/0.2 CPython/3.8.15 - Set-Cookie: - - sessionid=eyJ1aWQiOiI0MzJmMWRjMC01MGNkLTQyMGEtYjgyYy0wM2JlMjEzOTNlMzYiLCJvcmdhbml6YXRpb25fcGsiOjF9:1q4pgI:Y2nBj16y8Buj0irVpJeFn0fNguq_rXv9BmdK5o64fsw; - expires=Thu, 15 Jun 2023 21:18:02 GMT; HttpOnly; Max-Age=1209600; Path=/; - SameSite=Lax - Vary: - - Accept-Language, Cookie, 
Origin - X-Content-Type-Options: - - nosniff - status: - code: 201 - message: Created -- request: - body: '[{"data": {"text": "Title 1", "ref_id": "ab03af41c2940e7584b62df48a964db3"}}, - {"data": {"text": "Narrative 1", "ref_id": "ff9eb806beb1f483322f6fbda680b08b"}}]' - headers: - Accept: - - '*/*' - Accept-Encoding: - - gzip, deflate - Authorization: - - Token 7b613506d5afa062fe33c9cd825f106c718b82a0 - Connection: - - keep-alive - Content-Length: - - '158' - Content-Type: - - application/json - Cookie: - - sessionid=eyJ1aWQiOiI0MzJmMWRjMC01MGNkLTQyMGEtYjgyYy0wM2JlMjEzOTNlMzYiLCJvcmdhbml6YXRpb25fcGsiOjF9:1q4pgI:Y2nBj16y8Buj0irVpJeFn0fNguq_rXv9BmdK5o64fsw - User-Agent: - - python-requests/2.28.0 - method: POST - uri: http://localhost:8080/api/projects/24/import?return_task_ids=1 - response: - body: - string: '{"task_count":2,"annotation_count":0,"prediction_count":0,"duration":0.1579442024230957,"file_upload_ids":[],"could_be_tasks_list":false,"found_formats":[],"data_columns":[],"task_ids":[1,2]}' - headers: - Allow: - - POST, OPTIONS - Content-Language: - - en-us - Content-Length: - - '191' - Content-Type: - - application/json - Date: - - Thu, 01 Jun 2023 21:18:02 GMT - Referrer-Policy: - - same-origin - Server: - - WSGIServer/0.2 CPython/3.8.15 - Set-Cookie: - - sessionid=eyJ1aWQiOiI0MzJmMWRjMC01MGNkLTQyMGEtYjgyYy0wM2JlMjEzOTNlMzYiLCJvcmdhbml6YXRpb25fcGsiOjF9:1q4pgI:Y2nBj16y8Buj0irVpJeFn0fNguq_rXv9BmdK5o64fsw; - expires=Thu, 15 Jun 2023 21:18:02 GMT; HttpOnly; Max-Age=1209600; Path=/; - SameSite=Lax - Vary: - - Accept-Language, Cookie, Origin - X-Content-Type-Options: - - nosniff - status: - code: 201 - message: Created -version: 1 diff --git a/unstructured/__version__.py b/unstructured/__version__.py index af66a65e41..033f217a2c 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.17.3" # pragma: no cover +__version__ = "0.17.4" # pragma: no cover diff --git a/unstructured/staging/label_studio.py 
b/unstructured/staging/label_studio.py index 407edcf386..bdb3989e03 100644 --- a/unstructured/staging/label_studio.py +++ b/unstructured/staging/label_studio.py @@ -3,6 +3,7 @@ from typing import Any, Dict, List, Optional, Union from unstructured.documents.elements import Element +from unstructured.logger import logger LABEL_STUDIO_TYPE = List[Dict[str, Dict[str, str]]] @@ -118,6 +119,11 @@ def stage_for_label_studio( ) -> LABEL_STUDIO_TYPE: """Converts the document to the format required for upload to LabelStudio. ref: https://labelstud.io/guide/tasks.html#Example-JSON-format""" + # NOTE(alan): The background for this is that we test this function with the package + # label_studio_sdk, and we're stuck on a version with a high CVE unless we drop to version 1 of + # numpy. The least bad way forward was to deprecate the function, remove the test, and drop the + # dependency. + logger.warning("This function is deprecated, and is unlikely to be maintained in the future.") if annotations is not None and len(elements) != len(annotations): raise ValueError("The length of elements and annotations must match.") if predictions is not None and len(elements) != len(predictions): From 9a239fa18b5bc55bfc81029160c00b0265b9d4b0 Mon Sep 17 00:00:00 2001 From: qued <64741807+qued@users.noreply.github.com> Date: Thu, 27 Mar 2025 13:41:11 -0500 Subject: [PATCH 04/40] build: remove test and dev deps from docker image (#3969) Removed the dependencies contained in `test.txt`, `dev.txt`, and `constraints.txt` from the things that get installed in the docker image. In order to keep testing the image (running the tests), I added a step to the `docker-test` make target to install `test.txt` and `dev.txt`. Thus we presumably get a smaller image (probably not much smaller), reduce the dependency chain or our images, and have less exposure to vulnerabilities while still testing as robustly as before. 
Incidentally, I removed the `Dockerfile` for our ubuntu image, since it made reference to non-existent make targets, which tells me it's stale and wasn't being used. ### Review: - Reviewer should ensure the dev and test dependencies are not being installed in the docker image. One way to check is to check the logs in CI, and note, e.g. that [this](https://github.com/Unstructured-IO/unstructured/actions/runs/14112971425/job/39536304012#step:3:1700) is the first reference to `pytest` in the docker build and test logs, after the image build is completed. - Reviewer should ensure docker image is still being tested in CI and is passing. --- CHANGELOG.md | 10 ++++++++++ Dockerfile | 2 +- Makefile | 3 ++- docker/rockylinux-9.2/Dockerfile | 2 +- docker/ubuntu-22/Dockerfile | 26 -------------------------- unstructured/__version__.py | 2 +- 6 files changed, 15 insertions(+), 30 deletions(-) delete mode 100644 docker/ubuntu-22/Dockerfile diff --git a/CHANGELOG.md b/CHANGELOG.md index 875f098612..17cb66d3a6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,13 @@ +## 0.17.5 + +### Enhancements +- **Remove test and dev dependencies from docker image.** This reduces the docker image size slightly and reduces potential security vulnerabilities. + +### Features + +### Fixes +- **Removed out of date ubuntu Dockerfile.** The Dockerfile was out of date and non-functional. + ## 0.17.4 ### Enhancements diff --git a/Dockerfile b/Dockerfile index 69b96d3e67..e4d7ebd5be 100644 --- a/Dockerfile +++ b/Dockerfile @@ -25,7 +25,7 @@ ENV TESSDATA_PREFIX=/usr/local/share/tessdata ENV NLTK_DATA=/home/notebook-user/nltk_data # Install Python dependencies and download required NLTK packages -RUN find requirements/ -type f -name "*.txt" -exec $PIP install --no-cache-dir --user -r '{}' ';' && \ +RUN find requirements/ -type f -name "*.txt" ! -name "test.txt" ! -name "dev.txt" ! 
-name "constraints.txt" -exec $PIP install --no-cache-dir --user -r '{}' ';' && \ mkdir -p ${NLTK_DATA} && \ $PYTHON -m nltk.downloader -d ${NLTK_DATA} punkt_tab averaged_perceptron_tagger_eng && \ $PYTHON -c "from unstructured.partition.model_init import initialize; initialize()" && \ diff --git a/Makefile b/Makefile index c5208c365c..80600a051a 100644 --- a/Makefile +++ b/Makefile @@ -310,7 +310,8 @@ docker-test: -v ${CURRENT_DIR}/test_unstructured_ingest:/home/notebook-user/test_unstructured_ingest \ $(if $(wildcard uns_test_env_file),--env-file uns_test_env_file,) \ $(DOCKER_IMAGE) \ - bash -c "CI=$(CI) \ + bash -c "pip install -r requirements/test.txt -r requirements/dev.txt && \ + CI=$(CI) \ UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) \ python3 -m pytest $(if $(TEST_FILE),$(TEST_FILE),test_unstructured)" diff --git a/docker/rockylinux-9.2/Dockerfile b/docker/rockylinux-9.2/Dockerfile index 3bce864e37..051294dc96 100644 --- a/docker/rockylinux-9.2/Dockerfile +++ b/docker/rockylinux-9.2/Dockerfile @@ -22,7 +22,7 @@ COPY requirements requirements RUN python3.10 -m pip install pip==${PIP_VERSION} && \ dnf -y groupinstall "Development Tools" && \ - find requirements/ -type f -name "*.txt" -exec python3 -m pip install --no-cache -r '{}' ';' && \ + find requirements/ -type f -name "*.txt" ! -name "test.txt" ! -name "dev.txt" ! 
-name "constraints.txt" -exec python3 -m pip install --no-cache -r '{}' ';' && \ dnf -y groupremove "Development Tools" && \ dnf clean all diff --git a/docker/ubuntu-22/Dockerfile b/docker/ubuntu-22/Dockerfile deleted file mode 100644 index 059bfc85bb..0000000000 --- a/docker/ubuntu-22/Dockerfile +++ /dev/null @@ -1,26 +0,0 @@ -# Dockerfile that approximates the CI image -# -# Mainly useful for updating test-ingest fixtures - -FROM ubuntu:22.04 - -COPY scripts/setup_ubuntu.sh scripts/setup_ubuntu.sh - -RUN bash scripts/setup_ubuntu.sh root - -COPY requirements/ requirements/ -COPY Makefile Makefile - -SHELL ["/bin/bash", "-c"] - -RUN source ~/.bashrc && pyenv virtualenv 3.10 unstructured && \ - source ~/.pyenv/versions/unstructured/bin/activate && \ - make install-ci && \ - make install-ingest-s3 && \ - make install-ingest-azure && \ - make install-ingest-github && \ - make install-ingest-gitlab && \ - make install-ingest-wikipedia && \ - make install-ingest-discord && \ - make install install-ingest-slack && \ - make install-ingest-confluence diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 033f217a2c..b243ca7861 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.17.4" # pragma: no cover +__version__ = "0.17.5" # pragma: no cover From 19fc1fcc72c30dc3618da23dab9bc1c4da79e15a Mon Sep 17 00:00:00 2001 From: cragwolfe Date: Mon, 31 Mar 2025 09:45:01 -0700 Subject: [PATCH 05/40] feat: convenience unstructured-get-json.sh update (#3971) * script now supports: * the --vlm flag, to process the document with the VLM strategy * optionally takes --vlm-model, --vlm-provider args * optionally also writes .html outputs by converting unstructured .json output * optionally opens those .html outputs in a browser Tested with: ``` unstructured-get-json.sh --write-html --open-html --fast layout-parser-paper-p2.pdf unstructured-get-json.sh --write-html --open-html --hi-res layout-parser-paper-p2.pdf 
unstructured-get-json.sh --write-html --open-html --ocr-only layout-parser-paper-p2.pdf unstructured-get-json.sh --write-html --open-html --vlm layout-parser-paper-p2.pdf unstructured-get-json.sh --write-html --open-html --vlm --vlm-provider openai --vlm-model gpt-4o layout-parser-paper-p2.pdf unstructured-get-json.sh --write-html --open-html --vlm --vlm-provider vertexai --vlm-model gemini-2.0-flash-001 layout-parser-paper-p2.pdf unstructured-get-json.sh --write-html --open-html --vlm --vlm-provider anthropic --vlm-model claude-3-5-sonnet-20241022 layout-parser-paper-p2.pdf ``` [layout-parser-paper-p2.pdf](https://github.com/user-attachments/files/19514007/layout-parser-paper-p2.pdf) --- .gitignore | 3 +- CHANGELOG.md | 8 ++ scripts/user/unstructured-get-json.sh | 118 ++++++++++++++++++++++++++ unstructured/__version__.py | 2 +- 4 files changed, 129 insertions(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index e8e4471465..87f4fc72bd 100644 --- a/.gitignore +++ b/.gitignore @@ -208,4 +208,5 @@ outputhtmldiff.txt metricsdiff.txt # analysis -annotated/ \ No newline at end of file +annotated/ +.aider* diff --git a/CHANGELOG.md b/CHANGELOG.md index 17cb66d3a6..ad3afdfc3e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,11 @@ +## 0.17.6-dev0 + +### Enhancements + +### Features + +### Fixes + ## 0.17.5 ### Enhancements diff --git a/scripts/user/unstructured-get-json.sh b/scripts/user/unstructured-get-json.sh index 74ea031390..2ef0ac4eff 100755 --- a/scripts/user/unstructured-get-json.sh +++ b/scripts/user/unstructured-get-json.sh @@ -16,12 +16,20 @@ Options: --hi-res hi_res strategy: Enable high-resolution processing, with layout segmentation and OCR --fast fast strategy: No OCR, just extract embedded text --ocr-only ocr_only strategy: Perform OCR (Optical Character Recognition) only. No layout segmentation. 
+ --vlm vlm strategy: Use Vision Language Model for processing + --vlm-provider Specify the VLM model provider + (see: https://docs.unstructured.io/api-reference/workflow/workflows#vlm-strategy) + --vlm-model Specify the VLM model when using + (see: https://docs.unstructured.io/api-reference/workflow/workflows#vlm-strategy) --tables Enable table extraction: tables are represented as html in metadata --images Include base64images in json --coordinates Include coordinates in the output --trace Enable trace logging for debugging, useful to cut and paste the executed curl call --verbose Enable verbose logging including printing first 8 elements to stdout --s3 Write the resulting output to s3 (like a pastebin) + --write-html Convert JSON output to HTML. Set the env var $UNST_WRITE_HTML to skip providing this option. + --open-html Automatically open HTML output in browser (macOS only) if --write-html. + Set the env var UNST_AUTO_OPEN_HTML=true to skip providing this option. --help Display this help and exit. 
@@ -64,6 +72,7 @@ copy_to_clipboard() { HI_RES=false FAST=false OCR_ONLY=false +VLM=false STRATEGY="" VERBOSE=false TRACE=false @@ -72,6 +81,10 @@ FREEMIUM=false TABLES=true IMAGES=false S3="" +WRITE_HTML=${UNST_WRITE_HTML:-false} +OPEN_HTML=${UNST_AUTO_OPEN_HTML:-false} +VLM_PROVIDER="" +VLM_MODEL="" while [[ "$#" -gt 0 ]]; do case "$1" in @@ -87,6 +100,28 @@ while [[ "$#" -gt 0 ]]; do OCR_ONLY=true shift ;; + --vlm) + VLM=true + shift + ;; + --vlm-provider) + if [ -n "$2" ] && [ "${2:0:1}" != "-" ]; then + VLM_PROVIDER=$2 + shift 2 + else + echo "Error: Argument for $1 is missing" >&2 + exit 1 + fi + ;; + --vlm-model) + if [ -n "$2" ] && [ "${2:0:1}" != "-" ]; then + VLM_MODEL=$2 + shift 2 + else + echo "Error: Argument for $1 is missing" >&2 + exit 1 + fi + ;; --trace) TRACE=true shift @@ -99,6 +134,14 @@ while [[ "$#" -gt 0 ]]; do S3=true shift ;; + --write-html) + WRITE_HTML=true + shift + ;; + --open-html) + OPEN_HTML=true + shift + ;; --tables) TABLES=true shift @@ -140,6 +183,24 @@ if [ -z "$INPUT" ]; then exit 1 fi +# Check for strategy conflicts after all arguments are processed +STRATEGY_COUNT=0 +$HI_RES && STRATEGY_COUNT=$((STRATEGY_COUNT + 1)) +$FAST && STRATEGY_COUNT=$((STRATEGY_COUNT + 1)) +$OCR_ONLY && STRATEGY_COUNT=$((STRATEGY_COUNT + 1)) +$VLM && STRATEGY_COUNT=$((STRATEGY_COUNT + 1)) + +if [ "$STRATEGY_COUNT" -gt 1 ]; then + echo "Error: Only one strategy option (--hi-res, --fast, --ocr-only, --vlm) can be specified at a time." + exit 1 +fi + +# Check if vlm-provider or vlm-model are provided without --vlm +if { [ -n "$VLM_PROVIDER" ] || [ -n "$VLM_MODEL" ]; } && ! $VLM; then + echo "Error: --vlm-provider or --vlm-model can only be used with --vlm strategy." 
+ exit 1 +fi + if $TRACE; then set -x fi @@ -175,6 +236,25 @@ elif $OCR_ONLY; then STRATEGY="-ocr-only" JSON_OUTPUT_FILEPATH=${TMP_OUTPUTS_DIR}/${FILENAME}${STRATEGY}.json CURL_STRATEGY=(-F "strategy=ocr_only") +elif $VLM; then + if $VERBOSE; then echo "Sending API request with vlm strategy"; fi + STRATEGY="-vlm" + # Add provider and model to filename if specified + if [ -n "$VLM_PROVIDER" ] && [ -n "$VLM_MODEL" ]; then + STRATEGY="-vlm-${VLM_PROVIDER}-${VLM_MODEL}" + elif [ -n "$VLM_PROVIDER" ]; then + STRATEGY="-vlm-${VLM_PROVIDER}" + elif [ -n "$VLM_MODEL" ]; then + STRATEGY="-vlm-model-${VLM_MODEL}" + fi + JSON_OUTPUT_FILEPATH=${TMP_OUTPUTS_DIR}/${FILENAME}${STRATEGY}.json + CURL_STRATEGY=(-F "strategy=vlm") + if [ -n "$VLM_PROVIDER" ]; then + CURL_STRATEGY+=(-F "vlm_model_provider=$VLM_PROVIDER") + fi + if [ -n "$VLM_MODEL" ]; then + CURL_STRATEGY+=(-F "vlm_model=$VLM_MODEL") + fi else if $VERBOSE; then echo "Sending API request WITHOUT a strategy"; fi JSON_OUTPUT_FILEPATH=${TMP_OUTPUTS_DIR}/${FILENAME}${STRATEGY}.json @@ -213,6 +293,44 @@ else fi echo "JSON Output file: ${JSON_OUTPUT_FILEPATH}" +# Convert JSON to HTML if requested +if [ "$WRITE_HTML" = true ]; then + HTML_OUTPUT_FILEPATH=${JSON_OUTPUT_FILEPATH%.json}.html + + if $VLM; then + # VLM output has all metadata.text_as_html fields defined, so + # create HTML directly from the metadata.text_as_html fields + { + echo "" + echo "" + echo "" + echo " " + echo " " + echo " Codestin Search App" + echo " " + echo "" + echo "" + jq -r 'map(.metadata.text_as_html) | join("\n")' "${JSON_OUTPUT_FILEPATH}" + echo "" + echo "" + } >"${HTML_OUTPUT_FILEPATH}" + echo "HTML written directly from metadata.text_as_html fields to: ${HTML_OUTPUT_FILEPATH}" + else + # most elements will not have metadata.text_as_html defined (by design on Table elements do), + # so use the unstructured library's python script for the conversion. 
+ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + PYTHONPATH="${SCRIPT_DIR}/../.." python3 "${SCRIPT_DIR}/../html/elements_json_to_html.py" "${JSON_OUTPUT_FILEPATH}" --outdir "${TMP_OUTPUTS_DIR}" + echo "HTML written using Python script to: ${HTML_OUTPUT_FILEPATH}" + fi + + # Open HTML file in browser if requested and on macOS + if [ "$OPEN_HTML" = true ] && [ "$(uname)" == "Darwin" ]; then + open "${HTML_OUTPUT_FILEPATH}" + fi +fi + # write .json output to s3 location if [ -n "$S3" ]; then diff --git a/unstructured/__version__.py b/unstructured/__version__.py index b243ca7861..db302d22ce 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.17.5" # pragma: no cover +__version__ = "0.17.6-dev0" # pragma: no cover From c6b8ed4290891b997a40f4477151a3353753d07e Mon Sep 17 00:00:00 2001 From: cragwolfe Date: Mon, 31 Mar 2025 22:18:57 -0700 Subject: [PATCH 06/40] chore: allow changing default output dir for unstructured-get-json.sh (#3973) --- scripts/user/unstructured-get-json.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/user/unstructured-get-json.sh b/scripts/user/unstructured-get-json.sh index 2ef0ac4eff..4fb21263a3 100755 --- a/scripts/user/unstructured-get-json.sh +++ b/scripts/user/unstructured-get-json.sh @@ -50,8 +50,8 @@ fi IMAGE_BLOCK_TYPES=${IMAGE_BLOCK_TYPES:-'"image", "table"'} API_KEY=${UNST_API_KEY:-""} -TMP_DOWNLOADS_DIR="$HOME/tmp/unst-downloads" -TMP_OUTPUTS_DIR="$HOME/tmp/unst-outputs" +TMP_DOWNLOADS_DIR=${UNST_SCRIPT_DOWNLOADS_DIR:-"$HOME/tmp/unst-downloads"} +TMP_OUTPUTS_DIR=${UNST_SCRIPT_JSON_OUTPUTS_DIR:-"$HOME/tmp/unst-outputs"} # only applicable if writing .json output files to S3 when using --s3, e.g. s3://bucket-name/path/ S3_URI_PREFIX=${UNST_S3_JSON_OUTPUT_URI:-""} # e.g. 
us-east-2, used to provide http links for above location From 8fc41811eb1d425a772b028b1cb01a4d6c90a788 Mon Sep 17 00:00:00 2001 From: cragwolfe Date: Thu, 3 Apr 2025 15:42:25 -0700 Subject: [PATCH 07/40] chore: add html path to ingest-test-fixtures-update-pr (#3977) This should allow the `Ingest Test Fixtures Update PR` workflow to also update expected html outputs. E.g., before the change, the .html files would be left unmodified: ![image](https://github.com/user-attachments/assets/fa14c1a5-39bd-4e32-b4b9-9552eb312de1) https://github.com/Unstructured-IO/unstructured/actions/runs/14234877547/job/39892334672 --- .github/workflows/ingest-test-fixtures-update-pr.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/ingest-test-fixtures-update-pr.yml b/.github/workflows/ingest-test-fixtures-update-pr.yml index 317f46ec0c..33402ae260 100644 --- a/.github/workflows/ingest-test-fixtures-update-pr.yml +++ b/.github/workflows/ingest-test-fixtures-update-pr.yml @@ -139,6 +139,7 @@ jobs: token: ${{ secrets.GH_CREATE_PR_TOKEN }} add-paths: | test_unstructured_ingest/expected-structured-output + test_unstructured_ingest/expected-structured-output-html test_unstructured_ingest/metrics commit-message: "Update ingest test fixtures" branch: ${{ env.BRANCH_NAME }} From dfa17bd3a0c476dce571b8b493dd2ff80ddaebc1 Mon Sep 17 00:00:00 2001 From: cragwolfe Date: Fri, 4 Apr 2025 14:38:23 -0700 Subject: [PATCH 08/40] fix: hi_res PDF parsing: only uncategorized text for extracted elements (#3975) --- CHANGELOG.md | 3 +- .../partition/pdf_image/test_pdf.py | 4 +- test_unstructured/partition/test_msg.py | 2 +- .../biomed-api/65/11/main.PMC6312790.pdf.html | 30 +++---- .../biomed-api/75/29/main.PMC6312793.pdf.html | 28 +++--- .../07/07/sbaa031.073.PMC7234218.pdf.html | 4 +- .../recalibrating-risk-report.pdf.html | 86 +++++++++---------- .../layout-parser-paper-with-table.jpg.html | 4 +- .../layout-parser-paper.pdf.html | 54 ++++++------ 
.../biomed-api/65/11/main.PMC6312790.pdf.json | 20 ++--- .../biomed-api/75/29/main.PMC6312793.pdf.json | 18 ++-- .../07/07/sbaa031.073.PMC7234218.pdf.json | 2 +- .../recalibrating-risk-report.pdf.json | 44 +++++----- .../layout-parser-paper-with-table.jpg.json | 2 +- .../layout-parser-paper.pdf.json | 30 +++---- unstructured/__version__.py | 2 +- unstructured/partition/pdf.py | 5 +- 17 files changed, 171 insertions(+), 167 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ad3afdfc3e..baa69aae9f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,10 +1,11 @@ -## 0.17.6-dev0 +## 0.17.6-dev1 ### Enhancements ### Features ### Fixes +- **Do not use NLP to determine element types for extracted elements with hi_res.** This avoids extraneous Title elements in hi_res outputs. This only applies to *extracted* elements, meaning text objects that are found outside of Object Detection objects which get mapped to *inferred* elements. (*extracted* and *inferred* elements get merged together to form the list of `Element`s returned by `pdf_partition()`) ## 0.17.5 diff --git a/test_unstructured/partition/pdf_image/test_pdf.py b/test_unstructured/partition/pdf_image/test_pdf.py index 6d1145eb80..7a0c8ff29c 100644 --- a/test_unstructured/partition/pdf_image/test_pdf.py +++ b/test_unstructured/partition/pdf_image/test_pdf.py @@ -823,8 +823,8 @@ def test_partition_categorization_backup(): example_doc_path("pdf/layout-parser-paper-fast.pdf"), strategy=PartitionStrategy.HI_RES, ) - # Should have changed the element class from Text to Title - assert isinstance(elements[0], Title) + # Should NOT have changed the element class from Text to Title + assert isinstance(elements[0], Text) assert elements[0].text == text diff --git a/test_unstructured/partition/test_msg.py b/test_unstructured/partition/test_msg.py index d1d66876ed..94b12d5578 100644 --- a/test_unstructured/partition/test_msg.py +++ b/test_unstructured/partition/test_msg.py @@ -141,7 +141,7 @@ def 
test_partition_msg_can_process_attachments(): "Text", "Text", "Image", - "Title", + "Text", "Text", "Title", "Title", diff --git a/test_unstructured_ingest/expected-structured-output-html/biomed-api/65/11/main.PMC6312790.pdf.html b/test_unstructured_ingest/expected-structured-output-html/biomed-api/65/11/main.PMC6312790.pdf.html index a55cccdbbd..210109c06e 100644 --- a/test_unstructured_ingest/expected-structured-output-html/biomed-api/65/11/main.PMC6312790.pdf.html +++ b/test_unstructured_ingest/expected-structured-output-html/biomed-api/65/11/main.PMC6312790.pdf.html @@ -14,9 +14,9 @@

Contents lists available at ScienceDirect

-

+

Data in Brief -

+

journal homepage: www.elsevier.com/locate/dib

@@ -28,19 +28,19 @@

Data on environmental sustainable corrosion inhibitor for stainless steel in aggressive environment

-

+

(Jee -

+

Omotayo Sanni n, Abimbola Patricia I. Popoola

Department of Chemical, Metallurgical and Materials Engineering, Tshwane University of Technology, Pretoria, South Africa

-

+

a r t i c l e i n f o

-

+

a b s t r a c t

@@ -88,19 +88,19 @@

Value of the data

-

+

© Data presented here provide optimum conditions of waste material as inhibitor for stainless steel

  • Type 316 in 0.5 M H2SO4 medium. The given data describe the inhibitive performance of eco-friendly egg shell powder on austenitic stainless steel Type 316 corrosion in sulphuric acid environment.
  • -

    +

    © The data obtained for the inhibition of waste product (egg shell powder) on stainless steel Type 316

  • can be used as basis in determining the inhibitive performance of the same inhibitor in other environments.
  • -

    +

    © The data can be used to examine the relationship between the process variable as it affect the

  • @@ -152,9 +152,9 @@

    Inhibitor be (V/dec) ba (V/dec) Ecorr (V) icorr (A/cm?) Polarization Corrosion concentration (g) resistance (Q) rate (mm/year) oO 0.0335 0.0409 —0.9393 0.0003 24.0910 2.8163 2 1.9460 0.0596 —0.8276 0.0002 121.440 1.5054 4 0.0163 0.2369 —0.8825 0.0001 42.121 0.9476 6 0.3233 0.0540 —0.8027 5.39E-05 373.180 0.4318 8 0.1240 0.0556 —0.5896 5.46E-05 305.650 0.3772 10 0.0382 0.0086 —0.5356 1.24E-05 246.080 0.0919
    -

    +

    rate (mm/year) -

    +

    The plot of inhibitor concentration over degree of surface coverage versus inhibitor concentration gives a straight line as shown in Fig. 5. The strong correlation reveals that egg shell adsorption on stainless surface in 0.5 M H2SO4 follow Langmuir adsorption isotherm. Figs. 6–8 show the SEM/EDX surface morphology analysis of stainless steel. Figs. 7 and 8 are the SEM/EDX images of the stainless steel specimens without and with inhibitor after weight loss experiment in sulphuric acid medium. The stainless steel surface corrosion product layer in the absence of inhibitor was porous and as a result gives no corrosion protection. With the presence of ES, corrosion damage was minimized, with an evidence of ES present on the metal surface as shown in Fig. 8.

    @@ -232,12 +232,12 @@

    The potentiodynamic polarization method was performed on the prepared test samples immersed in 0.5 M H2SO4 solution in the presence and absence of different ES concentrations. A three electrode system was used; stainless steel Type 316 plate as working electrode with an exposed area of 1.0 cm2, platinum rod as counter electrode and silver chloride electrode as reference electrode. The electrode was polished, degreased in acetone and thoroughly rinsed with distilled water before the experiment. Current density against applied potential was plotted. The slope of the linear part in anodic and cathodic plots gives anodic and cathodic constants according to the Stern–Geary equation, and the

    -

    +

    ð2à -

    -

    +

    +

    ð3à -

    +

    O. Sanni, A.P.I. Popoola / Data in Brief 22 (2019) 451–457
    diff --git a/test_unstructured_ingest/expected-structured-output-html/biomed-api/75/29/main.PMC6312793.pdf.html b/test_unstructured_ingest/expected-structured-output-html/biomed-api/75/29/main.PMC6312793.pdf.html index bb95afd2b2..aabc7233cc 100644 --- a/test_unstructured_ingest/expected-structured-output-html/biomed-api/75/29/main.PMC6312793.pdf.html +++ b/test_unstructured_ingest/expected-structured-output-html/biomed-api/75/29/main.PMC6312793.pdf.html @@ -14,9 +14,9 @@

    Contents lists available at ScienceDirect

    -

    +

    Data in Brief -

    +

    journal homepage: www.elsevier.com/locate/dib

    @@ -28,9 +28,9 @@

    A benchmark dataset for the multiple depot vehicle scheduling problem

    -

    +

    (eee -

    +

    Sarang Kulkarni a,b,c,n, Mohan Krishnamoorthy d,e, Abhiram Ranade f, Andreas T. Ernst c, Rahul Patil b

    @@ -52,16 +52,16 @@

    e School of Information Technology and Electrical Engineering, The University of Queensland, QLD 4072,

    -

    +

    Australia -

    +

    f Department of Computer Science and Engineering, IIT Bombay, Powai, Mumbai 400076, India

    -

    +

    a r t i c l e i n f o

    -

    +

    a b s t r a c t

    @@ -106,13 +106,13 @@

  • © The data provide all the information that is required to model the MDVSP by using the existing mathematical formulations.
  • -

    +

    e All the problem instances are available for use without any restrictions.

  • e The benchmark solutions and solution time for the problem instances are presented in [3] and can be used for the comparison.
  • -

    +

    © The dataset includes a program that can generate similar problem instances of different sizes.

    @@ -121,9 +121,9 @@

    The dataset contains 60 different problem instances of the multiple depot vehicle scheduling pro- blem (MDVSP). Each problem instance is provided in a separate ï¬le. Each ï¬le is named as ‘RN-m-n-k.dat’, where ‘m’, ‘n’, and ‘k’ denote the number of depots, the number of trips, and the instance number for the size, â€˜Ă°m;nĂ’, respectively. For example, the problem instance, ‘RN-8–1500-01.dat’, is the ï¬rst problem instance with 8 depots and 1500 trips. For the number of depots, m, we used three values, 8,12, and 16. The four values for the number of trips, n, are 1500, 2000, 2500, and 3000. For each size, ðm;nĂ, ï¬ve instances are provided. The dataset can be downloaded from https://orlib.uqcloud.net. For each problem instance, the following information is provided:

    -

    +

    The number of depots mð -

    +

    Ă,

    @@ -187,9 +187,9 @@

    Instance size (m, n) Average number of Locations Times Vehicles (8, 1500) 568.40 975.20 652.20 668,279.40 (8, 2000) 672.80 1048.00 857.20 1,195,844.80 (8, 2500) 923.40 1078.00 1082.40 1,866,175.20 (8, 3000) 977.00 1113.20 1272.80 2,705,617.00 (12, 1500) 566.00 994.00 642.00 674,191.00 (12, 2000) 732.60 1040.60 861.20 1,199,659.80 (12, 2500) 875.00 1081.00 1096.00 1,878,745.20 (12, 3000) 1119.60 1107.40 1286.20 2,711,180.40 (16, 1500) 581.80 985.40 667.80 673,585.80 (16, 2000) 778.00 1040.60 872.40 1,200,560.80 (16, 2500) 879.00 1083.20 1076.40 1,879,387.00 (16, 3000) 1087.20 1101.60 1284.60 2,684,983.60
    -

    +

    Possible empty travels -

    +

    S. Kulkarni et al. / Data in Brief 22 (2019) 484–487
    diff --git a/test_unstructured_ingest/expected-structured-output-html/biomed-path/07/07/sbaa031.073.PMC7234218.pdf.html b/test_unstructured_ingest/expected-structured-output-html/biomed-path/07/07/sbaa031.073.PMC7234218.pdf.html index 0862a71a27..eabce53c29 100644 --- a/test_unstructured_ingest/expected-structured-output-html/biomed-path/07/07/sbaa031.073.PMC7234218.pdf.html +++ b/test_unstructured_ingest/expected-structured-output-html/biomed-path/07/07/sbaa031.073.PMC7234218.pdf.html @@ -76,8 +76,8 @@

    Camila Loureiro*1, Corsi-Zuelli Fabiana1, Fachim Helene Aparecida1, Shuhama Rosana1, Menezes Paulo Rossi1, Dalton Caroline F2,

    -

    +

    AQ3 -

    +

    diff --git a/test_unstructured_ingest/expected-structured-output-html/google-drive/recalibrating-risk-report.pdf.html b/test_unstructured_ingest/expected-structured-output-html/google-drive/recalibrating-risk-report.pdf.html index c17be23f5a..517f7a3608 100644 --- a/test_unstructured_ingest/expected-structured-output-html/google-drive/recalibrating-risk-report.pdf.html +++ b/test_unstructured_ingest/expected-structured-output-html/google-drive/recalibrating-risk-report.pdf.html @@ -11,7 +11,7 @@

    WORLD ASSOCIATION

    -

    +

    Recalibrating risk

    @@ -89,69 +89,69 @@

    In terms of accidents, hydropower is the deadliest electricity generator, mostly due to collapsing dams and the consequences of flooding. The Banqiao Dam failure in 1975 led to at least 26,000 people drowning, and as many as 150,000 deaths resulting from the secondary effects of the accident. In comparison, radiation exposure following Chernobyl caused 54 deaths2, while no casualties due to radiation are likely to occur from the accident at Fukushima Daiichi.

    25  24.6  20  18.4  e  15  10  5  4.6  2.8  0  Coal  Oil  Bio m ass  Natural gas  0.07  Wind  0.04  Hydropower  0.02  Solar  0.01  Nuclear -

    +

    r -

    -

    +

    +

    a -

    -

    +

    +

    e -

    -

    +

    +

    y -

    -

    +

    +

    W -

    -

    +

    +

    T -

    -

    +

    +

    r -

    -

    +

    +

    e -

    -

    +

    +

    p -

    -

    +

    +

    s -

    +

    8

    -

    +

    e -

    -

    +

    +

    i -

    -

    +

    +

    t -

    -

    +

    +

    i -

    -

    +

    +

    l -

    -

    +

    +

    S -

    -

    +

    +

    a -

    -

    +

    +

    t -

    -

    +

    +

    a -

    -

    +

    +

    F -

    +

    Figure 3. Comparison of number of fatalities due to electricity generation, including accidents and air pollution3

    @@ -251,9 +251,9 @@

  • World Health Organization (2020). Road traffic injuries. Available at: https://www.who.int/news-room/fact-sheets/ detail/road-traffic-injuries
  • -

    +

    i -

    +

  • ii BBC (2020). Plane crash fatalities fell more than 50% in 2019. Available at: https://www.bbc.co.uk/news/ business-50953712
  • diff --git a/test_unstructured_ingest/expected-structured-output-html/local-single-file-with-pdf-infer-table-structure/layout-parser-paper-with-table.jpg.html b/test_unstructured_ingest/expected-structured-output-html/local-single-file-with-pdf-infer-table-structure/layout-parser-paper-with-table.jpg.html index ccc0784c71..dbf342486a 100644 --- a/test_unstructured_ingest/expected-structured-output-html/local-single-file-with-pdf-infer-table-structure/layout-parser-paper-with-table.jpg.html +++ b/test_unstructured_ingest/expected-structured-output-html/local-single-file-with-pdf-infer-table-structure/layout-parser-paper-with-table.jpg.html @@ -114,9 +114,9 @@

  • import layoutparser as lp
  • -

    +

    wwe -

    +

  • image = cv2.imread("image_file") # load images
  • diff --git a/test_unstructured_ingest/expected-structured-output-html/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.html b/test_unstructured_ingest/expected-structured-output-html/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.html index 84e2672182..eca4025c8d 100644 --- a/test_unstructured_ingest/expected-structured-output-html/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.html +++ b/test_unstructured_ingest/expected-structured-output-html/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.html @@ -22,24 +22,24 @@
    2 n u J 1 2 ] V C . s c [ 2 v 8 4 3 5 1 . 3 0 1 2 :
    -

    +

    v -

    -

    +

    +

    arXiv -

    -

    +

    +

    i -

    -

    +

    +

    X -

    -

    +

    +

    r -

    -

    +

    +

    a -

    +

    LayoutParser: A Uniï¬ed Toolkit for Deep Learning Based Document Image Analysis

    @@ -115,28 +115,28 @@

    -

    +

    7 https://ocr-d.de/en/about -

    -

    +

    +

    8 https://github.com/BobLd/DocumentLayoutAnalysis -

    -

    +

    +

    9 https://github.com/leonlulu/DeepLayout -

    -

    +

    +

    10 https://github.com/hpanwar08/detectron2 -

    -

    +

    +

    11 https://github.com/JaidedAI/EasyOCR -

    -

    +

    +

    12 https://github.com/PaddlePaddle/PaddleOCR -

    +

    4

    -

    +

    Z. Shen et al.

    Efficient Data Annotation Model Customization Document Images Community Platform ‘a >) ¥ DIA Model Hub i .) Customized Model Training] == | Layout Detection Models | ——= DIA Pipeline Sharing ~ OCR Module = { Layout Data stuctue ) = (storage Visualization VY @@ -263,7 +263,7 @@

    6

    -

    +

    Z. Shen et al.

    - ° . 3 a a 4 a 3 oo er ‘ 2 § 8 a 8 3 3 ‘ £ 4 A g a 9 ‘ 3 ¥ Coordinate g 4 5 3 + § 3 H Extra Features [O=") [Bo] eaing i Text | | Type | | ower ° & a ¢ o [ coordinatel textblock1, 3 3 ’ g Q 3 , textblock2 , layoutl ] 4 q ® A list of the layout elements Ff @@ -303,7 +303,7 @@

    LayoutParser also comes with a DL-based CNN-RNN OCR model [6] trained with the Connectionist Temporal Classiï¬cation (CTC) loss [10]. It can be used like the other OCR modules, and can be easily trained on customized datasets.

    -

    +

    13 This is also available in the LayoutParser documentation pages.

  • diff --git a/test_unstructured_ingest/expected-structured-output/biomed-api/65/11/main.PMC6312790.pdf.json b/test_unstructured_ingest/expected-structured-output/biomed-api/65/11/main.PMC6312790.pdf.json index 6f6c30b2a8..c26c406734 100644 --- a/test_unstructured_ingest/expected-structured-output/biomed-api/65/11/main.PMC6312790.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/biomed-api/65/11/main.PMC6312790.pdf.json @@ -63,7 +63,7 @@ "page_number": 1 }, "text": "Data in Brief", - "type": "Title" + "type": "UncategorizedText" }, { "element_id": "97e80c6e7dc2754c9083b263ff65039e", @@ -148,7 +148,7 @@ "page_number": 1 }, "text": "(Jee", - "type": "Title" + "type": "UncategorizedText" }, { "element_id": "bddd1cbc864e9b44cc0715a1cccf8dbc", @@ -187,7 +187,7 @@ "page_number": 1 }, "text": "a r t i c l e i n f o", - "type": "NarrativeText" + "type": "UncategorizedText" }, { "element_id": "b9e48f235de5b531427187eb6ea135fe", @@ -200,7 +200,7 @@ "page_number": 1 }, "text": "a b s t r a c t", - "type": "NarrativeText" + "type": "UncategorizedText" }, { "element_id": "911bfead9b546998812e2d1d615ecc87", @@ -432,7 +432,7 @@ "page_number": 2 }, "text": "© Data presented here provide optimum conditions of waste material as inhibitor for stainless steel", - "type": "NarrativeText" + "type": "UncategorizedText" }, { "element_id": "afed004de4c50d761640b6c18729a988", @@ -458,7 +458,7 @@ "page_number": 2 }, "text": "© The data obtained for the inhibition of waste product (egg shell powder) on stainless steel Type 316", - "type": "NarrativeText" + "type": "UncategorizedText" }, { "element_id": "cb6e8acb9c24820b59f8973cc236ef35", @@ -484,7 +484,7 @@ "page_number": 2 }, "text": "© The data can be used to examine the relationship between the process variable as it affect the", - "type": "NarrativeText" + "type": "UncategorizedText" }, { "element_id": "e1f7e635d8739a97d8d0000ba8004f61", @@ -744,7 +744,7 @@ "page_number": 4 }, "text": "rate (mm/year)", - "type": "Title" 
+ "type": "UncategorizedText" }, { "element_id": "3a5534c2aafc2d8a4c0b65d530d00ab3", @@ -1134,7 +1134,7 @@ "page_number": 6 }, "text": "ð2Ă", - "type": "Title" + "type": "UncategorizedText" }, { "element_id": "cff55ae1916232dbda5239f59c897cb9", @@ -1147,7 +1147,7 @@ "page_number": 6 }, "text": "ð3Ă", - "type": "Title" + "type": "UncategorizedText" }, { "element_id": "e40c3ee561b10ca5b7a76900c8d5b263", diff --git a/test_unstructured_ingest/expected-structured-output/biomed-api/75/29/main.PMC6312793.pdf.json b/test_unstructured_ingest/expected-structured-output/biomed-api/75/29/main.PMC6312793.pdf.json index 1fab6122c1..17e0923127 100644 --- a/test_unstructured_ingest/expected-structured-output/biomed-api/75/29/main.PMC6312793.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/biomed-api/75/29/main.PMC6312793.pdf.json @@ -63,7 +63,7 @@ "page_number": 1 }, "text": "Data in Brief", - "type": "Title" + "type": "UncategorizedText" }, { "element_id": "c1b3d4f53698b892fcc23fc10a72e6fb", @@ -148,7 +148,7 @@ "page_number": 1 }, "text": "(eee", - "type": "Title" + "type": "UncategorizedText" }, { "element_id": "0cda4eb20070fdf01ec0d47b2a550241", @@ -252,7 +252,7 @@ "page_number": 1 }, "text": "Australia", - "type": "Title" + "type": "UncategorizedText" }, { "element_id": "85875ebbc1de554e92edc54674add1d5", @@ -278,7 +278,7 @@ "page_number": 1 }, "text": "a r t i c l e i n f o", - "type": "NarrativeText" + "type": "UncategorizedText" }, { "element_id": "4f3f69dd17ddae776c656ec73d9837ae", @@ -291,7 +291,7 @@ "page_number": 1 }, "text": "a b s t r a c t", - "type": "NarrativeText" + "type": "UncategorizedText" }, { "element_id": "34522460857b10c63d8c2c8d2fbb3087", @@ -534,7 +534,7 @@ "page_number": 2 }, "text": "e All the problem instances are available for use without any restrictions.", - "type": "NarrativeText" + "type": "UncategorizedText" }, { "element_id": "d401597b8ff2854bfb89f2833d02a763", @@ -560,7 +560,7 @@ "page_number": 2 }, "text": "© The dataset 
includes a program that can generate similar problem instances of different sizes.", - "type": "NarrativeText" + "type": "UncategorizedText" }, { "element_id": "fb765d6762e6a423cb8b9dab27359732", @@ -606,7 +606,7 @@ "page_number": 2 }, "text": "The number of depots mð", - "type": "Title" + "type": "UncategorizedText" }, { "element_id": "320f6d28582c354d35673c2a4119851f", @@ -892,7 +892,7 @@ "page_number": 3 }, "text": "Possible empty travels", - "type": "Title" + "type": "UncategorizedText" }, { "element_id": "fa23407a7c3c99ae3b6fb79034698807", diff --git a/test_unstructured_ingest/expected-structured-output/biomed-path/07/07/sbaa031.073.PMC7234218.pdf.json b/test_unstructured_ingest/expected-structured-output/biomed-path/07/07/sbaa031.073.PMC7234218.pdf.json index 3641fcd434..67cd5fb088 100644 --- a/test_unstructured_ingest/expected-structured-output/biomed-path/07/07/sbaa031.073.PMC7234218.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/biomed-path/07/07/sbaa031.073.PMC7234218.pdf.json @@ -309,6 +309,6 @@ "page_number": 1 }, "text": "AQ3", - "type": "Title" + "type": "UncategorizedText" } ] \ No newline at end of file diff --git a/test_unstructured_ingest/expected-structured-output/google-drive/recalibrating-risk-report.pdf.json b/test_unstructured_ingest/expected-structured-output/google-drive/recalibrating-risk-report.pdf.json index 49e17cb5fc..6e7d6aa5f3 100644 --- a/test_unstructured_ingest/expected-structured-output/google-drive/recalibrating-risk-report.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/google-drive/recalibrating-risk-report.pdf.json @@ -186,7 +186,7 @@ } }, { - "type": "NarrativeText", + "type": "UncategorizedText", "element_id": "7137c1e14141fad3ad306fe68918a967", "text": "Recalibrating risk", "metadata": { @@ -2790,7 +2790,7 @@ } }, { - "type": "Title", + "type": "UncategorizedText", "element_id": "a8706e82b3f90cffc996a24348e3b670", "text": "r", "metadata": { @@ -2883,7 +2883,7 @@ } }, { - "type": 
"Title", + "type": "UncategorizedText", "element_id": "da631c23500655c51b9311a61f55744f", "text": "a", "metadata": { @@ -2976,7 +2976,7 @@ } }, { - "type": "Title", + "type": "UncategorizedText", "element_id": "d78a11e9e55235934c3a4922053c68e5", "text": "e", "metadata": { @@ -3069,7 +3069,7 @@ } }, { - "type": "Title", + "type": "UncategorizedText", "element_id": "8d14df8b7fd7744365fbf8e02d69415a", "text": "y", "metadata": { @@ -3162,7 +3162,7 @@ } }, { - "type": "Title", + "type": "UncategorizedText", "element_id": "f4df01bee1b8ffb973ac8539649c5189", "text": "W", "metadata": { @@ -3255,7 +3255,7 @@ } }, { - "type": "Title", + "type": "UncategorizedText", "element_id": "b733cf49de269e22bed7c9883b958669", "text": "T", "metadata": { @@ -3348,7 +3348,7 @@ } }, { - "type": "Title", + "type": "UncategorizedText", "element_id": "c4b47d788b26c3d5c62ad462ed3ca2db", "text": "r", "metadata": { @@ -3441,7 +3441,7 @@ } }, { - "type": "Title", + "type": "UncategorizedText", "element_id": "bff4435574259239761670b31432cc8a", "text": "e", "metadata": { @@ -3534,7 +3534,7 @@ } }, { - "type": "Title", + "type": "UncategorizedText", "element_id": "8ba15a3a71eb0bb689c582098cce6730", "text": "p", "metadata": { @@ -3627,7 +3627,7 @@ } }, { - "type": "Title", + "type": "UncategorizedText", "element_id": "5fde097ba00ad7647206ae11c721d28c", "text": "s", "metadata": { @@ -3813,7 +3813,7 @@ } }, { - "type": "Title", + "type": "UncategorizedText", "element_id": "81f1f3b9da6df38d938bf7871fa069b5", "text": "e", "metadata": { @@ -3906,7 +3906,7 @@ } }, { - "type": "Title", + "type": "UncategorizedText", "element_id": "aa4a79651a9a0087b66fcc40a2213113", "text": "i", "metadata": { @@ -3999,7 +3999,7 @@ } }, { - "type": "Title", + "type": "UncategorizedText", "element_id": "6d1c0d05d3a424b43d9572188a76c2d4", "text": "t", "metadata": { @@ -4092,7 +4092,7 @@ } }, { - "type": "Title", + "type": "UncategorizedText", "element_id": "392a17b2f3eba46f4bcf078e0b204514", "text": "i", "metadata": { @@ -4185,7 
+4185,7 @@ } }, { - "type": "Title", + "type": "UncategorizedText", "element_id": "d24a9a771e46fdd6b269f1ecaf0b5eec", "text": "l", "metadata": { @@ -4278,7 +4278,7 @@ } }, { - "type": "Title", + "type": "UncategorizedText", "element_id": "9dc4537afa8ae0b959a542f9ba5c1e03", "text": "S", "metadata": { @@ -4371,7 +4371,7 @@ } }, { - "type": "Title", + "type": "UncategorizedText", "element_id": "919dac2487a4c860747318a132a54a72", "text": "a", "metadata": { @@ -4464,7 +4464,7 @@ } }, { - "type": "Title", + "type": "UncategorizedText", "element_id": "04ee5d05c3fcfffd945762e803478600", "text": "t", "metadata": { @@ -4557,7 +4557,7 @@ } }, { - "type": "Title", + "type": "UncategorizedText", "element_id": "63dabde368e2cf310d20a885fe50314a", "text": "a", "metadata": { @@ -4650,7 +4650,7 @@ } }, { - "type": "Title", + "type": "UncategorizedText", "element_id": "796538927664e4d87312c428469428f5", "text": "F", "metadata": { @@ -8184,7 +8184,7 @@ } }, { - "type": "Title", + "type": "UncategorizedText", "element_id": "a95a2add68d668b944cc332c88ea721e", "text": "i", "metadata": { diff --git a/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper-with-table.jpg.json b/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper-with-table.jpg.json index 147e62d128..c71cf50967 100644 --- a/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper-with-table.jpg.json +++ b/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper-with-table.jpg.json @@ -177,7 +177,7 @@ } }, { - "type": "Title", + "type": "UncategorizedText", "element_id": "9d40bf1b2e2af1692f5689a1c44ab2ae", "text": "wwe", "metadata": { diff --git 
a/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json b/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json index b9d9f35d17..3f42ca335d 100644 --- a/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json @@ -110,7 +110,7 @@ } }, { - "type": "Title", + "type": "UncategorizedText", "element_id": "4608f9aa33a0cab158565817b0d15743", "text": "v", "metadata": { @@ -132,7 +132,7 @@ } }, { - "type": "Title", + "type": "UncategorizedText", "element_id": "6f69e5f921907e689f1a52bd84282b31", "text": "arXiv", "metadata": { @@ -154,7 +154,7 @@ } }, { - "type": "Title", + "type": "UncategorizedText", "element_id": "ed4e590932b333f40d0e1367b6b0e32e", "text": "i", "metadata": { @@ -176,7 +176,7 @@ } }, { - "type": "Title", + "type": "UncategorizedText", "element_id": "8cb024fb60457b7c572b167801037f75", "text": "X", "metadata": { @@ -198,7 +198,7 @@ } }, { - "type": "Title", + "type": "UncategorizedText", "element_id": "c202bdacd2daf4c52fa3a6ddd64a0728", "text": "r", "metadata": { @@ -220,7 +220,7 @@ } }, { - "type": "Title", + "type": "UncategorizedText", "element_id": "3db474893ec321c81ef9d1a2afd5f660", "text": "a", "metadata": { @@ -1022,7 +1022,7 @@ } }, { - "type": "Title", + "type": "UncategorizedText", "element_id": "db639db124b6064248de0c0dc71510a4", "text": "7 https://ocr-d.de/en/about", "metadata": { @@ -1044,7 +1044,7 @@ } }, { - "type": "Title", + "type": "UncategorizedText", "element_id": "d881ce84f017d89f6e35e2bc4b133bfc", "text": "8 https://github.com/BobLd/DocumentLayoutAnalysis", "metadata": { @@ -1066,7 +1066,7 @@ } }, { - "type": "Title", + "type": "UncategorizedText", "element_id": 
"9b96c128deddda1a32c739a2df157496", "text": "9 https://github.com/leonlulu/DeepLayout", "metadata": { @@ -1088,7 +1088,7 @@ } }, { - "type": "Title", + "type": "UncategorizedText", "element_id": "5cf72e821375f4480a1529bef97608ef", "text": "10 https://github.com/hpanwar08/detectron2", "metadata": { @@ -1110,7 +1110,7 @@ } }, { - "type": "Title", + "type": "UncategorizedText", "element_id": "4ab94e79eedc3a7ac498aaf737ca8878", "text": "11 https://github.com/JaidedAI/EasyOCR", "metadata": { @@ -1132,7 +1132,7 @@ } }, { - "type": "Title", + "type": "UncategorizedText", "element_id": "460b163c13ad7cad4fce325820a76481", "text": "12 https://github.com/PaddlePaddle/PaddleOCR", "metadata": { @@ -1176,7 +1176,7 @@ } }, { - "type": "NarrativeText", + "type": "UncategorizedText", "element_id": "92c4289ad4af7c0793e40d5662707e0a", "text": "Z. Shen et al.", "metadata": { @@ -1739,7 +1739,7 @@ } }, { - "type": "NarrativeText", + "type": "UncategorizedText", "element_id": "710ac103981c6363195774b02ee582d4", "text": "Z. 
Shen et al.", "metadata": { @@ -2083,7 +2083,7 @@ } }, { - "type": "NarrativeText", + "type": "UncategorizedText", "element_id": "a2a0a2ef0279f0710f3cd34474ca8645", "text": "13 This is also available in the LayoutParser documentation pages.", "metadata": { diff --git a/unstructured/__version__.py b/unstructured/__version__.py index db302d22ce..1c6678160c 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.17.6-dev0" # pragma: no cover +__version__ = "0.17.6-dev1" # pragma: no cover diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py index e0e64854d4..d38658ed64 100644 --- a/unstructured/partition/pdf.py +++ b/unstructured/partition/pdf.py @@ -362,7 +362,10 @@ def partition_pdf_or_image( table_ocr_agent=table_ocr_agent, **kwargs, ) - out_elements = _process_uncategorized_text_elements(elements) + # NOTE(crag): do not call _process_uncategorized_text_elements here, because + # extracted elements (which are text blocks outside of OD-determined blocks) + # are likely not Titles and should not be identified as such. + return elements elif strategy == PartitionStrategy.FAST: out_elements = _partition_pdf_with_pdfparser( From d570f4624bb8c5dc75f0009775925fdaa40defb0 Mon Sep 17 00:00:00 2001 From: Philippe PRADOS Date: Mon, 7 Apr 2025 17:57:20 +0200 Subject: [PATCH 09/40] Fix sort_page_element. ensures that sorting is stable and not random. (#3978) The sort_page_element() use the element id to sort the elements. Two executions of the same code, on the same file, produce different results. The order of the elements is random. This makes it impossible to write stable unit tests, for example, or to obtain reproducible results. 
--- CHANGELOG.md | 3 +++ .../partition/pdf_image/test_pdf.py | 21 +++++++++++++++++++ unstructured/partition/utils/sorting.py | 1 - 3 files changed, 24 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index baa69aae9f..4da58bbc9e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,9 @@ ### Features ### Fixes +- The sort_page_element() use the element id to sort the elements. +Two executions of the same code, on the same file, produce different results. The order of the elements is random. +This makes it impossible to write stable unit tests, for example, or to obtain reproducible results. - **Do not use NLP to determine element types for extracted elements with hi_res.** This avoids extraneous Title elements in hi_res outputs. This only applies to *extracted* elements, meaning text objects that are found outside of Object Detection objects which get mapped to *inferred* elements. (*extracted* and *inferred* elements get merged together to form the list of `Element`s returned by `pdf_partition()`) ## 0.17.5 diff --git a/test_unstructured/partition/pdf_image/test_pdf.py b/test_unstructured/partition/pdf_image/test_pdf.py index 7a0c8ff29c..70eec35fd7 100644 --- a/test_unstructured/partition/pdf_image/test_pdf.py +++ b/test_unstructured/partition/pdf_image/test_pdf.py @@ -1603,3 +1603,24 @@ def test_partition_pdf_with_specified_ocr_agents(mocker): assert spy.call_args_list[0][1] == {"language": "eng", "ocr_agent_module": OCR_AGENT_TESSERACT} assert spy.call_args_list[1][1] == {"language": "en", "ocr_agent_module": OCR_AGENT_PADDLE} + + +def test_reproductible_pdf_loader(): + from glob import glob + + for f in glob(example_doc_path("pdf/layout-parser-paper.pdf")): + elements_1 = pdf.partition_pdf( + filename=f, + strategy=PartitionStrategy.AUTO, + infer_table_structure=False, + ) + for _ in range(4): + elements_2 = pdf.partition_pdf( + filename=f, + strategy=PartitionStrategy.AUTO, + infer_table_structure=False, + ) + for e1, e2 in 
zip(elements_1, elements_2): + assert e1.text == e2.text, f"load two time {f=} return differents results" + else: + break diff --git a/unstructured/partition/utils/sorting.py b/unstructured/partition/utils/sorting.py index 8cdc885dd1..59d550958b 100644 --- a/unstructured/partition/utils/sorting.py +++ b/unstructured/partition/utils/sorting.py @@ -179,7 +179,6 @@ def _coords_ok(strict_points: bool): key=lambda el: ( el.metadata.coordinates.points[0][1] if el.metadata.coordinates else float("inf"), el.metadata.coordinates.points[0][0] if el.metadata.coordinates else float("inf"), - el.id, ), ) else: From 27f503ce3131ee01006205124c2e6484cf0510c5 Mon Sep 17 00:00:00 2001 From: Nathan <168383951+Nathan-GoSupply@users.noreply.github.com> Date: Tue, 8 Apr 2025 17:47:24 +1000 Subject: [PATCH 10/40] Update pdfminer_utils.py (#3974) Fix for 'PSSyntaxError' import error: "cannot import name 'PSSyntaxError' from 'pdfminer.pdfparser'" Latest pdfminer-six doesn't import PSSyntaxError into `pdfminer.pdfparser` anymore. It must now be directly imported from its source (`pdfminer.psexceptions`) --- CHANGELOG.md | 1 + unstructured/partition/pdf_image/pdfminer_utils.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4da58bbc9e..62ae488af3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,6 +19,7 @@ This makes it impossible to write stable unit tests, for example, or to obtain r ### Fixes - **Removed out of date ubuntu Dockerfile.** The Dockerfile was out of date and non-functional. +- **Fix for 'PSSyntaxError' import error: "cannot import name 'PSSyntaxError' from 'pdfminer.pdfparser'"** PSSyntaxError needed to be imported from its source 'pdfminer.psexceptions'. 
## 0.17.4 diff --git a/unstructured/partition/pdf_image/pdfminer_utils.py b/unstructured/partition/pdf_image/pdfminer_utils.py index ad6f981914..3993f41ae0 100644 --- a/unstructured/partition/pdf_image/pdfminer_utils.py +++ b/unstructured/partition/pdf_image/pdfminer_utils.py @@ -6,7 +6,7 @@ from pdfminer.layout import LAParams, LTContainer, LTImage, LTItem, LTTextLine from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager from pdfminer.pdfpage import PDFPage -from pdfminer.psparser import PSSyntaxError +from pdfminer.psexceptions import PSSyntaxError from pydantic import BaseModel from unstructured.logger import logger From fd9d796797d29648421e56880ee2938b8422c7e5 Mon Sep 17 00:00:00 2001 From: David Potter Date: Mon, 28 Apr 2025 17:58:05 -0700 Subject: [PATCH 11/40] fix cve (#3989) fix critical cve for h11. supposedly 0.16.0 fixes it. --------- Co-authored-by: Yao You Co-authored-by: Austin Walker Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: badGarnet --- CHANGELOG.md | 4 +- requirements/base.txt | 22 +- requirements/deps/constraints.txt | 2 + requirements/dev.txt | 6 +- requirements/extra-docx.txt | 4 +- requirements/extra-markdown.txt | 2 +- requirements/extra-odt.txt | 4 +- requirements/extra-paddleocr.txt | 33 +- requirements/extra-pdf-image.txt | 39 +- requirements/extra-pptx.txt | 8 +- requirements/huggingface.txt | 16 +- requirements/ingest/ingest.txt | 2 +- requirements/test.txt | 27 +- ...iomedical-Data-Scientists-2-pages.pdf.json | 4 +- .../azure/IRS-form-1987.pdf.json | 54 +- .../azure/IRS-form-1987.png.json | 30 +- .../azure/spring-weather.html.json | 2 +- .../handbook-1p.docx.json | 22 +- .../multi-column-2p.pdf.json | 14 +- .../fake-html-cp1252.html.json | 2 +- .../layout-parser-paper-with-table.jpg.json | 4 +- .../layout-parser-paper.pdf.json | 184 +- .../UDHR_first_article_all.txt.json | 712 +-- ...iomedical-Data-Scientists-2-pages.pdf.json | 4 +- 
.../biomed-api/65/11/main.PMC6312790.pdf.json | 4195 ----------------- .../biomed-api/75/29/main.PMC6312793.pdf.json | 2514 ---------- .../07/07/sbaa031.073.PMC7234218.pdf.json | 310 -- .../s3-minio/wiki_movie_plots_small.csv.json | 4 +- test_unstructured_ingest/src/against-api.sh | 5 +- test_unstructured_ingest/src/airtable-diff.sh | 5 +- .../src/airtable-large.sh | 5 +- test_unstructured_ingest/src/astradb.sh | 5 +- test_unstructured_ingest/src/azure.sh | 5 +- test_unstructured_ingest/src/biomed-api.sh | 5 +- test_unstructured_ingest/src/biomed-path.sh | 5 +- test_unstructured_ingest/src/box.sh | 9 +- .../src/confluence-diff.sh | 5 +- .../src/confluence-large.sh | 5 +- test_unstructured_ingest/src/delta-table.sh | 5 +- test_unstructured_ingest/src/discord.sh | 5 +- test_unstructured_ingest/src/dropbox.sh | 5 +- test_unstructured_ingest/src/elasticsearch.sh | 5 +- test_unstructured_ingest/src/gcs.sh | 5 +- test_unstructured_ingest/src/github.sh | 9 +- test_unstructured_ingest/src/gitlab.sh | 5 +- test_unstructured_ingest/src/google-drive.sh | 5 +- test_unstructured_ingest/src/hubspot.sh | 5 +- test_unstructured_ingest/src/jira.sh | 5 +- test_unstructured_ingest/src/kafka-local.sh | 5 +- .../src/local-embed-bedrock.sh | 5 +- .../src/local-embed-mixedbreadai.sh | 5 +- .../src/local-embed-octoai.sh | 5 +- .../src/local-embed-vertexai.sh | 5 +- .../src/local-embed-voyageai.sh | 5 +- test_unstructured_ingest/src/local-embed.sh | 5 +- .../src/local-failed-partition.sh | 5 +- .../src/local-single-file-basic-chunking.sh | 5 +- ...ocal-single-file-chunk-no-orig-elements.sh | 5 +- .../src/local-single-file-with-encoding.sh | 5 +- ...gle-file-with-pdf-infer-table-structure.sh | 5 +- .../src/local-single-file.sh | 5 +- test_unstructured_ingest/src/local.sh | 5 +- test_unstructured_ingest/src/mongodb.sh | 5 +- test_unstructured_ingest/src/notion.sh | 5 +- test_unstructured_ingest/src/onedrive.sh | 5 +- test_unstructured_ingest/src/opensearch.sh | 5 +- 
test_unstructured_ingest/src/outlook.sh | 5 +- .../src/pdf-fast-reprocess.sh | 5 +- .../src/s3-compression.sh | 5 +- test_unstructured_ingest/src/s3-minio.sh | 5 +- test_unstructured_ingest/src/s3.sh | 5 +- test_unstructured_ingest/src/salesforce.sh | 5 +- test_unstructured_ingest/src/sftp.sh | 5 +- .../src/sharepoint-with-permissions.sh | 5 +- test_unstructured_ingest/src/sharepoint.sh | 5 +- test_unstructured_ingest/src/slack.sh | 5 +- test_unstructured_ingest/src/wikipedia.sh | 5 +- test_unstructured_ingest/test-ingest-src.sh | 42 - unstructured/__version__.py | 2 +- 79 files changed, 761 insertions(+), 7760 deletions(-) delete mode 100644 test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/biomed-api/65/11/main.PMC6312790.pdf.json delete mode 100644 test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/biomed-api/75/29/main.PMC6312793.pdf.json delete mode 100644 test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/biomed-path/07/07/sbaa031.073.PMC7234218.pdf.json diff --git a/CHANGELOG.md b/CHANGELOG.md index 62ae488af3..ad5dea531f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.17.6-dev1 +## 0.17.6-dev2 ### Enhancements @@ -9,6 +9,8 @@ Two executions of the same code, on the same file, produce different results. The order of the elements is random. This makes it impossible to write stable unit tests, for example, or to obtain reproducible results. - **Do not use NLP to determine element types for extracted elements with hi_res.** This avoids extraneous Title elements in hi_res outputs. This only applies to *extracted* elements, meaning text objects that are found outside of Object Detection objects which get mapped to *inferred* elements. 
(*extracted* and *inferred* elements get merged together to form the list of `Element`s returned by `pdf_partition()`) +- Resolve open CVEs + ## 0.17.5 diff --git a/requirements/base.txt b/requirements/base.txt index 78fc8ce871..862ed52ff9 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -8,9 +8,9 @@ anyio==4.9.0 # via httpx backoff==2.2.1 # via -r ./base.in -beautifulsoup4==4.13.3 +beautifulsoup4==4.13.4 # via -r ./base.in -certifi==2025.1.31 +certifi==2025.4.26 # via # httpcore # httpx @@ -42,11 +42,11 @@ exceptiongroup==1.2.2 # via anyio filetype==1.2.0 # via -r ./base.in -h11==0.14.0 +h11==0.16.0 # via httpcore html5lib==1.1 # via -r ./base.in -httpcore==1.0.7 +httpcore==1.0.9 # via httpx httpx==0.28.1 # via unstructured-client @@ -62,13 +62,13 @@ jsonpath-python==1.0.6 # via unstructured-client langdetect==1.0.9 # via -r ./base.in -lxml==5.3.1 +lxml==5.4.0 # via -r ./base.in marshmallow==3.26.1 # via # dataclasses-json # unstructured-client -mypy-extensions==1.0.0 +mypy-extensions==1.1.0 # via # typing-inspect # unstructured-client @@ -80,9 +80,9 @@ numpy==2.0.2 # via -r ./base.in olefile==0.47 # via python-oxmsg -orderly-set==5.3.0 +orderly-set==5.4.0 # via deepdiff -packaging==24.2 +packaging==25.0 # via # marshmallow # unstructured-client @@ -100,7 +100,7 @@ python-magic==0.4.27 # via -r ./base.in python-oxmsg==0.0.2 # via -r ./base.in -rapidfuzz==3.12.2 +rapidfuzz==3.13.0 # via -r ./base.in regex==2024.11.6 # via nltk @@ -119,13 +119,13 @@ six==1.17.0 # unstructured-client sniffio==1.3.1 # via anyio -soupsieve==2.6 +soupsieve==2.7 # via beautifulsoup4 tqdm==4.67.1 # via # -r ./base.in # nltk -typing-extensions==4.13.0 +typing-extensions==4.13.2 # via # -r ./base.in # anyio diff --git a/requirements/deps/constraints.txt b/requirements/deps/constraints.txt index be1d0c40fd..9659e8bac1 100644 --- a/requirements/deps/constraints.txt +++ b/requirements/deps/constraints.txt @@ -22,3 +22,5 @@ importlib-metadata>=8.5.0 
unstructured-client>=0.23.0,<0.26.0 # paddle constrains protobuf; maybe we should put paddle here since its version is pinned in .in file protobuf>=6.30.0 +# (yao) issues with pdfminer-six above 20250416 +pdfminer.six<20250416 \ No newline at end of file diff --git a/requirements/dev.txt b/requirements/dev.txt index 4b489656fb..b42ff70e01 100644 --- a/requirements/dev.txt +++ b/requirements/dev.txt @@ -17,7 +17,7 @@ distlib==0.3.9 # via virtualenv filelock==3.18.0 # via virtualenv -identify==2.6.9 +identify==2.6.10 # via pre-commit importlib-metadata==8.6.1 # via @@ -25,7 +25,7 @@ importlib-metadata==8.6.1 # build nodeenv==1.9.1 # via pre-commit -packaging==24.2 +packaging==25.0 # via # -c ./base.txt # -c ./test.txt @@ -49,7 +49,7 @@ tomli==2.2.1 # -c ./test.txt # build # pip-tools -virtualenv==20.29.3 +virtualenv==20.30.0 # via pre-commit wheel==0.45.1 # via pip-tools diff --git a/requirements/extra-docx.txt b/requirements/extra-docx.txt index b6a9158f4f..f31b78b82a 100644 --- a/requirements/extra-docx.txt +++ b/requirements/extra-docx.txt @@ -4,13 +4,13 @@ # # pip-compile ./extra-docx.in # -lxml==5.3.1 +lxml==5.4.0 # via # -c ./base.txt # python-docx python-docx==1.1.2 # via -r ./extra-docx.in -typing-extensions==4.13.0 +typing-extensions==4.13.2 # via # -c ./base.txt # python-docx diff --git a/requirements/extra-markdown.txt b/requirements/extra-markdown.txt index 9d0a14da55..2311bce60f 100644 --- a/requirements/extra-markdown.txt +++ b/requirements/extra-markdown.txt @@ -8,7 +8,7 @@ importlib-metadata==8.6.1 # via # -c ././deps/constraints.txt # markdown -markdown==3.7 +markdown==3.8 # via -r ./extra-markdown.in zipp==3.21.0 # via importlib-metadata diff --git a/requirements/extra-odt.txt b/requirements/extra-odt.txt index fa8e746301..ced65cd542 100644 --- a/requirements/extra-odt.txt +++ b/requirements/extra-odt.txt @@ -4,7 +4,7 @@ # # pip-compile ./extra-odt.in # -lxml==5.3.1 +lxml==5.4.0 # via # -c ./base.txt # python-docx @@ -12,7 +12,7 @@ pypandoc==1.15 # 
via -r ./extra-odt.in python-docx==1.1.2 # via -r ./extra-odt.in -typing-extensions==4.13.0 +typing-extensions==4.13.2 # via # -c ./base.txt # python-docx diff --git a/requirements/extra-paddleocr.txt b/requirements/extra-paddleocr.txt index 84afee5161..df43fc8f9b 100644 --- a/requirements/extra-paddleocr.txt +++ b/requirements/extra-paddleocr.txt @@ -18,11 +18,11 @@ anyio==4.9.0 # httpx astor==0.8.1 # via paddlepaddle -beautifulsoup4==4.13.3 +beautifulsoup4==4.13.4 # via # -c ./base.txt # unstructured-paddleocr -certifi==2025.1.31 +certifi==2025.4.26 # via # -c ./base.txt # httpcore @@ -44,13 +44,13 @@ exceptiongroup==1.2.2 # anyio fire==0.7.0 # via unstructured-paddleocr -fonttools==4.56.0 +fonttools==4.57.0 # via unstructured-paddleocr -h11==0.14.0 +h11==0.16.0 # via # -c ./base.txt # httpcore -httpcore==1.0.7 +httpcore==1.0.9 # via # -c ./base.txt # httpx @@ -68,7 +68,7 @@ imageio==2.37.0 # via scikit-image lazy-loader==0.4 # via scikit-image -lxml==5.3.1 +lxml==5.4.0 # via # -c ./base.txt # python-docx @@ -102,14 +102,14 @@ opencv-python-headless==4.11.0.86 # albumentations opt-einsum==3.3.0 # via paddlepaddle -packaging==24.2 +packaging==25.0 # via # -c ./base.txt # lazy-loader # scikit-image paddlepaddle==3.0.0 # via -r ./extra-paddleocr.in -pillow==11.1.0 +pillow==11.2.1 # via # imageio # paddlepaddle @@ -121,9 +121,9 @@ protobuf==6.30.2 # paddlepaddle pyclipper==1.3.0.post6 # via unstructured-paddleocr -pydantic==2.10.6 +pydantic==2.11.3 # via albumentations -pydantic-core==2.27.2 +pydantic-core==2.33.1 # via pydantic python-docx==1.1.2 # via unstructured-paddleocr @@ -131,7 +131,7 @@ pyyaml==6.0.2 # via # albumentations # unstructured-paddleocr -rapidfuzz==3.12.2 +rapidfuzz==3.13.0 # via # -c ./base.txt # unstructured-paddleocr @@ -153,13 +153,13 @@ sniffio==1.3.1 # via # -c ./base.txt # anyio -soupsieve==2.6 +soupsieve==2.7 # via # -c ./base.txt # beautifulsoup4 -stringzilla==3.12.3 +stringzilla==3.12.5 # via albucore -termcolor==2.5.0 +termcolor==3.0.1 
# via fire tifffile==2024.8.30 # via scikit-image @@ -167,7 +167,7 @@ tqdm==4.67.1 # via # -c ./base.txt # unstructured-paddleocr -typing-extensions==4.13.0 +typing-extensions==4.13.2 # via # -c ./base.txt # albucore @@ -178,6 +178,9 @@ typing-extensions==4.13.0 # pydantic # pydantic-core # python-docx + # typing-inspection +typing-inspection==0.4.0 + # via pydantic unstructured-paddleocr==2.10.0 # via -r ./extra-paddleocr.in urllib3==1.26.20 diff --git a/requirements/extra-pdf-image.txt b/requirements/extra-pdf-image.txt index 061fb6de3b..367924c7d6 100644 --- a/requirements/extra-pdf-image.txt +++ b/requirements/extra-pdf-image.txt @@ -8,7 +8,7 @@ antlr4-python3-runtime==4.9.3 # via omegaconf cachetools==5.5.2 # via google-auth -certifi==2025.1.31 +certifi==2025.4.26 # via # -c ./base.txt # requests @@ -42,21 +42,21 @@ filelock==3.18.0 # transformers flatbuffers==25.2.10 # via onnxruntime -fonttools==4.56.0 +fonttools==4.57.0 # via matplotlib -fsspec==2025.3.0 +fsspec==2025.3.2 # via # huggingface-hub # torch google-api-core[grpc]==2.24.2 # via google-cloud-vision -google-auth==2.38.0 +google-auth==2.39.0 # via # google-api-core # google-cloud-vision google-cloud-vision==3.10.1 # via -r ./extra-pdf-image.in -googleapis-common-protos==1.69.2 +googleapis-common-protos==1.70.0 # via # google-api-core # grpcio-status @@ -67,7 +67,7 @@ grpcio==1.71.0 # grpcio-status grpcio-status==1.62.3 # via google-api-core -huggingface-hub==0.29.3 +huggingface-hub==0.30.2 # via # timm # tokenizers @@ -85,7 +85,7 @@ jinja2==3.1.6 # via torch kiwisolver==1.4.7 # via matplotlib -lxml==5.3.1 +lxml==5.4.0 # via # -c ./base.txt # pikepdf @@ -125,7 +125,7 @@ onnxruntime==1.19.2 # unstructured-inference opencv-python==4.11.0.86 # via unstructured-inference -packaging==24.2 +packaging==25.0 # via # -c ./base.txt # huggingface-hub @@ -138,15 +138,16 @@ pandas==2.2.3 # via unstructured-inference pdf2image==1.17.0 # via -r ./extra-pdf-image.in -pdfminer-six==20240706 +pdfminer-six==20250327 # 
via + # -c ././deps/constraints.txt # -r ./extra-pdf-image.in # unstructured-inference pi-heif==0.22.0 # via -r ./extra-pdf-image.in -pikepdf==9.5.2 +pikepdf==9.7.0 # via -r ./extra-pdf-image.in -pillow==11.1.0 +pillow==11.2.1 # via # matplotlib # pdf2image @@ -172,7 +173,7 @@ pyasn1==0.6.1 # via # pyasn1-modules # rsa -pyasn1-modules==0.4.1 +pyasn1-modules==0.4.2 # via google-auth pycocotools==2.0.8 # via effdet @@ -203,7 +204,7 @@ pyyaml==6.0.2 # omegaconf # timm # transformers -rapidfuzz==3.12.2 +rapidfuzz==3.13.0 # via # -c ./base.txt # unstructured-inference @@ -217,7 +218,7 @@ requests==2.32.3 # google-api-core # huggingface-hub # transformers -rsa==4.9 +rsa==4.9.1 # via google-auth safetensors==0.5.3 # via @@ -229,7 +230,7 @@ six==1.17.0 # via # -c ./base.txt # python-dateutil -sympy==1.13.1 +sympy==1.13.3 # via # onnxruntime # torch @@ -241,13 +242,13 @@ tokenizers==0.21.1 # via # -c ././deps/constraints.txt # transformers -torch==2.6.0 +torch==2.7.0 # via # effdet # timm # torchvision # unstructured-inference -torchvision==0.21.0 +torchvision==0.22.0 # via # effdet # timm @@ -256,9 +257,9 @@ tqdm==4.67.1 # -c ./base.txt # huggingface-hub # transformers -transformers==4.50.1 +transformers==4.51.3 # via unstructured-inference -typing-extensions==4.13.0 +typing-extensions==4.13.2 # via # -c ./base.txt # huggingface-hub diff --git a/requirements/extra-pptx.txt b/requirements/extra-pptx.txt index 30e77d1ce7..7ec19718d8 100644 --- a/requirements/extra-pptx.txt +++ b/requirements/extra-pptx.txt @@ -4,13 +4,13 @@ # # pip-compile ./extra-pptx.in # -lxml==5.3.1 +lxml==5.4.0 # via python-pptx -pillow==11.1.0 +pillow==11.2.1 # via python-pptx python-pptx==1.0.2 # via -r ./extra-pptx.in -typing-extensions==4.13.0 +typing-extensions==4.13.2 # via python-pptx -xlsxwriter==3.2.2 +xlsxwriter==3.2.3 # via python-pptx diff --git a/requirements/huggingface.txt b/requirements/huggingface.txt index f9e62f5266..a7c793c739 100644 --- a/requirements/huggingface.txt +++ 
b/requirements/huggingface.txt @@ -4,7 +4,7 @@ # # pip-compile ./huggingface.in # -certifi==2025.1.31 +certifi==2025.4.26 # via # -c ./base.txt # requests @@ -21,11 +21,11 @@ filelock==3.18.0 # huggingface-hub # torch # transformers -fsspec==2025.3.0 +fsspec==2025.3.2 # via # huggingface-hub # torch -huggingface-hub==0.29.3 +huggingface-hub==0.30.2 # via # tokenizers # transformers @@ -53,7 +53,7 @@ numpy==2.0.2 # via # -c ./base.txt # transformers -packaging==24.2 +packaging==25.0 # via # -c ./base.txt # huggingface-hub @@ -82,13 +82,13 @@ six==1.17.0 # via # -c ./base.txt # langdetect -sympy==1.13.1 +sympy==1.13.3 # via torch tokenizers==0.21.1 # via # -c ././deps/constraints.txt # transformers -torch==2.6.0 +torch==2.7.0 # via -r ./huggingface.in tqdm==4.67.1 # via @@ -96,9 +96,9 @@ tqdm==4.67.1 # huggingface-hub # sacremoses # transformers -transformers==4.50.1 +transformers==4.51.3 # via -r ./huggingface.in -typing-extensions==4.13.0 +typing-extensions==4.13.2 # via # -c ./base.txt # huggingface-hub diff --git a/requirements/ingest/ingest.txt b/requirements/ingest/ingest.txt index 6c99d3cfcd..364f499029 100644 --- a/requirements/ingest/ingest.txt +++ b/requirements/ingest/ingest.txt @@ -1,4 +1,4 @@ -unstructured-ingest[airtable, astradb, azure, azure-cognitive-search, bedrock, biomed, box, chroma, clarifai, confluence, couchbase, databricks-volumes, delta-table, discord, dropbox, elasticsearch, embed-huggingface, embed-octoai, embed-vertexai, embed-voyageai, gcs, github, gitlab, google-drive, hubspot, jira, kafka, kdbai, milvus, mongodb, notion, onedrive, openai, opensearch, outlook, pinecone, postgres, qdrant, reddit, remote, s3, salesforce, sftp, sharepoint, singlestore, slack, vectara, weaviate, wikipedia]==0.2.1 +unstructured-ingest[airtable, astradb, azure, azure-cognitive-search, bedrock, biomed, box, chroma, clarifai, confluence, couchbase, databricks-volumes, delta-table, discord, dropbox, elasticsearch, embed-huggingface, embed-octoai, embed-vertexai, 
embed-voyageai, gcs, github, gitlab, google-drive, hubspot, jira, kafka, kdbai, milvus, mongodb, notion, onedrive, openai, opensearch, outlook, pinecone, postgres, qdrant, reddit, remote, s3, salesforce, sftp, sharepoint, singlestore, slack, vectara, weaviate, wikipedia]>=0.2.1 s3fs>=2024.9.0 urllib3>=1.26.20 backoff>=2.2.1 diff --git a/requirements/test.txt b/requirements/test.txt index 1ebccc8953..2706ac725c 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -14,7 +14,7 @@ click==8.1.8 # via # -c ./base.txt # black -coverage[toml]==7.7.1 +coverage[toml]==7.8.0 # via # -r ./test.in # pytest-cov @@ -22,7 +22,7 @@ exceptiongroup==1.2.2 # via # -c ./base.txt # pytest -flake8==7.1.2 +flake8==7.2.0 # via # -r ./test.in # flake8-print @@ -42,12 +42,12 @@ mccabe==0.7.0 # via flake8 mypy==1.15.0 # via -r ./test.in -mypy-extensions==1.0.0 +mypy-extensions==1.1.0 # via # -c ./base.txt # black # mypy -packaging==24.2 +packaging==25.0 # via # -c ./base.txt # black @@ -58,15 +58,15 @@ platformdirs==4.3.7 # via black pluggy==1.5.0 # via pytest -pycodestyle==2.12.1 +pycodestyle==2.13.0 # via # flake8 # flake8-print -pydantic==2.10.6 +pydantic==2.11.3 # via -r ./test.in -pydantic-core==2.27.2 +pydantic-core==2.33.1 # via pydantic -pyflakes==3.2.0 +pyflakes==3.3.2 # via # autoflake # flake8 @@ -74,7 +74,7 @@ pytest==8.3.5 # via # pytest-cov # pytest-mock -pytest-cov==6.0.0 +pytest-cov==6.1.1 # via -r ./test.in pytest-mock==3.14.0 # via -r ./test.in @@ -82,7 +82,7 @@ python-dateutil==2.9.0.post0 # via # -c ./base.txt # freezegun -ruff==0.11.2 +ruff==0.11.7 # via -r ./test.in semantic-version==2.10.0 # via liccheck @@ -101,7 +101,7 @@ tomli==2.2.1 # pytest types-click==7.1.8 # via -r ./test.in -types-markdown==3.7.0.20250322 +types-markdown==3.8.0.20250415 # via -r ./test.in types-requests==2.31.0.6 # via -r ./test.in @@ -109,10 +109,13 @@ types-tabulate==0.9.0.20241207 # via -r ./test.in types-urllib3==1.26.25.14 # via types-requests -typing-extensions==4.13.0 
+typing-extensions==4.13.2 # via # -c ./base.txt # black # mypy # pydantic # pydantic-core + # typing-inspection +typing-inspection==0.4.0 + # via pydantic diff --git a/test_unstructured_ingest/expected-structured-output/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.json b/test_unstructured_ingest/expected-structured-output/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.json index 06e6a90097..24c362f451 100644 --- a/test_unstructured_ingest/expected-structured-output/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.json @@ -200,7 +200,7 @@ { "type": "ListItem", "element_id": "36eb8f3c3778fbb71dc056571e71175d", - "text": "4. Team science and scientific communication: \u201csoft\u201d skills, like the ability to work well on teams and communicate effectively in both verbal and written venues, may be as important as the more technical skills typically associated with data science.", + "text": "4. Team science and scientific communication: “soft†skills, like the ability to work well on teams and communicate effectively in both verbal and written venues, may be as important as the more technical skills typically associated with data science.", "metadata": { "filetype": "application/pdf", "languages": [ @@ -288,7 +288,7 @@ { "type": "NarrativeText", "element_id": "f250e86931949c66fe99d742fd9be29c", - "text": "Training a biomedical data science (BDS) workforce is a central theme in NLM\u2019s Strategic Plan for the coming decade. That commitment is echoed in the NIH-wide Big Data to Knowledge (BD2K) initiative, which invested $61 million between FY2014 and FY2017 in training programs for the development and use of biomedical big data science methods and tools. In line with", + "text": "Training a biomedical data science (BDS) workforce is a central theme in NLM’s Strategic Plan for the coming decade. 
That commitment is echoed in the NIH-wide Big Data to Knowledge (BD2K) initiative, which invested $61 million between FY2014 and FY2017 in training programs for the development and use of biomedical big data science methods and tools. In line with", "metadata": { "filetype": "application/pdf", "languages": [ diff --git a/test_unstructured_ingest/expected-structured-output/azure/IRS-form-1987.pdf.json b/test_unstructured_ingest/expected-structured-output/azure/IRS-form-1987.pdf.json index cca8a4dd1c..12255b00e7 100644 --- a/test_unstructured_ingest/expected-structured-output/azure/IRS-form-1987.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/azure/IRS-form-1987.pdf.json @@ -178,7 +178,7 @@ { "type": "NarrativeText", "element_id": "0fb8eb24db1b27f6f8b69213e3dd9b41", - "text": "Long-term contracts. \u2014If you are required to change your method of accounting for long-term contracts under section 460, see Notice 87-61 (9/21/87), 1987-38 IRB 40, for the notification procedures that must be followed.", + "text": "Long-term contracts. —If you are required to change your method of accounting for long-term contracts under section 460, see Notice 87-61 (9/21/87), 1987-38 IRB 40, for the notification procedures that must be followed.", "metadata": { "filetype": "application/pdf", "languages": [ @@ -200,7 +200,7 @@ { "type": "NarrativeText", "element_id": "7282f497b067ed1e34176cc85d46ea8e", - "text": "Other methods.\u2014Unless the Service has published a regulation or procedure to the contrary, all other changes !n accounting methods required by the Act are automatically considered to be approved by the Commissioner. 
Examples of method changes automatically approved by the Commissioner are those changes required to effect: (1) the repeal of the reserve method for bad debts of taxpayers other than financial institutions (Act section 805); (2) the repeal of the installment method for sales under a revolving credit plan (Act section 812); (3) the Inclusion of income attributable to the sale or furnishing of utility services no later than the year In which the services were provided to customers (Act section 821); and (4) the repeal of the deduction for qualified discount coupons (Act section 823). Do not file Form 3115 for these changes.", + "text": "Other methods.—Unless the Service has published a regulation or procedure to the contrary, all other changes !n accounting methods required by the Act are automatically considered to be approved by the Commissioner. Examples of method changes automatically approved by the Commissioner are those changes required to effect: (1) the repeal of the reserve method for bad debts of taxpayers other than financial institutions (Act section 805); (2) the repeal of the installment method for sales under a revolving credit plan (Act section 812); (3) the Inclusion of income attributable to the sale or furnishing of utility services no later than the year In which the services were provided to customers (Act section 821); and (4) the repeal of the deduction for qualified discount coupons (Act section 823). 
Do not file Form 3115 for these changes.", "metadata": { "filetype": "application/pdf", "languages": [ @@ -310,7 +310,7 @@ { "type": "NarrativeText", "element_id": "9256e7591256b6799035172da259b839", - "text": "Uniform capitalization rules and limitation on cash method.\u2014If you are required to change your method of accounting under section,263A (relating to the capitalization and inclusion in inventory costs of certain expenses) or 448 (limiting the use of the cash method of accounting by certain taxpayers) as added by the Tax Reform Act of 1986 (\u201cAct\u201d), the change 1s treated as initiated by the taxpayer, approved by the Commissioner, and the period for taking the adjustments under section 481(a) into account will not exceed 4 years. (Hospitals required to change from the cash method under section 448 have 10 years to take the adjustrnents into account.) Complete Section A and the appropriate sections (B-1 or C and D) for which the change is required.", + "text": "Uniform capitalization rules and limitation on cash method.—If you are required to change your method of accounting under section,263A (relating to the capitalization and inclusion in inventory costs of certain expenses) or 448 (limiting the use of the cash method of accounting by certain taxpayers) as added by the Tax Reform Act of 1986 (“Actâ€), the change 1s treated as initiated by the taxpayer, approved by the Commissioner, and the period for taking the adjustments under section 481(a) into account will not exceed 4 years. (Hospitals required to change from the cash method under section 448 have 10 years to take the adjustrnents into account.) Complete Section A and the appropriate sections (B-1 or C and D) for which the change is required.", "metadata": { "filetype": "application/pdf", "languages": [ @@ -332,7 +332,7 @@ { "type": "NarrativeText", "element_id": "9951e8eac8f909df08655f3bc100a586", - "text": "Disregard the instructions under Time and Place for Filing and Late Applications. 
Instead, attach Form 3115 to your income tax return for the year of change; do not file it separately. Also include on a separate statement accompanying the Form 3115 the period over which the section 481(a) adjustment will be taken into account and the basis for that conclusion. Identify the automatic change being made at the top of page 1 of Form 3115 (e.g., \u201cAutomatic Change to Accrual Method\u2014Section 448\"). See Temporary Regulations sections 1.263A-1T and 1.448-1T for additional information.", + "text": "Disregard the instructions under Time and Place for Filing and Late Applications. Instead, attach Form 3115 to your income tax return for the year of change; do not file it separately. Also include on a separate statement accompanying the Form 3115 the period over which the section 481(a) adjustment will be taken into account and the basis for that conclusion. Identify the automatic change being made at the top of page 1 of Form 3115 (e.g., “Automatic Change to Accrual Method—Section 448\"). See Temporary Regulations sections 1.263A-1T and 1.448-1T for additional information.", "metadata": { "filetype": "application/pdf", "languages": [ @@ -508,7 +508,7 @@ { "type": "NarrativeText", "element_id": "c92c7f4def0263141b370bf307d6bcc0", - "text": "If your application is filed after the 180-day period, it is late. The application will be considered for processing only upon a showing of \u201cgood cause\u201d and if it can be shown to the satisfaction of the Commissioner that granting you an extension will not jeopardize the Government's interests. For further information, see Rev, Proc. 79-63.", + "text": "If your application is filed after the 180-day period, it is late. The application will be considered for processing only upon a showing of “good cause†and if it can be shown to the satisfaction of the Commissioner that granting you an extension will not jeopardize the Government's interests. For further information, see Rev, Proc. 
79-63.", "metadata": { "filetype": "application/pdf", "languages": [ @@ -552,7 +552,7 @@ { "type": "NarrativeText", "element_id": "2932b94008de341f867fe6cfa1c95969", - "text": "Individuals.\u2014An individual should enter his or her social security number in this block. If the application is made on behalf of a husband and wife who file their income tax return jointly, enter the social security numbers of both.", + "text": "Individuals.—An individual should enter his or her social security number in this block. If the application is made on behalf of a husband and wife who file their income tax return jointly, enter the social security numbers of both.", "metadata": { "filetype": "application/pdf", "languages": [ @@ -574,7 +574,7 @@ { "type": "NarrativeText", "element_id": "4a9b9ec8ba60e739f49cfd240aa4439f", - "text": "Others.-\u2014The employer identification number of an applicant other than an individual should be entered in this block.", + "text": "Others.-—The employer identification number of an applicant other than an individual should be entered in this block.", "metadata": { "filetype": "application/pdf", "languages": [ @@ -618,7 +618,7 @@ { "type": "NarrativeText", "element_id": "3c683355b205b83c4c0d3437e6cfa7e1", - "text": "Individuals. \u2014An individual desiring the change should sign the application. If the application pertains to a husband and wife filing a joint income tax return, the names of both should appear in the heading and both should sign.", + "text": "Individuals. —An individual desiring the change should sign the application. 
If the application pertains to a husband and wife filing a joint income tax return, the names of both should appear in the heading and both should sign.", "metadata": { "filetype": "application/pdf", "languages": [ @@ -640,7 +640,7 @@ { "type": "NarrativeText", "element_id": "e8cfd8f6db0442ba89ebad7a26a61fe9", - "text": "Partnerships.\u2014The form should be signed with the partnership name followed by the signature of one of the general partners and the words \u201cGeneral Partner.\u201d", + "text": "Partnerships.—The form should be signed with the partnership name followed by the signature of one of the general partners and the words “General Partner.â€", "metadata": { "filetype": "application/pdf", "languages": [ @@ -662,7 +662,7 @@ { "type": "NarrativeText", "element_id": "28ac207401b182955c7f456e4ed569e7", - "text": "Corporations, cooperatives, and insurance companies.\u2014The form should show the name of the corporation, cooperative, or insurance company and the signature of the president, vice president, treasurer, assistant treasurer, or chief accounting officer (such as tax officer) authorized to sign, and his or her official title. Receivers, trustees, or assignees must sign any application they are required to file. For a subsidiary corporation filing a consolidated return with its parent, the form should be signed by an officer of the parent corporation.", + "text": "Corporations, cooperatives, and insurance companies.—The form should show the name of the corporation, cooperative, or insurance company and the signature of the president, vice president, treasurer, assistant treasurer, or chief accounting officer (such as tax officer) authorized to sign, and his or her official title. Receivers, trustees, or assignees must sign any application they are required to file. 
For a subsidiary corporation filing a consolidated return with its parent, the form should be signed by an officer of the parent corporation.", "metadata": { "filetype": "application/pdf", "languages": [ @@ -684,7 +684,7 @@ { "type": "NarrativeText", "element_id": "0bb2ae65d2e8e2d6deafb8a0b8ca959e", - "text": "Fiduciaries.\u2014The-form should show the name of the estate or trust and be signed by the fiduciary, personal representative, executor, executrix, administrator, administratrix, etc., having legal authority to sign, and his or her title.", + "text": "Fiduciaries.—The-form should show the name of the estate or trust and be signed by the fiduciary, personal representative, executor, executrix, administrator, administratrix, etc., having legal authority to sign, and his or her title.", "metadata": { "filetype": "application/pdf", "languages": [ @@ -706,7 +706,7 @@ { "type": "NarrativeText", "element_id": "76d50bc1b5843d10ec33f0dd669e0158", - "text": "Preparer other than partner, officer, etc.\u2014The signature of the individual preparing the application should appear in the space provided on page 6.", + "text": "Preparer other than partner, officer, etc.—The signature of the individual preparing the application should appear in the space provided on page 6.", "metadata": { "filetype": "application/pdf", "languages": [ @@ -838,7 +838,7 @@ { "type": "NarrativeText", "element_id": "43c45bb43eaf69131bf2392df1239ef2", - "text": "Item 5a, page 1.\u2014\u201cTaxable income or (loss) from operations\u201d is to be entered before application of any net operating loss deduction under section 172(a).", + "text": "Item 5a, page 1.—“Taxable income or (loss) from operations†is to be entered before application of any net operating loss deduction under section 172(a).", "metadata": { "filetype": "application/pdf", "languages": [ @@ -860,7 +860,7 @@ { "type": "NarrativeText", "element_id": "be68edc9cf1c170006855414e15dcb72", - "text": "Item 6, page 2.\u2014The term 
\u201cgross receipts\u201d includes total sales (net of returns and allowances) and all amounts received for services. In addition, gross receipts include any income from investments and from incidental or outside sources (e.g., interest, dividends, rents, royalties, and annuities). However, if you are a resaler of personal property, exclude from gross receipts any amounts not derived in the ordinary course of a trade or business. Gross receipts do not include amounts received for sales taxes if, under the applicable state or local law, the tax is legally imposed on the purchaser of the good or service, and the taxpayer merely collects and remits the tax to the taxing authority.", + "text": "Item 6, page 2.—The term “gross receipts†includes total sales (net of returns and allowances) and all amounts received for services. In addition, gross receipts include any income from investments and from incidental or outside sources (e.g., interest, dividends, rents, royalties, and annuities). However, if you are a resaler of personal property, exclude from gross receipts any amounts not derived in the ordinary course of a trade or business. 
Gross receipts do not include amounts received for sales taxes if, under the applicable state or local law, the tax is legally imposed on the purchaser of the good or service, and the taxpayer merely collects and remits the tax to the taxing authority.", "metadata": { "filetype": "application/pdf", "languages": [ @@ -882,7 +882,7 @@ { "type": "NarrativeText", "element_id": "31b52f9f7ca8d75190858bf0d55805db", - "text": "Item 7b, page 2.\u2014If item 7b 1s \u201cYes,\u201d indicate on a separate sheet the following for each separate trade or business: Nature of business", + "text": "Item 7b, page 2.—If item 7b 1s “Yes,†indicate on a separate sheet the following for each separate trade or business: Nature of business", "metadata": { "filetype": "application/pdf", "languages": [ @@ -926,7 +926,7 @@ { "type": "NarrativeText", "element_id": "28d8006c1f48ce2aec42391c8318fc8a", - "text": "Item 11, page 2.\u2014If you cannot provide the requested information, you may sign a statement under penalties of perjury that:", + "text": "Item 11, page 2.—If you cannot provide the requested information, you may sign a statement under penalties of perjury that:", "metadata": { "filetype": "application/pdf", "languages": [ @@ -1036,7 +1036,7 @@ { "type": "NarrativeText", "element_id": "e5f591cf708bf2cae8df5018db1f3b1e", - "text": "Item 13, page 2.\u2014Insert the actual number of tax years. Use of the term \u201csince inception\u201d 1s not acceptable. However, \u201cmore than 6 years\u201d Is acceptable.", + "text": "Item 13, page 2.—Insert the actual number of tax years. Use of the term “since inception†1s not acceptable. 
However, “more than 6 years†Is acceptable.", "metadata": { "filetype": "application/pdf", "languages": [ @@ -1080,7 +1080,7 @@ { "type": "NarrativeText", "element_id": "be4be7fe105304e8063250e9e8933b50", - "text": "Item 1b, page 2.\u2014Include any amounts reported as income ina prior year although the income had not been accrued (earned) or received in the prior year; for example, discount on installment loans reported as income for the year in which the loans were made instead of for the year or years in which the income was received or earned. Advance payments under Rev. Proc. 71-21 or Regulations section 1.451-5 must be fully explained and all pertinent information must be submitted with this application.", + "text": "Item 1b, page 2.—Include any amounts reported as income ina prior year although the income had not been accrued (earned) or received in the prior year; for example, discount on installment loans reported as income for the year in which the loans were made instead of for the year or years in which the income was received or earned. Advance payments under Rev. Proc. 71-21 or Regulations section 1.451-5 must be fully explained and all pertinent information must be submitted with this application.", "metadata": { "filetype": "application/pdf", "languages": [ @@ -1124,7 +1124,7 @@ { "type": "NarrativeText", "element_id": "ff41c26e5658894786749ca6449cff67", - "text": "Limitation on the Use of the Cash Method of Accounting. \u2014Except as provided below, C corporations, partnerships with a C corporation as a partner, and tax shelters may not use the cash method of accounting. For purposes of this limitation, a trust subject to the tax on unrelated business income under section 511 1s treated as aC corporation with respect to its unrelated trade or business activities.", + "text": "Limitation on the Use of the Cash Method of Accounting. 
—Except as provided below, C corporations, partnerships with a C corporation as a partner, and tax shelters may not use the cash method of accounting. For purposes of this limitation, a trust subject to the tax on unrelated business income under section 511 1s treated as aC corporation with respect to its unrelated trade or business activities.", "metadata": { "filetype": "application/pdf", "languages": [ @@ -1146,7 +1146,7 @@ { "type": "NarrativeText", "element_id": "454de5bfbdcba4385a21dd6261c57d53", - "text": "The limitation on the use of the cash method (except for tax shelters) does not apply to\u2014", + "text": "The limitation on the use of the cash method (except for tax shelters) does not apply to—", "metadata": { "filetype": "application/pdf", "languages": [ @@ -1168,7 +1168,7 @@ { "type": "NarrativeText", "element_id": "fc1f0d4d56acd27a18ba80ab0acfb9e9", - "text": "(1) Farming businesses.\u2014F or this purpose, the term \u201cfarming business\u201d 1s defined in section 263A(e)(4), but it also includes the raising, harvesting, or growing of trees to which section 263A(c)(5) applies. Notwithstanding this exception, section 447 requires certain C corporations and partnerships with a C corporation as a partner to use the accrual method.", + "text": "(1) Farming businesses.—F or this purpose, the term “farming business†1s defined in section 263A(e)(4), but it also includes the raising, harvesting, or growing of trees to which section 263A(c)(5) applies. Notwithstanding this exception, section 447 requires certain C corporations and partnerships with a C corporation as a partner to use the accrual method.", "metadata": { "filetype": "application/pdf", "languages": [ @@ -1190,7 +1190,7 @@ { "type": "NarrativeText", "element_id": "51dcb59cd362d0003f609fdb43fbdfdc", - "text": "(2) Qualified personal service corporations. 
\u2014 A \u201cqualified personal service corporation\u201d is any corporation: (a) substantially all of the activities of which involve the performance of services in the fields of health, law, engineering, architecture, accounting, actuarial science, performing arts, or consulting, and (b)", + "text": "(2) Qualified personal service corporations. — A “qualified personal service corporation†is any corporation: (a) substantially all of the activities of which involve the performance of services in the fields of health, law, engineering, architecture, accounting, actuarial science, performing arts, or consulting, and (b)", "metadata": { "filetype": "application/pdf", "languages": [ @@ -1256,7 +1256,7 @@ { "type": "NarrativeText", "element_id": "5f5c402f9ebefef3ba8eabf1b5f628b2", - "text": "(3) Entities with gross receipts of $5,000,000 or less. \u2014To qualify for this exception, the C corporation's or partnership\u2019s annual average gross receipts for the three years ending with the prior tax year may not exceed $5,000,000. If the corporation or partnership was not in existence for the entire 3-year period, the period of existence is used to determine whether the corporation or partnership qualifies. If any tax year in the 3-year period is a short tax year, the corporation or partnership must annualize the gross receipts by multiplying the gross receipts by 12 and dividing the result by the number of months in the short period.", + "text": "(3) Entities with gross receipts of $5,000,000 or less. —To qualify for this exception, the C corporation's or partnership’s annual average gross receipts for the three years ending with the prior tax year may not exceed $5,000,000. If the corporation or partnership was not in existence for the entire 3-year period, the period of existence is used to determine whether the corporation or partnership qualifies. 
If any tax year in the 3-year period is a short tax year, the corporation or partnership must annualize the gross receipts by multiplying the gross receipts by 12 and dividing the result by the number of months in the short period.", "metadata": { "filetype": "application/pdf", "languages": [ @@ -1344,7 +1344,7 @@ { "type": "NarrativeText", "element_id": "70c06cbb13920b0a14d56b49c3e596eb", - "text": "Inventories of retail merchants.\u2014The retail method of pricing inventories does not contemplate valuation of goods at the retail selling price. The retail selling price of goods on hand must be reduced to approximate cost or cost or market, whichever Is lower, by the adjustments required in Regulations section 1.471-8.", + "text": "Inventories of retail merchants.—The retail method of pricing inventories does not contemplate valuation of goods at the retail selling price. The retail selling price of goods on hand must be reduced to approximate cost or cost or market, whichever Is lower, by the adjustments required in Regulations section 1.471-8.", "metadata": { "filetype": "application/pdf", "languages": [ @@ -1366,7 +1366,7 @@ { "type": "NarrativeText", "element_id": "73d59612ec830432b4de6df54516bd9c", - "text": "LIFO inventory changes.\u2014Attach a schedule with all the required computations when changing the method of figuring LIFO inventories. If you are changing from LIFO to a non-LIFO method, attach a schedule with the following additional information:", + "text": "LIFO inventory changes.—Attach a schedule with all the required computations when changing the method of figuring LIFO inventories. If you are changing from LIFO to a non-LIFO method, attach a schedule with the following additional information:", "metadata": { "filetype": "application/pdf", "languages": [ @@ -1498,7 +1498,7 @@ { "type": "NarrativeText", "element_id": "0ea0c5159902dffd24b032afc223d32a", - "text": "% U.S. Government Printing Office: 1987\u2014201-993/60166", + "text": "% U.S. 
Government Printing Office: 1987—201-993/60166", "metadata": { "filetype": "application/pdf", "languages": [ @@ -1542,7 +1542,7 @@ { "type": "NarrativeText", "element_id": "b4575fdaff52c4def8f166ed0e2c4b39", - "text": "Section 460(f) provides that the term \u201clong-term contract\u201d means any contract for the manufacturing, building, installation, or construction of property that is not completed within the tax year in which it 1s entered into. However, a manufacturing contract will not qualify as a long-term contract unless the contract involves the manufacture of: (1) a unique item not normally included in your finished goods inventory, or (2) any item that normally requires more than 12 calendar months to complete.", + "text": "Section 460(f) provides that the term “long-term contract†means any contract for the manufacturing, building, installation, or construction of property that is not completed within the tax year in which it 1s entered into. However, a manufacturing contract will not qualify as a long-term contract unless the contract involves the manufacture of: (1) a unique item not normally included in your finished goods inventory, or (2) any item that normally requires more than 12 calendar months to complete.", "metadata": { "filetype": "application/pdf", "languages": [ diff --git a/test_unstructured_ingest/expected-structured-output/azure/IRS-form-1987.png.json b/test_unstructured_ingest/expected-structured-output/azure/IRS-form-1987.png.json index 8709788128..d361b431ef 100644 --- a/test_unstructured_ingest/expected-structured-output/azure/IRS-form-1987.png.json +++ b/test_unstructured_ingest/expected-structured-output/azure/IRS-form-1987.png.json @@ -178,7 +178,7 @@ { "type": "NarrativeText", "element_id": "4af565181db0676202636585f9abb438", - "text": "Long-term contracts. 
\u2014If you are required to change your method of accounting for long-term contracts under section 460, see Notice 87-61 (9/21/87), 1987-38 IRB 40, for the notification procedures that must be followed.", + "text": "Long-term contracts. —If you are required to change your method of accounting for long-term contracts under section 460, see Notice 87-61 (9/21/87), 1987-38 IRB 40, for the notification procedures that must be followed.", "metadata": { "filetype": "image/png", "languages": [ @@ -200,7 +200,7 @@ { "type": "NarrativeText", "element_id": "8dc3e4d18b3936db176790654f8823e1", - "text": "Other methods.\u2014Unless the Service has published a regulation or procedure to the contrary, all other changes 1n accounting methods required by the Act are automatically considered to be approved by the Commissioner. Examples of method changes automatically approved by the Commissioner are those changes required to effect: (1) the repeal of the reserve method for bad debts of taxpayers other than financial institutions (Act section 805); (2) the repeal of the installment method for sales under a revolving credit plan (Act section 812); (3) the inclusion of income attributable to the sale or furnishing of utility services no later than the year in which the services were provided to customers (Act section 821); and (4) the repeal of the deduction for qualified discount coupons (Act section 823). Do not file Form 3115 for these changes.", + "text": "Other methods.—Unless the Service has published a regulation or procedure to the contrary, all other changes 1n accounting methods required by the Act are automatically considered to be approved by the Commissioner. 
Examples of method changes automatically approved by the Commissioner are those changes required to effect: (1) the repeal of the reserve method for bad debts of taxpayers other than financial institutions (Act section 805); (2) the repeal of the installment method for sales under a revolving credit plan (Act section 812); (3) the inclusion of income attributable to the sale or furnishing of utility services no later than the year in which the services were provided to customers (Act section 821); and (4) the repeal of the deduction for qualified discount coupons (Act section 823). Do not file Form 3115 for these changes.", "metadata": { "filetype": "image/png", "languages": [ @@ -310,7 +310,7 @@ { "type": "NarrativeText", "element_id": "5b2139cd0640cd4eceddbce416a17f6f", - "text": "Uniform capitalization rules and limitation on cash method.\u2014If you are required to change your method of accounting under sectior,263A (relating to the capitalization and inclusion in inventory costs of certain expenses) or 448 (limiting the use of the cash method of accounting by certain taxpayers) as added by the Tax Reform Act of 1986 (\u201cAct\u201d), the change is treated as initiated by the taxpayer, approved by the Commissioner, and the period for taking the adjustments under section 481(a) into account will not exceed 4 years. (Hospitals required to change from the cash method under section 448 have 10 years to take the adjustments into account.) 
Complete Section A and the appropriate sections (B-1 or C and D) for which the change is required.", + "text": "Uniform capitalization rules and limitation on cash method.—If you are required to change your method of accounting under sectior,263A (relating to the capitalization and inclusion in inventory costs of certain expenses) or 448 (limiting the use of the cash method of accounting by certain taxpayers) as added by the Tax Reform Act of 1986 (“Actâ€), the change is treated as initiated by the taxpayer, approved by the Commissioner, and the period for taking the adjustments under section 481(a) into account will not exceed 4 years. (Hospitals required to change from the cash method under section 448 have 10 years to take the adjustments into account.) Complete Section A and the appropriate sections (B-1 or C and D) for which the change is required.", "metadata": { "filetype": "image/png", "languages": [ @@ -332,7 +332,7 @@ { "type": "NarrativeText", "element_id": "525b9d3bf3ae575f8e86f62af6068ebd", - "text": "Disregard the instructions under Time and Place for Filing and Late Applications. Instead, attach Form 3115 to your income tax return for the year of change; do not file it separately. Also include on a separate statement accompanying the Form 3115 the period over which the section 481(a) adjustment will be taken into account and the basis for that conclusion. Identify the automatic change being made at the top of page 1 of Form 3115 (e.g., \u201cAutomatic Change to Accrual Method Section 448\"). See Temporary Regulations sections 1.263A-1T and 1.448-1T for additional information.", + "text": "Disregard the instructions under Time and Place for Filing and Late Applications. Instead, attach Form 3115 to your income tax return for the year of change; do not file it separately. Also include on a separate statement accompanying the Form 3115 the period over which the section 481(a) adjustment will be taken into account and the basis for that conclusion. 
Identify the automatic change being made at the top of page 1 of Form 3115 (e.g., “Automatic Change to Accrual Method Section 448\"). See Temporary Regulations sections 1.263A-1T and 1.448-1T for additional information.", "metadata": { "filetype": "image/png", "languages": [ @@ -508,7 +508,7 @@ { "type": "NarrativeText", "element_id": "53204b2c819131895da7dba7fe978047", - "text": "If your application is filed after the 180-day period, it is late. The application will be considered for processing only upon a showing of \u201cgood cause\" and if it can be shown to the satisfaction of the Commissioner that granting you an extension will not jeopardize the Government's interests. For further information, see Rev. Proc. 79-63.", + "text": "If your application is filed after the 180-day period, it is late. The application will be considered for processing only upon a showing of “good cause\" and if it can be shown to the satisfaction of the Commissioner that granting you an extension will not jeopardize the Government's interests. For further information, see Rev. Proc. 79-63.", "metadata": { "filetype": "image/png", "languages": [ @@ -552,7 +552,7 @@ { "type": "NarrativeText", "element_id": "a41365af6ab3185637e8f3891b27fcba", - "text": "Individuals.\u2014An individual should enter his or her social security number in this block. If the application is made on behalf of a husband and wife who file their income tax return jointly, enter the social security numbers of both.", + "text": "Individuals.—An individual should enter his or her social security number in this block. 
If the application is made on behalf of a husband and wife who file their income tax return jointly, enter the social security numbers of both.", "metadata": { "filetype": "image/png", "languages": [ @@ -574,7 +574,7 @@ { "type": "NarrativeText", "element_id": "803549fa9207cd4111ed9e5d7389a027", - "text": "Others.-\u2014The employer identification number of an applicant other than an individual should be entered in this block.", + "text": "Others.-—The employer identification number of an applicant other than an individual should be entered in this block.", "metadata": { "filetype": "image/png", "languages": [ @@ -618,7 +618,7 @@ { "type": "NarrativeText", "element_id": "f49752a38f790a75872b43214d7b8e0c", - "text": "Individuals. \u2014An individual desiring the change should sign the application. If the application pertains to a husband and wife filing a joint income tax return, the names of both should appear in the heading and both should sign.", + "text": "Individuals. —An individual desiring the change should sign the application. 
If the application pertains to a husband and wife filing a joint income tax return, the names of both should appear in the heading and both should sign.", "metadata": { "filetype": "image/png", "languages": [ @@ -640,7 +640,7 @@ { "type": "NarrativeText", "element_id": "162bb7ebc5019059dc8341f5c44da7ec", - "text": "Partnerships.\u2014The form should be signed with the partnership name followed by the signature of one of the general partners and the words \u201cGeneral Partner.\u201d", + "text": "Partnerships.—The form should be signed with the partnership name followed by the signature of one of the general partners and the words “General Partner.â€", "metadata": { "filetype": "image/png", "languages": [ @@ -662,7 +662,7 @@ { "type": "NarrativeText", "element_id": "ba5311e456328d16efd5d2f5a8500388", - "text": "Corporations, cooperatives, and insurance companies.\u2014The form should show the name of the corporation, cooperative, or insurance company and the signature of the president, vice president, treasurer, assistant treasurer, or chief accounting officer (such as tax officer) authorized to sign, and his or her official title. Receivers, trustees, or assignees must sign any application they are required to file. For a subsidiary corporation filing a consolidated return with its parent, the form should be signed by an officer of the parent corporation.", + "text": "Corporations, cooperatives, and insurance companies.—The form should show the name of the corporation, cooperative, or insurance company and the signature of the president, vice president, treasurer, assistant treasurer, or chief accounting officer (such as tax officer) authorized to sign, and his or her official title. Receivers, trustees, or assignees must sign any application they are required to file. 
For a subsidiary corporation filing a consolidated return with its parent, the form should be signed by an officer of the parent corporation.", "metadata": { "filetype": "image/png", "languages": [ @@ -684,7 +684,7 @@ { "type": "NarrativeText", "element_id": "6fe312aeeb0d718a776c177b27265353", - "text": "Fiduciaries.\u2014The-form should show the name of the estate or trust and be signed by the fiduciary, personal representative, executor, executrix, administrator, administratrix, etc., having legal authority to sign, and his or her title.", + "text": "Fiduciaries.—The-form should show the name of the estate or trust and be signed by the fiduciary, personal representative, executor, executrix, administrator, administratrix, etc., having legal authority to sign, and his or her title.", "metadata": { "filetype": "image/png", "languages": [ @@ -706,7 +706,7 @@ { "type": "NarrativeText", "element_id": "152f56dcf3866eaa539ba72ac8d75fb9", - "text": "Preparer other than partner, officer, etc.\u2014The signature of the individual preparing the application should appear in the space provided on page 6.", + "text": "Preparer other than partner, officer, etc.—The signature of the individual preparing the application should appear in the space provided on page 6.", "metadata": { "filetype": "image/png", "languages": [ @@ -838,7 +838,7 @@ { "type": "NarrativeText", "element_id": "ce36a381c0fb31df90d3d701b9b5ee2a", - "text": "Item 5a, page 1.\u2014\u201cTaxable income or (loss) from operations\u201d is to be entered before application of any net operating loss deduction under section 172(a).", + "text": "Item 5a, page 1.—“Taxable income or (loss) from operations†is to be entered before application of any net operating loss deduction under section 172(a).", "metadata": { "filetype": "image/png", "languages": [ @@ -860,7 +860,7 @@ { "type": "NarrativeText", "element_id": "f7876eba5d8a77571828d215aab6bf34", - "text": "Item 6, page 2.\u2014The term \u201cgross receipts\u201d 
includes total sales (net of returns and allowances) and all amounts received for services. In addition, gross receipts include any income from investments and from incidental or outside sources (e.g., interest, dividends, rents, royalties, and annuities). However, if you area resaler of personal property, exclude from gross receipts any amounts not derived in the ordinary course of a trade or business. Gross receipts do not include amounts received for sales taxes if, under the applicable state or local law, the tax is legatly imposed on the purchaser of the good or service, and the taxpayer merely collects and remits the tax to the taxing authority.", + "text": "Item 6, page 2.—The term “gross receipts†includes total sales (net of returns and allowances) and all amounts received for services. In addition, gross receipts include any income from investments and from incidental or outside sources (e.g., interest, dividends, rents, royalties, and annuities). However, if you area resaler of personal property, exclude from gross receipts any amounts not derived in the ordinary course of a trade or business. 
Gross receipts do not include amounts received for sales taxes if, under the applicable state or local law, the tax is legatly imposed on the purchaser of the good or service, and the taxpayer merely collects and remits the tax to the taxing authority.", "metadata": { "filetype": "image/png", "languages": [ @@ -882,7 +882,7 @@ { "type": "NarrativeText", "element_id": "baf5040c1ebd03c23f1210ec383970db", - "text": "Item 7b, page 2.\u2014If item 7b 1s \u201cYes,\u201d indicate ona separate sheet the following for each separate trade or business: Nature of business", + "text": "Item 7b, page 2.—If item 7b 1s “Yes,†indicate ona separate sheet the following for each separate trade or business: Nature of business", "metadata": { "filetype": "image/png", "languages": [ diff --git a/test_unstructured_ingest/expected-structured-output/azure/spring-weather.html.json b/test_unstructured_ingest/expected-structured-output/azure/spring-weather.html.json index 494e24e546..b891a7af79 100644 --- a/test_unstructured_ingest/expected-structured-output/azure/spring-weather.html.json +++ b/test_unstructured_ingest/expected-structured-output/azure/spring-weather.html.json @@ -779,7 +779,7 @@ { "type": "NarrativeText", "element_id": "c86708b570205221afc715f7f6a4ca3f", - "text": "

    News Around NOAA

    National Program

    Are You Weather-Ready for the Spring?

    Weather.gov > News Around NOAA > Are You Weather-Ready for the Spring?
    ", + "text": "

    News Around NOAA

    National Program

    Are You Weather-Ready for the Spring?

    Weather.gov > News Around NOAA > Are You Weather-Ready for the Spring?
    ", "metadata": { "languages": [ "eng" diff --git a/test_unstructured_ingest/expected-structured-output/local-single-file-basic-chunking/handbook-1p.docx.json b/test_unstructured_ingest/expected-structured-output/local-single-file-basic-chunking/handbook-1p.docx.json index a3e498de8c..cc6ecebd11 100644 --- a/test_unstructured_ingest/expected-structured-output/local-single-file-basic-chunking/handbook-1p.docx.json +++ b/test_unstructured_ingest/expected-structured-output/local-single-file-basic-chunking/handbook-1p.docx.json @@ -2,7 +2,7 @@ { "type": "CompositeElement", "element_id": "85002882dd396da0b1b82c925b002be5", - "text": "US Trustee Handbook\n\nCHAPTER 1\n\nINTRODUCTION\n\nCHAPTER 1 \u2013 INTRODUCTION\n\nA. PURPOSE", + "text": "US Trustee Handbook\n\nCHAPTER 1\n\nINTRODUCTION\n\nCHAPTER 1 – INTRODUCTION\n\nA. PURPOSE", "metadata": { "data_source": { "record_locator": { @@ -56,7 +56,7 @@ { "type": "CompositeElement", "element_id": "1abe685eb8dfed0f2266d6cf793d7e6b", - "text": "le 11 of the United States Code. 28 U.S.C. \u00a7 586(b). The Handbook, issued as part of our duties under 28 U.S.C. \u00a7 586, establishes or clarifies the", + "text": "le 11 of the United States Code. 28 U.S.C. § 586(b). The Handbook, issued as part of our duties under 28 U.S.C. § 586, establishes or clarifies the", "metadata": { "data_source": { "record_locator": { @@ -152,7 +152,7 @@ { "type": "CompositeElement", "element_id": "b7d1b42646393ca0f41af0e8ec48f9a9", - "text": "relevant provisions of the Bankruptcy Code, Federal Rules of Bankruptcy Procedure (Rules), any local bankruptcy rules, and case law. 11 U.S.C. \u00a7 321,", + "text": "relevant provisions of the Bankruptcy Code, Federal Rules of Bankruptcy Procedure (Rules), any local bankruptcy rules, and case law. 11 U.S.C. § 321,", "metadata": { "data_source": { "record_locator": { @@ -176,7 +176,7 @@ { "type": "CompositeElement", "element_id": "9ee33f4141eca1f98ca4299d0fdfba31", - "text": "w. 11 U.S.C. 
\u00a7 321, 28 U.S.C. \u00a7 586, 28 C.F.R. \u00a7 58.6(a)(3). Standing trustees are encouraged to follow Practice Tips identified in this Handbook but", + "text": "w. 11 U.S.C. § 321, 28 U.S.C. § 586, 28 C.F.R. § 58.6(a)(3). Standing trustees are encouraged to follow Practice Tips identified in this Handbook but", "metadata": { "data_source": { "record_locator": { @@ -319,7 +319,7 @@ { "type": "CompositeElement", "element_id": "f4412be8c7b2624c729af85c85b3a0e4", - "text": "es in this Handbook refer to the Bankruptcy Code, 11 U.S.C. \u00a7 101 et seq., unless otherwise indicated.", + "text": "es in this Handbook refer to the Bankruptcy Code, 11 U.S.C. § 101 et seq., unless otherwise indicated.", "metadata": { "data_source": { "record_locator": { @@ -531,7 +531,7 @@ { "type": "CompositeElement", "element_id": "24e1076110b431b248b43b1fdaae5282", - "text": "apter 13 trustees./ This Handbook is issued under the authority of the Program\u2019s enabling statutes.", + "text": "apter 13 trustees./ This Handbook is issued under the authority of the Program’s enabling statutes.", "metadata": { "data_source": { "record_locator": { @@ -625,7 +625,7 @@ { "type": "CompositeElement", "element_id": "db297530e558410b89acd93c6b452b84", - "text": "perform the duties and responsibilities of a standing trustee, the standing trustee must immediately advise the United States Trustee. 28 U.S.C. \u00a7", + "text": "perform the duties and responsibilities of a standing trustee, the standing trustee must immediately advise the United States Trustee. 28 U.S.C. §", "metadata": { "data_source": { "record_locator": { @@ -649,7 +649,7 @@ { "type": "CompositeElement", "element_id": "201bfacc211f0eb640e2830b8c29ae41", - "text": "rustee. 28 U.S.C. \u00a7 586(b), 28 C.F.R. \u00a7 58.4(b) referencing 28 C.F.R. \u00a7 58.3(b).", + "text": "rustee. 28 U.S.C. § 586(b), 28 C.F.R. § 58.4(b) referencing 28 C.F.R. 
§ 58.3(b).", "metadata": { "data_source": { "record_locator": { @@ -673,7 +673,7 @@ { "type": "CompositeElement", "element_id": "eff9d6f3a0cdb968b7715e2e417e12ea", - "text": "Although this Handbook is not intended to be a complete statutory reference, the standing trustee\u2019s primary statutory duties are set forth in 11", + "text": "Although this Handbook is not intended to be a complete statutory reference, the standing trustee’s primary statutory duties are set forth in 11", "metadata": { "data_source": { "record_locator": { @@ -696,7 +696,7 @@ { "type": "CompositeElement", "element_id": "fd4c45036e8f17c27271f75944389724", - "text": "are set forth in 11 U.S.C. \u00a7 1302, which incorporates by reference some of the duties of chapter 7 trustees found in 11 U.S.C. \u00a7 704. These duties", + "text": "are set forth in 11 U.S.C. § 1302, which incorporates by reference some of the duties of chapter 7 trustees found in 11 U.S.C. § 704. These duties", "metadata": { "data_source": { "record_locator": { @@ -720,7 +720,7 @@ { "type": "CompositeElement", "element_id": "a968d741409111b777fc123ef01f5407", - "text": "\u00a7 704. These duties include, but are not limited to, the following:", + "text": "§ 704. 
These duties include, but are not limited to, the following:", "metadata": { "data_source": { "record_locator": { diff --git a/test_unstructured_ingest/expected-structured-output/local-single-file-chunk-no-orig-elements/multi-column-2p.pdf.json b/test_unstructured_ingest/expected-structured-output/local-single-file-chunk-no-orig-elements/multi-column-2p.pdf.json index b6516f791c..829b9b7a7e 100644 --- a/test_unstructured_ingest/expected-structured-output/local-single-file-chunk-no-orig-elements/multi-column-2p.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/local-single-file-chunk-no-orig-elements/multi-column-2p.pdf.json @@ -2,7 +2,7 @@ { "type": "CompositeElement", "element_id": "06c85506db46c8d0e4f014e75146bcfc", - "text": "0 2 0 2\n\np e S 0 3\n\n] L C . s c [\n\n3 v 6 0 9 4 0 . 4 0 0 2 : v i X r a\n\nDense Passage Retrieval for Open-Domain Question Answering\n\nVladimir Karpukhin\u2217, Barlas O\u02d8guz\u2217, Sewon Min\u2020, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen\u2021, Wen-tau Yih\n\nFacebook AI\n\n\u2020University of Washington\n\n\u2021Princeton University\n\n{vladk, barlaso, plewis, ledell, edunov, scottyih}@fb.com sewon@cs.washington.edu danqic@cs.princeton.edu\n\nAbstract\n\nOpen-domain question answering relies on ef- \ufb01cient passage retrieval to select candidate contexts, where traditional sparse vector space models, such as TF-IDF or BM25, are the de facto method. In this work, we show that retrieval can be practically implemented us- ing dense representations alone, where em- beddings are learned from a small number of questions and passages by a simple dual- encoder framework. 
When evaluated on a wide range of open-domain QA datasets, our dense retriever outperforms a strong Lucene- BM25 system greatly by 9%-19% absolute in terms of top-20 passage retrieval accuracy, and helps our end-to-end QA system establish new state-of-the-art on multiple open-domain QA benchmarks.1\n\n1", + "text": "0 2 0 2\n\np e S 0 3\n\n] L C . s c [\n\n3 v 6 0 9 4 0 . 4 0 0 2 : v i X r a\n\nDense Passage Retrieval for Open-Domain Question Answering\n\nVladimir Karpukhin∗, Barlas O˘guz∗, Sewon Min†, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen‡, Wen-tau Yih\n\nFacebook AI\n\n†University of Washington\n\n‡Princeton University\n\n{vladk, barlaso, plewis, ledell, edunov, scottyih}@fb.com sewon@cs.washington.edu danqic@cs.princeton.edu\n\nAbstract\n\nOpen-domain question answering relies on ef- ï¬cient passage retrieval to select candidate contexts, where traditional sparse vector space models, such as TF-IDF or BM25, are the de facto method. In this work, we show that retrieval can be practically implemented us- ing dense representations alone, where em- beddings are learned from a small number of questions and passages by a simple dual- encoder framework. When evaluated on a wide range of open-domain QA datasets, our dense retriever outperforms a strong Lucene- BM25 system greatly by 9%-19% absolute in terms of top-20 passage retrieval accuracy, and helps our end-to-end QA system establish new state-of-the-art on multiple open-domain QA benchmarks.1\n\n1", "metadata": { "data_source": { "record_locator": { @@ -24,7 +24,7 @@ { "type": "CompositeElement", "element_id": "3ef998ac1d905d8ff1016f96a243295c", - "text": "Introduction\n\nOpen-domain question answering (QA) (Voorhees, 1999) is a task that answers factoid questions us- ing a large collection of documents. While early QA systems are often complicated and consist of multiple components (Ferrucci (2012); Moldovan et al. 
(2003), inter alia), the advances of reading comprehension models suggest a much simpli\ufb01ed two-stage framework: (1) a context retriever \ufb01rst selects a small subset of passages where some of them contain the answer to the question, and then (2) a machine reader can thoroughly exam- ine the retrieved contexts and identify the correct answer (Chen et al., 2017). Although reducing open-domain QA to machine reading is a very rea- sonable strategy, a huge performance degradation is often observed in practice2, indicating the needs of improving retrieval.\n\n\u2217Equal contribution 1The code and trained models have been released at\n\nhttps://github.com/facebookresearch/DPR.", + "text": "Introduction\n\nOpen-domain question answering (QA) (Voorhees, 1999) is a task that answers factoid questions us- ing a large collection of documents. While early QA systems are often complicated and consist of multiple components (Ferrucci (2012); Moldovan et al. (2003), inter alia), the advances of reading comprehension models suggest a much simpliï¬ed two-stage framework: (1) a context retriever ï¬rst selects a small subset of passages where some of them contain the answer to the question, and then (2) a machine reader can thoroughly exam- ine the retrieved contexts and identify the correct answer (Chen et al., 2017). 
Although reducing open-domain QA to machine reading is a very rea- sonable strategy, a huge performance degradation is often observed in practice2, indicating the needs of improving retrieval.\n\n∗Equal contribution 1The code and trained models have been released at\n\nhttps://github.com/facebookresearch/DPR.", "metadata": { "data_source": { "record_locator": { @@ -46,7 +46,7 @@ { "type": "CompositeElement", "element_id": "71b12f58c99f6097b17f4d5b6147201b", - "text": "2For instance, the exact match score on SQuAD v1.1 drops\n\nRetrieval in open-domain QA is usually imple- mented using TF-IDF or BM25 (Robertson and Zaragoza, 2009), which matches keywords ef\ufb01- ciently with an inverted index and can be seen as representing the question and context in high- dimensional, sparse vectors (with weighting). Con- versely, the dense, latent semantic encoding is com- plementary to sparse representations by design. For example, synonyms or paraphrases that consist of completely different tokens may still be mapped to vectors close to each other. Consider the question \u201cWho is the bad guy in lord of the rings?\u201d, which can be answered from the context \u201cSala Baker is best known for portraying the villain Sauron in the Lord of the Rings trilogy.\u201d A term-based system would have dif\ufb01culty retrieving such a context, while a dense retrieval system would be able to better match \u201cbad guy\u201d with \u201cvillain\u201d and fetch the cor- rect context. Dense encodings are also learnable by adjusting the embedding functions, which pro- vides additional \ufb02exibility to have a task-speci\ufb01c representation. With special in-memory data struc- tures and indexing schemes, retrieval can be done ef\ufb01ciently using maximum inner product search (MIPS) algorithms (e.g., Shrivastava and Li (2014); Guo et al. 
(2016)).\n\nHowever, it is generally believed that learn- ing a good dense vector representation needs a large number of labeled pairs of question and con- texts. Dense retrieval methods have thus never be shown to outperform TF-IDF/BM25 for open- domain QA before ORQA (Lee et al., 2019), which proposes a sophisticated inverse cloze task (ICT) objective, predicting the blocks that contain the masked sentence, for additional pretraining. The question encoder and the reader model are then \ufb01ne- tuned using pairs of questions and answers jointly. Although ORQA successfully demonstrates that dense retrieval can outperform BM25, setting new state-of-the-art results on multiple open-domain", + "text": "2For instance, the exact match score on SQuAD v1.1 drops\n\nRetrieval in open-domain QA is usually imple- mented using TF-IDF or BM25 (Robertson and Zaragoza, 2009), which matches keywords efï¬- ciently with an inverted index and can be seen as representing the question and context in high- dimensional, sparse vectors (with weighting). Con- versely, the dense, latent semantic encoding is com- plementary to sparse representations by design. For example, synonyms or paraphrases that consist of completely different tokens may still be mapped to vectors close to each other. Consider the question “Who is the bad guy in lord of the rings?â€, which can be answered from the context “Sala Baker is best known for portraying the villain Sauron in the Lord of the Rings trilogy.†A term-based system would have difï¬culty retrieving such a context, while a dense retrieval system would be able to better match “bad guy†with “villain†and fetch the cor- rect context. Dense encodings are also learnable by adjusting the embedding functions, which pro- vides additional flexibility to have a task-speciï¬c representation. 
With special in-memory data struc- tures and indexing schemes, retrieval can be done efï¬ciently using maximum inner product search (MIPS) algorithms (e.g., Shrivastava and Li (2014); Guo et al. (2016)).\n\nHowever, it is generally believed that learn- ing a good dense vector representation needs a large number of labeled pairs of question and con- texts. Dense retrieval methods have thus never be shown to outperform TF-IDF/BM25 for open- domain QA before ORQA (Lee et al., 2019), which proposes a sophisticated inverse cloze task (ICT) objective, predicting the blocks that contain the masked sentence, for additional pretraining. The question encoder and the reader model are then ï¬ne- tuned using pairs of questions and answers jointly. Although ORQA successfully demonstrates that dense retrieval can outperform BM25, setting new state-of-the-art results on multiple open-domain", "metadata": { "data_source": { "record_locator": { @@ -68,7 +68,7 @@ { "type": "CompositeElement", "element_id": "ef458b0b4659bfd57b11fbfb571c38d1", - "text": "from above 80% to less than 40% (Yang et al., 2019a).\n\nQA datasets, it also suffers from two weaknesses. First, ICT pretraining is computationally intensive and it is not completely clear that regular sentences are good surrogates of questions in the objective function. Second, because the context encoder is not \ufb01ne-tuned using pairs of questions and answers, the corresponding representations could be subop- timal.", + "text": "from above 80% to less than 40% (Yang et al., 2019a).\n\nQA datasets, it also suffers from two weaknesses. First, ICT pretraining is computationally intensive and it is not completely clear that regular sentences are good surrogates of questions in the objective function. 
Second, because the context encoder is not ï¬ne-tuned using pairs of questions and answers, the corresponding representations could be subop- timal.", "metadata": { "data_source": { "record_locator": { @@ -90,7 +90,7 @@ { "type": "CompositeElement", "element_id": "4204154eefaa843f79edc96dcc208054", - "text": "In this paper, we address the question: can we train a better dense embedding model using only pairs of questions and passages (or answers), with- out additional pretraining? By leveraging the now standard BERT pretrained model (Devlin et al., 2019) and a dual-encoder architecture (Bromley et al., 1994), we focus on developing the right training scheme using a relatively small number of question and passage pairs. Through a series of careful ablation studies, our \ufb01nal solution is surprisingly simple: the embedding is optimized for maximizing inner products of the question and relevant passage vectors, with an objective compar- ing all pairs of questions and passages in a batch. Our Dense Passage Retriever (DPR) is exception- ally strong. It not only outperforms BM25 by a large margin (65.2% vs. 42.9% in Top-5 accuracy), but also results in a substantial improvement on the end-to-end QA accuracy compared to ORQA (41.5% vs. 33.3%) in the open Natural Questions setting (Lee et al., 2019; Kwiatkowski et al., 2019). Our contributions are twofold. First, we demon- strate that with the proper training setup, sim- ply \ufb01ne-tuning the question and passage encoders on existing question-passage pairs is suf\ufb01cient to greatly outperform BM25. Our empirical results also suggest that additional pretraining may not be needed. Second, we verify that, in the context of open-domain question answering, a higher retrieval precision indeed translates to a higher end-to-end QA accuracy. 
By applying a modern reader model to the top retrieved passages, we achieve compara- ble or better results on multiple QA datasets in the open-retrieval setting, compared to several, much complicated systems.", + "text": "In this paper, we address the question: can we train a better dense embedding model using only pairs of questions and passages (or answers), with- out additional pretraining? By leveraging the now standard BERT pretrained model (Devlin et al., 2019) and a dual-encoder architecture (Bromley et al., 1994), we focus on developing the right training scheme using a relatively small number of question and passage pairs. Through a series of careful ablation studies, our ï¬nal solution is surprisingly simple: the embedding is optimized for maximizing inner products of the question and relevant passage vectors, with an objective compar- ing all pairs of questions and passages in a batch. Our Dense Passage Retriever (DPR) is exception- ally strong. It not only outperforms BM25 by a large margin (65.2% vs. 42.9% in Top-5 accuracy), but also results in a substantial improvement on the end-to-end QA accuracy compared to ORQA (41.5% vs. 33.3%) in the open Natural Questions setting (Lee et al., 2019; Kwiatkowski et al., 2019). Our contributions are twofold. First, we demon- strate that with the proper training setup, sim- ply ï¬ne-tuning the question and passage encoders on existing question-passage pairs is sufï¬cient to greatly outperform BM25. Our empirical results also suggest that additional pretraining may not be needed. Second, we verify that, in the context of open-domain question answering, a higher retrieval precision indeed translates to a higher end-to-end QA accuracy. 
By applying a modern reader model to the top retrieved passages, we achieve compara- ble or better results on multiple QA datasets in the open-retrieval setting, compared to several, much complicated systems.", "metadata": { "data_source": { "record_locator": { @@ -112,7 +112,7 @@ { "type": "CompositeElement", "element_id": "e6dee1abec28f8ff365ab6275b3e5f0e", - "text": "2 Background\n\nThe problem of open-domain QA studied in this paper can be described as follows. Given a factoid question, such as \u201cWho \ufb01rst voiced Meg on Family Guy?\u201d or \u201cWhere was the 8th Dalai Lama born?\u201d, a system is required to answer it using a large corpus of diversi\ufb01ed topics. More speci\ufb01cally, we assume\n\nthe extractive QA setting, in which the answer is restricted to a span appearing in one or more pas- sages in the corpus. Assume that our collection contains D documents, d1,d2,\u00b7\u00b7\u00b7 ,dD. We \ufb01rst split each of the documents into text passages of equal lengths as the basic retrieval units3 and get M total passages in our corpus C = {p1,p2,...,pM}, where each passage pi can be viewed as a sequence 2 ,\u00b7\u00b7\u00b7 ,w(i) 1 ,w(i) of tokens w(i) |pi|. Given a question q, the task is to \ufb01nd a span w(i) s+1,\u00b7\u00b7\u00b7 ,w(i) s ,w(i) from one of the passages pi that can answer the question. Notice that to cover a wide variety of domains, the corpus size can easily range from millions of docu- ments (e.g., Wikipedia) to billions (e.g., the Web). As a result, any open-domain QA system needs to include an ef\ufb01cient retriever component that can se- lect a small set of relevant texts, before applying the reader to extract the answer (Chen et al., 2017).4 Formally speaking, a retriever R : (q,C) \u2192 CF is a function that takes as input a question q and a corpus C and returns a much smaller \ufb01lter set of texts CF \u2282 C, where |CF| = k (cid:28) |C|. 
For a \ufb01xed k, a retriever can be evaluated in isolation on top-k retrieval accuracy, which is the fraction of ques- tions for which CF contains a span that answers the question.\n\ne", + "text": "2 Background\n\nThe problem of open-domain QA studied in this paper can be described as follows. Given a factoid question, such as “Who ï¬rst voiced Meg on Family Guy?†or “Where was the 8th Dalai Lama born?â€, a system is required to answer it using a large corpus of diversiï¬ed topics. More speciï¬cally, we assume\n\nthe extractive QA setting, in which the answer is restricted to a span appearing in one or more pas- sages in the corpus. Assume that our collection contains D documents, d1,d2,··· ,dD. We ï¬rst split each of the documents into text passages of equal lengths as the basic retrieval units3 and get M total passages in our corpus C = {p1,p2,...,pM}, where each passage pi can be viewed as a sequence 2 ,··· ,w(i) 1 ,w(i) of tokens w(i) |pi|. Given a question q, the task is to ï¬nd a span w(i) s+1,··· ,w(i) s ,w(i) from one of the passages pi that can answer the question. Notice that to cover a wide variety of domains, the corpus size can easily range from millions of docu- ments (e.g., Wikipedia) to billions (e.g., the Web). As a result, any open-domain QA system needs to include an efï¬cient retriever component that can se- lect a small set of relevant texts, before applying the reader to extract the answer (Chen et al., 2017).4 Formally speaking, a retriever R : (q,C) → CF is a function that takes as input a question q and a corpus C and returns a much smaller ï¬lter set of texts CF â‚ C, where |CF| = k (cid:28) |C|. 
For a ï¬xed k, a retriever can be evaluated in isolation on top-k retrieval accuracy, which is the fraction of ques- tions for which CF contains a span that answers the question.\n\ne", "metadata": { "data_source": { "record_locator": { @@ -134,7 +134,7 @@ { "type": "CompositeElement", "element_id": "ac6733a570cbdd5c8d48f8252b345b17", - "text": "3 Dense Passage Retriever (DPR)\n\nWe focus our research in this work on improv- ing the retrieval component in open-domain QA. Given a collection of M text passages, the goal of our dense passage retriever (DPR) is to index all the passages in a low-dimensional and continuous space, such that it can retrieve ef\ufb01ciently the top k passages relevant to the input question for the reader at run-time. Note that M can be very large (e.g., 21 million passages in our experiments, de- scribed in Section 4.1) and k is usually small, such as 20\u2013100.\n\n3.1 Overview\n\nOur dense passage retriever (DPR) uses a dense encoder EP(\u00b7) which maps any text passage to a d- dimensional real-valued vectors and builds an index for all the M passages that we will use for retrieval.\n\n3The ideal size and boundary of a text passage are func- tions of both the retriever and reader. We also experimented with natural paragraphs in our preliminary trials and found that using \ufb01xed-length passages performs better in both retrieval and \ufb01nal QA accuracy, as observed by Wang et al. (2019).\n\n4Exceptions include (Seo et al., 2019) and (Roberts et al., 2020), which retrieves and generates the answers, respectively.", + "text": "3 Dense Passage Retriever (DPR)\n\nWe focus our research in this work on improv- ing the retrieval component in open-domain QA. Given a collection of M text passages, the goal of our dense passage retriever (DPR) is to index all the passages in a low-dimensional and continuous space, such that it can retrieve efï¬ciently the top k passages relevant to the input question for the reader at run-time. 
Note that M can be very large (e.g., 21 million passages in our experiments, de- scribed in Section 4.1) and k is usually small, such as 20–100.\n\n3.1 Overview\n\nOur dense passage retriever (DPR) uses a dense encoder EP(·) which maps any text passage to a d- dimensional real-valued vectors and builds an index for all the M passages that we will use for retrieval.\n\n3The ideal size and boundary of a text passage are func- tions of both the retriever and reader. We also experimented with natural paragraphs in our preliminary trials and found that using ï¬xed-length passages performs better in both retrieval and ï¬nal QA accuracy, as observed by Wang et al. (2019).\n\n4Exceptions include (Seo et al., 2019) and (Roberts et al., 2020), which retrieves and generates the answers, respectively.", "metadata": { "data_source": { "record_locator": { diff --git a/test_unstructured_ingest/expected-structured-output/local-single-file-with-encoding/fake-html-cp1252.html.json b/test_unstructured_ingest/expected-structured-output/local-single-file-with-encoding/fake-html-cp1252.html.json index 93a7b96213..27105cb789 100644 --- a/test_unstructured_ingest/expected-structured-output/local-single-file-with-encoding/fake-html-cp1252.html.json +++ b/test_unstructured_ingest/expected-structured-output/local-single-file-with-encoding/fake-html-cp1252.html.json @@ -74,7 +74,7 @@ { "type": "NarrativeText", "element_id": "c96f2c02e05225ffa09b7b93c303c323", - "text": " 
\u00a1\t\u00a2\t\u00a3\t\u00a4\t\u00a5\t\u00a6\t\u00a7\t\u00a8\t\u00a9\t\u00aa\t\u00ab\t\u00ac\tSHY\t\u00ae\t\u00af\n\u00b0\t\u00b1\t\u00b2\t\u00b3\t\u00b4\t\u00b5\t\u00b6\t\u00b7\t\u00b8\t\u00b9\t\u00ba\t\u00bb\t\u00bc\t\u00bd\t\u00be\t\u00bf\n\u00c0\t\u00c1\t\u00c2\t\u00c3\t\u00c4\t\u00c5\t\u00c6\t\u00c7\t\u00c8\t\u00c9\t\u00ca\t\u00cb\t\u00cc\t\u00cd\t\u00ce\t\u00cf\n\u00d0\t\u00d1\t\u00d2\t\u00d3\t\u00d4\t\u00d5\t\u00d6\t\u00d7\t\u00d8\t\u00d9\t\u00da\t\u00db\t\u00dc\t\u00dd\t\u00de\t\u00df\n\u00e0\t\u00e1\t\u00e2\t\u00e3\t\u00e4\t\u00e5\t\u00e6\t\u00e7\t\u00e8\t\u00e9\t\u00ea\t\u00eb\t\u00ec\t\u00ed\t\u00ee\t\u00ef\n\u00f0\t\u00f1\t\u00f2\t\u00f3\t\u00f4\t\u00f5\t\u00f6\t\u00f7\t\u00f8\t\u00f9\t\u00fa\t\u00fb\t\u00fc\t\u00fd\t\u00fe\t\u00ff", + "text": " ¡\t¢\t£\t¤\tÂ¥\t¦\t§\t¨\t©\tª\t«\t¬\tSHY\t®\t¯\n°\t±\t²\t³\t´\tµ\t¶\t·\t¸\t¹\tº\t»\t¼\t½\t¾\t¿\nĂ€\tĂ\tĂ‚\tĂƒ\tĂ„\tĂ…\tÆ\tÇ\tĂˆ\tÉ\tĂ\tĂ‹\tĂŒ\tĂ\tĂ\tĂ\nĂ\tĂ‘\tĂ’\tĂ“\tĂ”\tĂ•\tĂ–\tĂ—\tĂ˜\tĂ™\tĂ\tĂ›\tĂœ\tĂ\tĂ\tĂŸ\nĂ \tĂ¡\tĂ¢\tĂ£\tä\tĂ¥\tæ\tç\tè\tĂ©\tĂª\tĂ«\tì\tĂ­\tĂ®\tĂ¯\nð\tñ\tĂ²\tĂ³\tĂ´\tõ\tö\tĂ·\tø\tĂ¹\tĂº\tĂ»\tĂ¼\tĂ½\tĂ¾\tĂ¿", "metadata": { "languages": [ "por", diff --git a/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper-with-table.jpg.json b/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper-with-table.jpg.json index c71cf50967..b0354dcb4a 100644 --- a/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper-with-table.jpg.json +++ b/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper-with-table.jpg.json @@ -46,7 +46,7 @@ { "type": "Table", "element_id": "dddac446da6c93dc1449ecb5d997c423", - "text": "Dataset | Base Model\" Large Model | Notes PubLayNet [38] P/M M Layouts of modern scientific documents PRImA [3) M - Layouts of scanned 
modern magazines and scientific reports Newspaper [17] P - Layouts of scanned US newspapers from the 20th century \u2018TableBank (18) P P Table region on modern scientific and business document HJDataset (31) | F/M - Layouts of history Japanese documents", + "text": "Dataset | Base Model\" Large Model | Notes PubLayNet [38] P/M M Layouts of modern scientific documents PRImA [3) M - Layouts of scanned modern magazines and scientific reports Newspaper [17] P - Layouts of scanned US newspapers from the 20th century ‘TableBank (18) P P Table region on modern scientific and business document HJDataset (31) | F/M - Layouts of history Japanese documents", "metadata": { "text_as_html": "
    Dataset| Base Model!|Large Model| Notes
    PubLayNet [33]P/MMLayouts of modern scientific documents
    PRImA [3]MLayouts of scanned modern magazines and scientific reports
    Newspaper [17]PLayouts of scanned US newspapers from the 20th century
    TableBank [18]PTable region on modern scientific and business document
    HIDataset [31]P/MLayouts of history Japanese documents
    ", "filetype": "image/jpeg", @@ -69,7 +69,7 @@ { "type": "FigureCaption", "element_id": "a0c3c6b7e1e8c95016b989ef43c5ea2e", - "text": "2 For each dataset, we train several models of different sizes for different needs (the trade-off between accuracy vs. computational cost). For \u201cbase model\u201d and \u201clarge model\u201d, we refer to using the ResNet 50 or ResNet 101 backbones [13], respectively. One can train models of different architectures, like Faster R-CNN [28] (P) and Mask R-CNN [12] (M). For example, an F in the Large Model column indicates it has m Faster R-CNN model trained using the ResNet 101 backbone. The platform is maintained and a number of additions will be made to the model zoo in coming months.", + "text": "2 For each dataset, we train several models of different sizes for different needs (the trade-off between accuracy vs. computational cost). For “base model†and “large modelâ€, we refer to using the ResNet 50 or ResNet 101 backbones [13], respectively. One can train models of different architectures, like Faster R-CNN [28] (P) and Mask R-CNN [12] (M). For example, an F in the Large Model column indicates it has m Faster R-CNN model trained using the ResNet 101 backbone. 
The platform is maintained and a number of additions will be made to the model zoo in coming months.", "metadata": { "filetype": "image/jpeg", "languages": [ diff --git a/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json b/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json index 3f42ca335d..7c0e7324d2 100644 --- a/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json @@ -244,7 +244,7 @@ { "type": "Title", "element_id": "d3be9e3d661e2a79f37257caa5b54d8c", - "text": "LayoutParser: A Uni\ufb01ed Toolkit for Deep Learning Based Document Image Analysis", + "text": "LayoutParser: A Uniï¬ed Toolkit for Deep Learning Based Document Image Analysis", "metadata": { "filetype": "application/pdf", "languages": [ @@ -266,7 +266,7 @@ { "type": "NarrativeText", "element_id": "7cf062c1ba64938cc68c4fae61506d84", - "text": "Zejiang Shen! (4), Ruochen Zhang\u201d, Melissa Dell?, Benjamin Charles Germain Lee*, Jacob Carlson\u2019, and Weining Li>", + "text": "Zejiang Shen! (4), Ruochen Zhangâ€, Melissa Dell?, Benjamin Charles Germain Lee*, Jacob Carlson’, and Weining Li>", "metadata": { "filetype": "application/pdf", "languages": [ @@ -310,7 +310,7 @@ { "type": "NarrativeText", "element_id": "f1169388c7749db52e388e2fe4feaec6", - "text": "Abstract. Recent advances in document image analysis (DIA) have been primarily driven by the application of neural networks. Ideally, research outcomes could be easily deployed in production and extended for further investigation. 
However, various factors like loosely organized codebases and sophisticated model con\ufb01gurations complicate the easy reuse of im- portant innovations by a wide audience. Though there have been on-going e\ufb00orts to improve reusability and simplify deep learning (DL) model development in disciplines like natural language processing and computer vision, none of them are optimized for challenges in the domain of DIA. This represents a major gap in the existing toolkit, as DIA is central to academic research across a wide range of disciplines in the social sciences and humanities. This paper introduces LayoutParser, an open-source library for streamlining the usage of DL in DIA research and applica- tions. The core LayoutParser library comes with a set of simple and intuitive interfaces for applying and customizing DL models for layout de- tection, character recognition, and many other document processing tasks. To promote extensibility, LayoutParser also incorporates a community platform for sharing both pre-trained models and full document digiti- zation pipelines. We demonstrate that LayoutParser is helpful for both lightweight and large-scale digitization pipelines in real-word use cases. The library is publicly available at https://layout-parser.github.io.", + "text": "Abstract. Recent advances in document image analysis (DIA) have been primarily driven by the application of neural networks. Ideally, research outcomes could be easily deployed in production and extended for further investigation. However, various factors like loosely organized codebases and sophisticated model conï¬gurations complicate the easy reuse of im- portant innovations by a wide audience. Though there have been on-going efforts to improve reusability and simplify deep learning (DL) model development in disciplines like natural language processing and computer vision, none of them are optimized for challenges in the domain of DIA. 
This represents a major gap in the existing toolkit, as DIA is central to academic research across a wide range of disciplines in the social sciences and humanities. This paper introduces LayoutParser, an open-source library for streamlining the usage of DL in DIA research and applica- tions. The core LayoutParser library comes with a set of simple and intuitive interfaces for applying and customizing DL models for layout de- tection, character recognition, and many other document processing tasks. To promote extensibility, LayoutParser also incorporates a community platform for sharing both pre-trained models and full document digiti- zation pipelines. We demonstrate that LayoutParser is helpful for both lightweight and large-scale digitization pipelines in real-word use cases. The library is publicly available at https://layout-parser.github.io.", "metadata": { "links": [ { @@ -339,7 +339,7 @@ { "type": "NarrativeText", "element_id": "caffc7480fdd82a089ae387e01aabdb9", - "text": "Keywords: Document Image Analysis \u00b7 Deep Learning \u00b7 Layout Analysis \u00b7 Character Recognition \u00b7 Open Source library \u00b7 Toolkit.", + "text": "Keywords: Document Image Analysis · Deep Learning · Layout Analysis · Character Recognition · Open Source library · Toolkit.", "metadata": { "filetype": "application/pdf", "languages": [ @@ -383,7 +383,7 @@ { "type": "NarrativeText", "element_id": "8de96d1e80af35f9b6954252e14c2caf", - "text": "Deep Learning(DL)-based approaches are the state-of-the-art for a wide range of document image analysis (DIA) tasks including document image classi\ufb01cation [11,", + "text": "Deep Learning(DL)-based approaches are the state-of-the-art for a wide range of document image analysis (DIA) tasks including document image classiï¬cation [11,", "metadata": { "links": [ { @@ -434,7 +434,7 @@ { "type": "NarrativeText", "element_id": "4b097cc42d7d30e720512dbce0cb4905", - "text": "37], layout detection [38, 22], table detection [26], and scene text 
detection [4]. A generalized learning-based framework dramatically reduces the need for the manual speci\ufb01cation of complicated rules, which is the status quo with traditional methods. DL has the potential to transform DIA pipelines and bene\ufb01t a broad spectrum of large-scale document digitization projects.", + "text": "37], layout detection [38, 22], table detection [26], and scene text detection [4]. A generalized learning-based framework dramatically reduces the need for the manual speciï¬cation of complicated rules, which is the status quo with traditional methods. DL has the potential to transform DIA pipelines and beneï¬t a broad spectrum of large-scale document digitization projects.", "metadata": { "links": [ { @@ -483,7 +483,7 @@ { "type": "NarrativeText", "element_id": "45844a4901777afaf6de9a0994e017eb", - "text": "However, there are several practical di\ufb03culties for taking advantages of re- cent advances in DL-based methods: 1) DL models are notoriously convoluted for reuse and extension. Existing models are developed using distinct frame- works like TensorFlow [1] or PyTorch [24], and the high-level parameters can be obfuscated by implementation details [8]. It can be a time-consuming and frustrating experience to debug, reproduce, and adapt existing models for DIA, and many researchers who would bene\ufb01t the most from using these methods lack the technical background to implement them from scratch. 2) Document images contain diverse and disparate patterns across domains, and customized training is often required to achieve a desirable detection accuracy. Currently there is no full-\ufb02edged infrastructure for easily curating the target document image datasets and \ufb01ne-tuning or re-training the models. 3) DIA usually requires a sequence of models and other processing to obtain the \ufb01nal outputs. 
Often research teams use DL models and then perform further document analyses in separate processes, and these pipelines are not documented in any central location (and often not documented at all). This makes it di\ufb03cult for research teams to learn about how full pipelines are implemented and leads them to invest signi\ufb01cant resources in reinventing the DIA wheel.", + "text": "However, there are several practical difficulties for taking advantages of re- cent advances in DL-based methods: 1) DL models are notoriously convoluted for reuse and extension. Existing models are developed using distinct frame- works like TensorFlow [1] or PyTorch [24], and the high-level parameters can be obfuscated by implementation details [8]. It can be a time-consuming and frustrating experience to debug, reproduce, and adapt existing models for DIA, and many researchers who would beneï¬t the most from using these methods lack the technical background to implement them from scratch. 2) Document images contain diverse and disparate patterns across domains, and customized training is often required to achieve a desirable detection accuracy. Currently there is no full-fledged infrastructure for easily curating the target document image datasets and ï¬ne-tuning or re-training the models. 3) DIA usually requires a sequence of models and other processing to obtain the ï¬nal outputs. Often research teams use DL models and then perform further document analyses in separate processes, and these pipelines are not documented in any central location (and often not documented at all). This makes it difficult for research teams to learn about how full pipelines are implemented and leads them to invest signiï¬cant resources in reinventing the DIA wheel.", "metadata": { "links": [ { @@ -522,7 +522,7 @@ { "type": "NarrativeText", "element_id": "6f3c8d55dd5a4f95d8a59d146ca9ffa7", - "text": "LayoutParser provides a uni\ufb01ed toolkit to support DL-based document image analysis and processing. 
To address the aforementioned challenges, LayoutParser is built with the following components:", + "text": "LayoutParser provides a uniï¬ed toolkit to support DL-based document image analysis and processing. To address the aforementioned challenges, LayoutParser is built with the following components:", "metadata": { "filetype": "application/pdf", "languages": [ @@ -544,7 +544,7 @@ { "type": "ListItem", "element_id": "9ce12a49c1a9972b4cd2c3f66595b2b6", - "text": "1. An o\ufb00-the-shelf toolkit for applying DL models for layout detection, character recognition, and other DIA tasks (Section 3)", + "text": "1. An off-the-shelf toolkit for applying DL models for layout detection, character recognition, and other DIA tasks (Section 3)", "metadata": { "filetype": "application/pdf", "languages": [ @@ -566,7 +566,7 @@ { "type": "ListItem", "element_id": "40f42a96bdd1559e09d74090c0fe9df3", - "text": "2. A rich repository of pre-trained neural network models (Model Zoo) that underlies the o\ufb00-the-shelf usage", + "text": "2. A rich repository of pre-trained neural network models (Model Zoo) that underlies the off-the-shelf usage", "metadata": { "filetype": "application/pdf", "languages": [ @@ -588,7 +588,7 @@ { "type": "ListItem", "element_id": "0ca448d3ae0c4ee73bf46e8edfcd417d", - "text": "3. Comprehensive tools for e\ufb03cient document image data annotation and model tuning to support di\ufb00erent levels of customization", + "text": "3. Comprehensive tools for efficient document image data annotation and model tuning to support different levels of customization", "metadata": { "filetype": "application/pdf", "languages": [ @@ -632,7 +632,7 @@ { "type": "NarrativeText", "element_id": "8e216e91ff3471241858f1df445cdf0a", - "text": "The library implements simple and intuitive Python APIs without sacri\ufb01cing generalizability and versatility, and can be easily installed via pip. 
Its convenient functions for handling document image data can be seamlessly integrated with existing DIA pipelines. With detailed documentations and carefully curated tutorials, we hope this tool will bene\ufb01t a variety of end-users, and will lead to advances in applications in both industry and academic research.", + "text": "The library implements simple and intuitive Python APIs without sacriï¬cing generalizability and versatility, and can be easily installed via pip. Its convenient functions for handling document image data can be seamlessly integrated with existing DIA pipelines. With detailed documentations and carefully curated tutorials, we hope this tool will beneï¬t a variety of end-users, and will lead to advances in applications in both industry and academic research.", "metadata": { "filetype": "application/pdf", "languages": [ @@ -654,7 +654,7 @@ { "type": "NarrativeText", "element_id": "583775f22c8080098beebbef960e2fbf", - "text": "LayoutParser is well aligned with recent e\ufb00orts for improving DL model reusability in other disciplines like natural language processing [8, 34] and com- puter vision [35], but with a focus on unique challenges in DIA. We show LayoutParser can be applied in sophisticated and large-scale digitization projects", + "text": "LayoutParser is well aligned with recent efforts for improving DL model reusability in other disciplines like natural language processing [8, 34] and com- puter vision [35], but with a focus on unique challenges in DIA. 
We show LayoutParser can be applied in sophisticated and large-scale digitization projects", "metadata": { "links": [ { @@ -693,7 +693,7 @@ { "type": "Header", "element_id": "f5a6697190c20bf6030d8e4ae8f6861a", - "text": "LayoutParser: A Uni\ufb01ed Toolkit for DL-Based DIA", + "text": "LayoutParser: A Uniï¬ed Toolkit for DL-Based DIA", "metadata": { "filetype": "application/pdf", "languages": [ @@ -715,7 +715,7 @@ { "type": "NarrativeText", "element_id": "50846086f4d9ece02052735686278699", - "text": "that require precision, e\ufb03ciency, and robustness, as well as simple and light- weight document processing tasks focusing on e\ufb03cacy and \ufb02exibility (Section 5). LayoutParser is being actively maintained, and support for more deep learning models and novel methods in text-based layout analysis methods [37, 34] is planned.", + "text": "that require precision, efficiency, and robustness, as well as simple and light- weight document processing tasks focusing on efficacy and flexibility (Section 5). LayoutParser is being actively maintained, and support for more deep learning models and novel methods in text-based layout analysis methods [37, 34] is planned.", "metadata": { "links": [ { @@ -825,7 +825,7 @@ { "type": "NarrativeText", "element_id": "8153390c1bb8652313be64034531449e", - "text": "Recently, various DL models and datasets have been developed for layout analysis tasks. The dhSegment [22] utilizes fully convolutional networks [20] for segmen- tation tasks on historical documents. Object detection-based methods like Faster R-CNN [28] and Mask R-CNN [12] are used for identifying document elements [38] and detecting tables [30, 26]. Most recently, Graph Neural Networks [29] have also been used in table detection [27]. However, these models are usually implemented individually and there is no uni\ufb01ed framework to load and use such models.", + "text": "Recently, various DL models and datasets have been developed for layout analysis tasks. 
The dhSegment [22] utilizes fully convolutional networks [20] for segmen- tation tasks on historical documents. Object detection-based methods like Faster R-CNN [28] and Mask R-CNN [12] are used for identifying document elements [38] and detecting tables [30, 26]. Most recently, Graph Neural Networks [29] have also been used in table detection [27]. However, these models are usually implemented individually and there is no uniï¬ed framework to load and use such models.", "metadata": { "links": [ { @@ -973,7 +973,7 @@ { "type": "NarrativeText", "element_id": "73feaff827cbc7089d3f95d1e5aac6aa", - "text": "Recent years have also seen numerous e\ufb00orts to create libraries for promoting reproducibility and reusability in the \ufb01eld of DL. Libraries like Dectectron2 [35],", + "text": "Recent years have also seen numerous efforts to create libraries for promoting reproducibility and reusability in the ï¬eld of DL. Libraries like Dectectron2 [35],", "metadata": { "links": [ { @@ -1002,7 +1002,7 @@ { "type": "Footer", "element_id": "b1fa4bbd1bdda08489faab5bf3adf5cc", - "text": "6 The number shown is obtained by specifying the search type as \u2018code\u2019.", + "text": "6 The number shown is obtained by specifying the search type as ‘code’.", "metadata": { "filetype": "application/pdf", "languages": [ @@ -1200,7 +1200,7 @@ { "type": "Image", "element_id": "642416e5d6c99219b16dbba6f72392c5", - "text": "Efficient Data Annotation Model Customization Document Images Community Platform \u2018a >) \u00a5 DIA Model Hub i .) Customized Model Training] == | Layout Detection Models | \u2014\u2014= DIA Pipeline Sharing ~ OCR Module = { Layout Data stuctue ) = (storage Visualization VY", + "text": "Efficient Data Annotation Model Customization Document Images Community Platform ‘a >) Â¥ DIA Model Hub i .) 
Customized Model Training] == | Layout Detection Models | ——= DIA Pipeline Sharing ~ OCR Module = { Layout Data stuctue ) = (storage Visualization VY", "metadata": { "filetype": "application/pdf", "languages": [ @@ -1222,7 +1222,7 @@ { "type": "NarrativeText", "element_id": "466f0bc21599ccf0fa27c021cb023f90", - "text": "Fig.1: The overall architecture of LayoutParser. For an input document image, the core LayoutParser library provides a set of o\ufb00-the-shelf tools for layout detection, OCR, visualization, and storage, backed by a carefully designed layout data structure. LayoutParser also supports high level customization via e\ufb03cient layout annotation and model training functions. These improve model accuracy on the target samples. The community platform enables the easy sharing of DIA models and whole digitization pipelines to promote reusability and reproducibility. A collection of detailed documentation, tutorials and exemplar projects make LayoutParser easy to learn and use.", + "text": "Fig.1: The overall architecture of LayoutParser. For an input document image, the core LayoutParser library provides a set of off-the-shelf tools for layout detection, OCR, visualization, and storage, backed by a carefully designed layout data structure. LayoutParser also supports high level customization via efficient layout annotation and model training functions. These improve model accuracy on the target samples. The community platform enables the easy sharing of DIA models and whole digitization pipelines to promote reusability and reproducibility. 
A collection of detailed documentation, tutorials and exemplar projects make LayoutParser easy to learn and use.", "metadata": { "filetype": "application/pdf", "languages": [ @@ -1244,7 +1244,7 @@ { "type": "NarrativeText", "element_id": "b4948db85ca791e99aa92589fc41734f", - "text": "AllenNLP [8] and transformers [34] have provided the community with complete DL-based support for developing and deploying models for general computer vision and natural language processing problems. LayoutParser, on the other hand, specializes speci\ufb01cally in DIA tasks. LayoutParser is also equipped with a community platform inspired by established model hubs such as Torch Hub [23] and TensorFlow Hub [1]. It enables the sharing of pretrained models as well as full document processing pipelines that are unique to DIA tasks.", + "text": "AllenNLP [8] and transformers [34] have provided the community with complete DL-based support for developing and deploying models for general computer vision and natural language processing problems. LayoutParser, on the other hand, specializes speciï¬cally in DIA tasks. LayoutParser is also equipped with a community platform inspired by established model hubs such as Torch Hub [23] and TensorFlow Hub [1]. It enables the sharing of pretrained models as well as full document processing pipelines that are unique to DIA tasks.", "metadata": { "links": [ { @@ -1288,7 +1288,7 @@ { "type": "NarrativeText", "element_id": "7651db80014a85ab253367d3bd3e4f88", - "text": "There have been a variety of document data collections to facilitate the development of DL models. Some examples include PRImA [3](magazine layouts), PubLayNet [38](academic paper layouts), Table Bank [18](tables in academic papers), Newspaper Navigator Dataset [16, 17](newspaper \ufb01gure layouts) and HJDataset [31](historical Japanese document layouts). 
A spectrum of models trained on these datasets are currently available in the LayoutParser model zoo to support di\ufb00erent use cases.", + "text": "There have been a variety of document data collections to facilitate the development of DL models. Some examples include PRImA [3](magazine layouts), PubLayNet [38](academic paper layouts), Table Bank [18](tables in academic papers), Newspaper Navigator Dataset [16, 17](newspaper ï¬gure layouts) and HJDataset [31](historical Japanese document layouts). A spectrum of models trained on these datasets are currently available in the LayoutParser model zoo to support different use cases.", "metadata": { "links": [ { @@ -1364,7 +1364,7 @@ { "type": "NarrativeText", "element_id": "47e45d28d96fc14ddc709835de35ece5", - "text": "At the core of LayoutParser is an o\ufb00-the-shelf toolkit that streamlines DL- based document image analysis. Five components support a simple interface with comprehensive functionalities: 1) The layout detection models enable using pre-trained or self-trained DL models for layout detection with just four lines of code. 2) The detected layout information is stored in carefully engineered", + "text": "At the core of LayoutParser is an off-the-shelf toolkit that streamlines DL- based document image analysis. Five components support a simple interface with comprehensive functionalities: 1) The layout detection models enable using pre-trained or self-trained DL models for layout detection with just four lines of code. 
2) The detected layout information is stored in carefully engineered", "metadata": { "filetype": "application/pdf", "languages": [ @@ -1386,7 +1386,7 @@ { "type": "ListItem", "element_id": "cd1112d2b15a0d27a29b1c83b2afd0dd", - "text": "LayoutParser: A Uni\ufb01ed Toolkit for DL-Based DIA", + "text": "LayoutParser: A Uniï¬ed Toolkit for DL-Based DIA", "metadata": { "filetype": "application/pdf", "languages": [ @@ -1430,7 +1430,7 @@ { "type": "Table", "element_id": "cb534ba64da736dc53d60b660f5e1153", - "text": "Dataset Base Model1 Large Model Notes PubLayNet [38] F / M M Layouts of modern scienti\ufb01c documents PRImA [3] M - Layouts of scanned modern magazines and scienti\ufb01c reports Newspaper [17] F - Layouts of scanned US newspapers from the 20th century TableBank [18] F F Table region on modern scienti\ufb01c and business document HJDataset [31] F / M - Layouts of history Japanese documents", + "text": "Dataset Base Model1 Large Model Notes PubLayNet [38] F / M M Layouts of modern scientiï¬c documents PRImA [3] M - Layouts of scanned modern magazines and scientiï¬c reports Newspaper [17] F - Layouts of scanned US newspapers from the 20th century TableBank [18] F F Table region on modern scientiï¬c and business document HJDataset [31] F / M - Layouts of history Japanese documents", "metadata": { "links": [ { @@ -1480,7 +1480,7 @@ { "type": "FigureCaption", "element_id": "f978160527177fa39c13774ec8dfa9cb", - "text": "1 For each dataset, we train several models of di\ufb00erent sizes for di\ufb00erent needs (the trade-o\ufb00 between accuracy vs. computational cost). For \u201cbase model\u201d and \u201clarge model\u201d, we refer to using the ResNet 50 or ResNet 101 backbones [13], respectively. One can train models of di\ufb00erent architectures, like Faster R-CNN [28] (F) and Mask R-CNN [12] (M). For example, an F in the Large Model column indicates it has a Faster R-CNN model trained using the ResNet 101 backbone. 
The platform is maintained and a number of additions will be made to the model zoo in coming months.", + "text": "1 For each dataset, we train several models of different sizes for different needs (the trade-off between accuracy vs. computational cost). For “base model†and “large modelâ€, we refer to using the ResNet 50 or ResNet 101 backbones [13], respectively. One can train models of different architectures, like Faster R-CNN [28] (F) and Mask R-CNN [12] (M). For example, an F in the Large Model column indicates it has a Faster R-CNN model trained using the ResNet 101 backbone. The platform is maintained and a number of additions will be made to the model zoo in coming months.", "metadata": { "links": [ { @@ -1519,7 +1519,7 @@ { "type": "NarrativeText", "element_id": "55b33df7609960c3552a0b7bc1a5a9c6", - "text": "layout data structures, which are optimized for e\ufb03ciency and versatility. 3) When necessary, users can employ existing or customized OCR models via the uni\ufb01ed API provided in the OCR module. 4) LayoutParser comes with a set of utility functions for the visualization and storage of the layout data. 5) LayoutParser is also highly customizable, via its integration with functions for layout data annotation and model training. We now provide detailed descriptions for each component.", + "text": "layout data structures, which are optimized for efficiency and versatility. 3) When necessary, users can employ existing or customized OCR models via the uniï¬ed API provided in the OCR module. 4) LayoutParser comes with a set of utility functions for the visualization and storage of the layout data. 5) LayoutParser is also highly customizable, via its integration with functions for layout data annotation and model training. 
We now provide detailed descriptions for each component.", "metadata": { "filetype": "application/pdf", "languages": [ @@ -1563,7 +1563,7 @@ { "type": "NarrativeText", "element_id": "bbcc10c2b92de0cbdce8629f18b0d7ad", - "text": "In LayoutParser, a layout model takes a document image as an input and generates a list of rectangular boxes for the target content regions. Di\ufb00erent from traditional methods, it relies on deep convolutional neural networks rather than manually curated rules to identify content regions. It is formulated as an object detection problem and state-of-the-art models like Faster R-CNN [28] and Mask R-CNN [12] are used. This yields prediction results of high accuracy and makes it possible to build a concise, generalized interface for layout detection. LayoutParser, built upon Detectron2 [35], provides a minimal API that can perform layout detection with only four lines of code in Python:", + "text": "In LayoutParser, a layout model takes a document image as an input and generates a list of rectangular boxes for the target content regions. Different from traditional methods, it relies on deep convolutional neural networks rather than manually curated rules to identify content regions. It is formulated as an object detection problem and state-of-the-art models like Faster R-CNN [28] and Mask R-CNN [12] are used. This yields prediction results of high accuracy and makes it possible to build a concise, generalized interface for layout detection. LayoutParser, built upon Detectron2 [35], provides a minimal API that can perform layout detection with only four lines of code in Python:", "metadata": { "links": [ { @@ -1690,7 +1690,7 @@ { "type": "NarrativeText", "element_id": "f888c5e8f5b1339f2af75612ea13c719", - "text": "LayoutParser provides a wealth of pre-trained model weights using various datasets covering di\ufb00erent languages, time periods, and document types. 
Due to domain shift [7], the prediction performance can notably drop when models are ap- plied to target samples that are signi\ufb01cantly di\ufb00erent from the training dataset. As document structures and layouts vary greatly in di\ufb00erent domains, it is important to select models trained on a dataset similar to the test samples. A semantic syntax is used for initializing the model weights in LayoutParser, using both the dataset name and model name lp:///.", + "text": "LayoutParser provides a wealth of pre-trained model weights using various datasets covering different languages, time periods, and document types. Due to domain shift [7], the prediction performance can notably drop when models are ap- plied to target samples that are signiï¬cantly different from the training dataset. As document structures and layouts vary greatly in different domains, it is important to select models trained on a dataset similar to the test samples. A semantic syntax is used for initializing the model weights in LayoutParser, using both the dataset name and model name lp:///.", "metadata": { "links": [ { @@ -1763,7 +1763,7 @@ { "type": "Image", "element_id": "6eb2bb6ca50b3be177565f9ff546bce8", - "text": "- \u00b0 . 3 a a 4 a 3 oo er \u2018 2 \u00a7 8 a 8 3 3 \u2018 \u00a3 4 A g a 9 \u2018 3 \u00a5 Coordinate g 4 5 3 + \u00a7 3 H Extra Features [O=\") [Bo] eaing i Text | | Type | | ower \u00b0 & a \u00a2 o [ coordinatel textblock1, 3 3 \u2019 g Q 3 , textblock2 , layoutl ] 4 q \u00ae A list of the layout elements Ff", + "text": "- ° . 
3 a a 4 a 3 oo er ‘ 2 § 8 a 8 3 3 ‘ £ 4 A g a 9 ‘ 3 Â¥ Coordinate g 4 5 3 + § 3 H Extra Features [O=\") [Bo] eaing i Text | | Type | | ower ° & a ¢ o [ coordinatel textblock1, 3 3 ’ g Q 3 , textblock2 , layoutl ] 4 q ® A list of the layout elements Ff", "metadata": { "filetype": "application/pdf", "languages": [ @@ -1785,7 +1785,7 @@ { "type": "FigureCaption", "element_id": "9f11aa6b22dea1bba7eb0d122c0c5562", - "text": "Fig.2: The relationship between the three types of layout data structures. Coordinate supports three kinds of variation; TextBlock consists of the co- ordinate information and extra features like block text, types, and reading orders; a Layout object is a list of all possible layout elements, including other Layout objects. They all support the same set of transformation and operation APIs for maximum \ufb02exibility.", + "text": "Fig.2: The relationship between the three types of layout data structures. Coordinate supports three kinds of variation; TextBlock consists of the co- ordinate information and extra features like block text, types, and reading orders; a Layout object is a list of all possible layout elements, including other Layout objects. They all support the same set of transformation and operation APIs for maximum flexibility.", "metadata": { "filetype": "application/pdf", "languages": [ @@ -1807,7 +1807,7 @@ { "type": "NarrativeText", "element_id": "d997f63fd79c7e03050ca01b58dfdf0a", - "text": "Shown in Table 1, LayoutParser currently hosts 9 pre-trained models trained on 5 di\ufb00erent datasets. Description of the training dataset is provided alongside with the trained models such that users can quickly identify the most suitable models for their tasks. 
Additionally, when such a model is not readily available, LayoutParser also supports training customized layout models and community sharing of the models (detailed in Section 3.5).", + "text": "Shown in Table 1, LayoutParser currently hosts 9 pre-trained models trained on 5 different datasets. Description of the training dataset is provided alongside with the trained models such that users can quickly identify the most suitable models for their tasks. Additionally, when such a model is not readily available, LayoutParser also supports training customized layout models and community sharing of the models (detailed in Section 3.5).", "metadata": { "links": [ { @@ -1858,7 +1858,7 @@ { "type": "NarrativeText", "element_id": "601f7d95172984c75de081023ca64c15", - "text": "A critical feature of LayoutParser is the implementation of a series of data structures and operations that can be used to e\ufb03ciently process and manipulate the layout elements. In document image analysis pipelines, various post-processing on the layout analysis model outputs is usually required to obtain the \ufb01nal outputs. Traditionally, this requires exporting DL model outputs and then loading the results into other pipelines. All model outputs from LayoutParser will be stored in carefully engineered data types optimized for further processing, which makes it possible to build an end-to-end document digitization pipeline within LayoutParser. There are three key components in the data structure, namely the Coordinate system, the TextBlock, and the Layout. They provide di\ufb00erent levels of abstraction for the layout data, and a set of APIs are supported for transformations or operations on these classes.", + "text": "A critical feature of LayoutParser is the implementation of a series of data structures and operations that can be used to efficiently process and manipulate the layout elements. 
In document image analysis pipelines, various post-processing on the layout analysis model outputs is usually required to obtain the ï¬nal outputs. Traditionally, this requires exporting DL model outputs and then loading the results into other pipelines. All model outputs from LayoutParser will be stored in carefully engineered data types optimized for further processing, which makes it possible to build an end-to-end document digitization pipeline within LayoutParser. There are three key components in the data structure, namely the Coordinate system, the TextBlock, and the Layout. They provide different levels of abstraction for the layout data, and a set of APIs are supported for transformations or operations on these classes.", "metadata": { "filetype": "application/pdf", "languages": [ @@ -1880,7 +1880,7 @@ { "type": "ListItem", "element_id": "48d58ed9a3d95637df68c8b810147ba1", - "text": "LayoutParser: A Uni\ufb01ed Toolkit for DL-Based DIA", + "text": "LayoutParser: A Uniï¬ed Toolkit for DL-Based DIA", "metadata": { "filetype": "application/pdf", "languages": [ @@ -1902,7 +1902,7 @@ { "type": "NarrativeText", "element_id": "dcdc0dc4759bd20c04026973cbe386e2", - "text": "Coordinates are the cornerstones for storing layout information. Currently, three types of Coordinate data structures are provided in LayoutParser, shown in Figure 2. Interval and Rectangle are the most common data types and support specifying 1D or 2D regions within a document. They are parameterized with 2 and 4 parameters. A Quadrilateral class is also implemented to support a more generalized representation of rectangular regions when the document is skewed or distorted, where the 4 corner points can be speci\ufb01ed and a total of 8 degrees of freedom are supported. A wide collection of transformations like shift, pad, and scale, and operations like intersect, union, and is_in, are supported for these classes. 
Notably, it is common to separate a segment of the image and analyze it individually. LayoutParser provides full support for this scenario via image cropping operations crop_image and coordinate transformations like relative_to and condition_on that transform coordinates to and from their relative representations. We refer readers to Table 2 for a more detailed description of these operations13.", + "text": "Coordinates are the cornerstones for storing layout information. Currently, three types of Coordinate data structures are provided in LayoutParser, shown in Figure 2. Interval and Rectangle are the most common data types and support specifying 1D or 2D regions within a document. They are parameterized with 2 and 4 parameters. A Quadrilateral class is also implemented to support a more generalized representation of rectangular regions when the document is skewed or distorted, where the 4 corner points can be speciï¬ed and a total of 8 degrees of freedom are supported. A wide collection of transformations like shift, pad, and scale, and operations like intersect, union, and is_in, are supported for these classes. Notably, it is common to separate a segment of the image and analyze it individually. LayoutParser provides full support for this scenario via image cropping operations crop_image and coordinate transformations like relative_to and condition_on that transform coordinates to and from their relative representations. We refer readers to Table 2 for a more detailed description of these operations13.", "metadata": { "links": [ { @@ -1941,7 +1941,7 @@ { "type": "NarrativeText", "element_id": "3f620e1ad95cd446170613ed9d780853", - "text": "Based on Coordinates, we implement the TextBlock class that stores both the positional and extra features of individual layout elements. It also supports specifying the reading orders via setting the parent \ufb01eld to the index of the parent object. 
A Layout class is built that takes in a list of TextBlocks and supports processing the elements in batch. Layout can also be nested to support hierarchical layout structures. They support the same operations and transformations as the Coordinate classes, minimizing both learning and deployment e\ufb00ort.", + "text": "Based on Coordinates, we implement the TextBlock class that stores both the positional and extra features of individual layout elements. It also supports specifying the reading orders via setting the parent ï¬eld to the index of the parent object. A Layout class is built that takes in a list of TextBlocks and supports processing the elements in batch. Layout can also be nested to support hierarchical layout structures. They support the same operations and transformations as the Coordinate classes, minimizing both learning and deployment effort.", "metadata": { "filetype": "application/pdf", "languages": [ @@ -1985,7 +1985,7 @@ { "type": "NarrativeText", "element_id": "16565416942e53cf65f75a8a845df211", - "text": "LayoutParser provides a uni\ufb01ed interface for existing OCR tools. Though there are many OCR tools available, they are usually con\ufb01gured di\ufb00erently with distinct APIs or protocols for using them. It can be ine\ufb03cient to add new OCR tools into an existing pipeline, and di\ufb03cult to make direct comparisons among the available tools to \ufb01nd the best option for a particular project. To this end, LayoutParser builds a series of wrappers among existing OCR engines, and provides nearly the same syntax for using them. It supports a plug-and-play style of using OCR engines, making it e\ufb00ortless to switch, evaluate, and compare di\ufb00erent OCR modules:", + "text": "LayoutParser provides a uniï¬ed interface for existing OCR tools. Though there are many OCR tools available, they are usually conï¬gured differently with distinct APIs or protocols for using them. 
It can be inefficient to add new OCR tools into an existing pipeline, and difficult to make direct comparisons among the available tools to ï¬nd the best option for a particular project. To this end, LayoutParser builds a series of wrappers among existing OCR engines, and provides nearly the same syntax for using them. It supports a plug-and-play style of using OCR engines, making it effortless to switch, evaluate, and compare different OCR modules:", "metadata": { "filetype": "application/pdf", "languages": [ @@ -2051,7 +2051,7 @@ { "type": "NarrativeText", "element_id": "fa023ccf2ac1042ef254ecf47cc592ca", - "text": "LayoutParser also comes with a DL-based CNN-RNN OCR model [6] trained with the Connectionist Temporal Classi\ufb01cation (CTC) loss [10]. It can be used like the other OCR modules, and can be easily trained on customized datasets.", + "text": "LayoutParser also comes with a DL-based CNN-RNN OCR model [6] trained with the Connectionist Temporal Classiï¬cation (CTC) loss [10]. It can be used like the other OCR modules, and can be easily trained on customized datasets.", "metadata": { "links": [ { @@ -2129,7 +2129,7 @@ { "type": "NarrativeText", "element_id": "a5ce184b53898a543bca90a5b0acd156", - "text": "Table 2: All operations supported by the layout elements. The same APIs are supported across di\ufb00erent layout element classes including Coordinate types, TextBlock and Layout.", + "text": "Table 2: All operations supported by the layout elements. 
The same APIs are supported across different layout element classes including Coordinate types, TextBlock and Layout.", "metadata": { "filetype": "application/pdf", "languages": [ @@ -2151,9 +2151,9 @@ { "type": "Table", "element_id": "64bc79d1132a89c71837f420d6e4e2dc", - "text": "Operation Name Description block.pad(top, bottom, right, left) Enlarge the current block according to the input block.scale(fx, fy) Scale the current block given the ratio in x and y direction block.shift(dx, dy) Move the current block with the shift distances in x and y direction block1.is in(block2) Whether block1 is inside of block2 block1.intersect(block2) Return the intersection region of block1 and block2. Coordinate type to be determined based on the inputs. block1.union(block2) Return the union region of block1 and block2. Coordinate type to be determined based on the inputs. block1.relative to(block2) Convert the absolute coordinates of block1 to relative coordinates to block2 block1.condition on(block2) Calculate the absolute coordinates of block1 given the canvas block2\u2019s absolute coordinates block.crop image(image) Obtain the image segments in the block region", + "text": "Operation Name Description block.pad(top, bottom, right, left) Enlarge the current block according to the input block.scale(fx, fy) Scale the current block given the ratio in x and y direction block.shift(dx, dy) Move the current block with the shift distances in x and y direction block1.is in(block2) Whether block1 is inside of block2 block1.intersect(block2) Return the intersection region of block1 and block2. Coordinate type to be determined based on the inputs. block1.union(block2) Return the union region of block1 and block2. Coordinate type to be determined based on the inputs. 
block1.relative to(block2) Convert the absolute coordinates of block1 to relative coordinates to block2 block1.condition on(block2) Calculate the absolute coordinates of block1 given the canvas block2’s absolute coordinates block.crop image(image) Obtain the image segments in the block region", "metadata": { - "text_as_html": "
    block.pad(top, bottom,right,left)Enlarge the current block according to the input
    block.scale(fx, fy)Scale the current block given the ratio in x and y direction
    block.shift(dx, dy)Move the current block with the shift distances in x and y direction
    block1.is_in(block2)Whether block] is inside of block2
    block1. intersect (block2)Return the intersection region of blockl and block2. Coordinate type to be determined based on the inputs
    block1.union(block2)Return the union region of blockl and block2. Coordinate type to be determined based on the inputs
    block1.relative_to(block2)Convert the absolute coordinates of block to relative coordinates to block2
    block1.condition_on(block2)Calculate the absolute coordinates of blockl given the canvas block2\u2019s absolute coordinates
    block. crop_image (image)Obtain the image segments in the block region
    ", + "text_as_html": "
    block.pad(top, bottom,right,left)Enlarge the current block according to the input
    block.scale(fx, fy)Scale the current block given the ratio in x and y direction
    block.shift(dx, dy)Move the current block with the shift distances in x and y direction
    block1.is_in(block2)Whether block] is inside of block2
    block1. intersect (block2)Return the intersection region of blockl and block2. Coordinate type to be determined based on the inputs
    block1.union(block2)Return the union region of blockl and block2. Coordinate type to be determined based on the inputs
    block1.relative_to(block2)Convert the absolute coordinates of block to relative coordinates to block2
    block1.condition_on(block2)Calculate the absolute coordinates of blockl given the canvas block2’s absolute coordinates
    block. crop_image (image)Obtain the image segments in the block region
    ", "filetype": "application/pdf", "languages": [ "eng" @@ -2196,7 +2196,7 @@ { "type": "NarrativeText", "element_id": "afa5f1dc8b4ce5598f278992d818eaa9", - "text": "The end goal of DIA is to transform the image-based document data into a structured database. LayoutParser supports exporting layout data into di\ufb00erent formats like JSON, csv, and will add the support for the METS/ALTO XML format 14 . It can also load datasets from layout analysis-speci\ufb01c formats like COCO [38] and the Page Format [25] for training layout models (Section 3.5).", + "text": "The end goal of DIA is to transform the image-based document data into a structured database. LayoutParser supports exporting layout data into different formats like JSON, csv, and will add the support for the METS/ALTO XML format 14 . It can also load datasets from layout analysis-speciï¬c formats like COCO [38] and the Page Format [25] for training layout models (Section 3.5).", "metadata": { "links": [ { @@ -2240,7 +2240,7 @@ { "type": "NarrativeText", "element_id": "28aeb996f497c9d01d06e564483d0854", - "text": "Visualization of the layout detection results is critical for both presentation and debugging. LayoutParser is built with an integrated API for displaying the layout information along with the original document image. Shown in Figure 3, it enables presenting layout data with rich meta information and features in di\ufb00erent modes. More detailed information can be found in the online LayoutParser documentation page.", + "text": "Visualization of the layout detection results is critical for both presentation and debugging. LayoutParser is built with an integrated API for displaying the layout information along with the original document image. Shown in Figure 3, it enables presenting layout data with rich meta information and features in different modes. 
More detailed information can be found in the online LayoutParser documentation page.", "metadata": { "links": [ { @@ -2291,7 +2291,7 @@ { "type": "NarrativeText", "element_id": "05e5f4e2a196db34263541d1ecebe297", - "text": "Besides the o\ufb00-the-shelf library, LayoutParser is also highly customizable with supports for highly unique and challenging document analysis tasks. Target document images can be vastly di\ufb00erent from the existing datasets for train- ing layout models, which leads to low layout detection accuracy. Training data", + "text": "Besides the off-the-shelf library, LayoutParser is also highly customizable with supports for highly unique and challenging document analysis tasks. Target document images can be vastly different from the existing datasets for train- ing layout models, which leads to low layout detection accuracy. Training data", "metadata": { "filetype": "application/pdf", "languages": [ @@ -2335,7 +2335,7 @@ { "type": "ListItem", "element_id": "c069937e6c2bfc0f856835f3af4d6181", - "text": "LayoutParser: A Uni\ufb01ed Toolkit for DL-Based DIA", + "text": "LayoutParser: A Uniï¬ed Toolkit for DL-Based DIA", "metadata": { "filetype": "application/pdf", "languages": [ @@ -2379,7 +2379,7 @@ { "type": "NarrativeText", "element_id": "4d1b9566e792683b9559b778be4f4046", - "text": "Fig.3: Layout detection and OCR results visualization generated by the LayoutParser APIs. Mode I directly overlays the layout region bounding boxes and categories over the original image. Mode II recreates the original document via drawing the OCR\u2019d texts at their corresponding positions on the image canvas. In this \ufb01gure, tokens in textual regions are \ufb01ltered using the API and then displayed.", + "text": "Fig.3: Layout detection and OCR results visualization generated by the LayoutParser APIs. Mode I directly overlays the layout region bounding boxes and categories over the original image. 
Mode II recreates the original document via drawing the OCR’d texts at their corresponding positions on the image canvas. In this ï¬gure, tokens in textual regions are ï¬ltered using the API and then displayed.", "metadata": { "filetype": "application/pdf", "languages": [ @@ -2401,7 +2401,7 @@ { "type": "NarrativeText", "element_id": "625c9e1d41a9740f094041595f79953d", - "text": "can also be highly sensitive and not sharable publicly. To overcome these chal- lenges, LayoutParser is built with rich features for e\ufb03cient data annotation and customized model training.", + "text": "can also be highly sensitive and not sharable publicly. To overcome these chal- lenges, LayoutParser is built with rich features for efficient data annotation and customized model training.", "metadata": { "filetype": "application/pdf", "languages": [ @@ -2423,7 +2423,7 @@ { "type": "NarrativeText", "element_id": "a3498730b5cd3fe9405fad69bcf37882", - "text": "LayoutParser incorporates a toolkit optimized for annotating document lay- outs using object-level active learning [32]. With the help from a layout detection model trained along with labeling, only the most important layout objects within each image, rather than the whole image, are required for labeling. The rest of the regions are automatically annotated with high con\ufb01dence predictions from the layout detection model. This allows a layout dataset to be created more e\ufb03ciently with only around 60% of the labeling budget.", + "text": "LayoutParser incorporates a toolkit optimized for annotating document lay- outs using object-level active learning [32]. With the help from a layout detection model trained along with labeling, only the most important layout objects within each image, rather than the whole image, are required for labeling. The rest of the regions are automatically annotated with high conï¬dence predictions from the layout detection model. 
This allows a layout dataset to be created more efficiently with only around 60% of the labeling budget.", "metadata": { "links": [ { @@ -2452,7 +2452,7 @@ { "type": "NarrativeText", "element_id": "c4ccf2cf2e7495668221cbe51534f90b", - "text": "After the training dataset is curated, LayoutParser supports di\ufb00erent modes for training the layout models. Fine-tuning can be used for training models on a small newly-labeled dataset by initializing the model with existing pre-trained weights. Training from scratch can be helpful when the source dataset and target are signi\ufb01cantly di\ufb00erent and a large training set is available. However, as suggested in Studer et al.\u2019s work[33], loading pre-trained weights on large-scale datasets like ImageNet [5], even from totally di\ufb00erent domains, can still boost model performance. Through the integrated API provided by LayoutParser, users can easily compare model performances on the benchmark datasets.", + "text": "After the training dataset is curated, LayoutParser supports different modes for training the layout models. Fine-tuning can be used for training models on a small newly-labeled dataset by initializing the model with existing pre-trained weights. Training from scratch can be helpful when the source dataset and target are signiï¬cantly different and a large training set is available. However, as suggested in Studer et al.’s work[33], loading pre-trained weights on large-scale datasets like ImageNet [5], even from totally different domains, can still boost model performance. 
Through the integrated API provided by LayoutParser, users can easily compare model performances on the benchmark datasets.", "metadata": { "links": [ { @@ -2669,7 +2669,7 @@ { "type": "ListItem", "element_id": "ab543398222da25b3a9231929162d3a0", - "text": "LayoutParser: A Uni\ufb01ed Toolkit for DL-Based DIA", + "text": "LayoutParser: A Uniï¬ed Toolkit for DL-Based DIA", "metadata": { "filetype": "application/pdf", "languages": [ @@ -2691,7 +2691,7 @@ { "type": "NarrativeText", "element_id": "4b9eddb71426681f2828832312457b67", - "text": "focuses on precision, e\ufb03ciency, and robustness. The target documents may have complicated structures, and may require training multiple layout detection models to achieve the optimal accuracy. Light-weight pipelines are built for relatively simple documents, with an emphasis on development ease, speed and \ufb02exibility. Ideally one only needs to use existing resources, and model training should be avoided. Through two exemplar projects, we show how practitioners in both academia and industry can easily build such pipelines using LayoutParser and extract high-quality structured document data for their downstream tasks. The source code for these projects will be publicly available in the LayoutParser community hub.", + "text": "focuses on precision, efficiency, and robustness. The target documents may have complicated structures, and may require training multiple layout detection models to achieve the optimal accuracy. Light-weight pipelines are built for relatively simple documents, with an emphasis on development ease, speed and flexibility. Ideally one only needs to use existing resources, and model training should be avoided. Through two exemplar projects, we show how practitioners in both academia and industry can easily build such pipelines using LayoutParser and extract high-quality structured document data for their downstream tasks. 
The source code for these projects will be publicly available in the LayoutParser community hub.", "metadata": { "filetype": "application/pdf", "languages": [ @@ -2757,7 +2757,7 @@ { "type": "NarrativeText", "element_id": "76dd07abeb9f4bbcb77152deb52c9dc0", - "text": "In this example, LayoutParser was used to develop a comprehensive pipeline, shown in Figure 5, to gener- ate high-quality structured data from historical Japanese \ufb01rm \ufb01nancial ta- bles with complicated layouts. The pipeline applies two layout models to identify di\ufb00erent levels of document structures and two customized OCR engines for optimized character recog- nition accuracy.", + "text": "In this example, LayoutParser was used to develop a comprehensive pipeline, shown in Figure 5, to gener- ate high-quality structured data from historical Japanese ï¬rm ï¬nancial ta- bles with complicated layouts. The pipeline applies two layout models to identify different levels of document structures and two customized OCR engines for optimized character recog- nition accuracy.", "metadata": { "links": [ { @@ -2786,7 +2786,7 @@ { "type": "NarrativeText", "element_id": "42551c9b40827dcdc52055b4d25c6fc3", - "text": "As shown in Figure 4 (a), the document contains columns of text written vertically 15, a common style in Japanese. Due to scanning noise and archaic printing technology, the columns can be skewed or have vari- able widths, and hence cannot be eas- ily identi\ufb01ed via rule-based methods. Within each column, words are sepa- rated by white spaces of variable size, and the vertical positions of objects can be an indicator of their layout type.", + "text": "As shown in Figure 4 (a), the document contains columns of text written vertically 15, a common style in Japanese. Due to scanning noise and archaic printing technology, the columns can be skewed or have vari- able widths, and hence cannot be eas- ily identiï¬ed via rule-based methods. 
Within each column, words are sepa- rated by white spaces of variable size, and the vertical positions of objects can be an indicator of their layout type.", "metadata": { "links": [ { @@ -2820,7 +2820,7 @@ { "type": "Image", "element_id": "f48a844114951222f6c96331efc683fb", - "text": "(spe peepee, \u2018Active Learning Layout Annotate Layout Dataset | + \u2018Annotation Toolkit \u00a5 a Deep Leaming Layout Model Training & Inference, \u00a5 ; Handy Data Structures & Post-processing El Apis for Layout Det a LAR ror tye eats) 4 Text Recognition | <\u2014\u2014 Default ane Customized \u00a5 ee Layout Structure Visualization & Export | <\u2014\u2014 | visualization & Storage The Japanese Document Helpful LayoutParser Digitization Pipeline Modules", + "text": "(spe peepee, ‘Active Learning Layout Annotate Layout Dataset | + ‘Annotation Toolkit Â¥ a Deep Leaming Layout Model Training & Inference, Â¥ ; Handy Data Structures & Post-processing El Apis for Layout Det a LAR ror tye eats) 4 Text Recognition | <—— Default ane Customized Â¥ ee Layout Structure Visualization & Export | <—— | visualization & Storage The Japanese Document Helpful LayoutParser Digitization Pipeline Modules", "metadata": { "filetype": "application/pdf", "languages": [ @@ -2930,7 +2930,7 @@ { "type": "NarrativeText", "element_id": "7e1f7b138c864ed8b40cf0f3d38801ec", - "text": "structure, two object detection models have been trained to recognize individual columns and tokens, respectively. A small training set (400 images with approxi- mately 100 annotations each) is curated via the active learning based annotation tool [32] in LayoutParser. The models learn to identify both the categories and regions for each token or column via their distinct visual features. The layout data structure enables easy grouping of the tokens within each column, and rearranging columns to achieve the correct reading orders based on the horizontal position. 
Errors are identi\ufb01ed and recti\ufb01ed via checking the consistency of the model predictions. Therefore, though trained on a small dataset, the pipeline achieves a high level of layout detection accuracy: it achieves a 96.97 AP [19] score across 5 categories for the column detection model, and a 89.23 AP across 4 categories for the token detection model.", + "text": "structure, two object detection models have been trained to recognize individual columns and tokens, respectively. A small training set (400 images with approxi- mately 100 annotations each) is curated via the active learning based annotation tool [32] in LayoutParser. The models learn to identify both the categories and regions for each token or column via their distinct visual features. The layout data structure enables easy grouping of the tokens within each column, and rearranging columns to achieve the correct reading orders based on the horizontal position. Errors are identiï¬ed and rectiï¬ed via checking the consistency of the model predictions. Therefore, though trained on a small dataset, the pipeline achieves a high level of layout detection accuracy: it achieves a 96.97 AP [19] score across 5 categories for the column detection model, and a 89.23 AP across 4 categories for the token detection model.", "metadata": { "links": [ { @@ -2964,7 +2964,7 @@ { "type": "NarrativeText", "element_id": "dccaa93e7bae24dedf523dd39575dfbe", - "text": "A combination of character recognition methods is developed to tackle the unique challenges in this document. In our experiments, we found that irregular spacing between the tokens led to a low character recognition recall rate, whereas existing OCR models tend to perform better on densely-arranged texts. To overcome this challenge, we create a document reorganization algorithm that rearranges the text based on the token bounding boxes detected in the layout analysis step. 
Figure 4 (b) illustrates the generated image of dense text, which is sent to the OCR APIs as a whole to reduce the transaction costs. The \ufb02exible coordinate system in LayoutParser is used to transform the OCR results relative to their original positions on the page.", + "text": "A combination of character recognition methods is developed to tackle the unique challenges in this document. In our experiments, we found that irregular spacing between the tokens led to a low character recognition recall rate, whereas existing OCR models tend to perform better on densely-arranged texts. To overcome this challenge, we create a document reorganization algorithm that rearranges the text based on the token bounding boxes detected in the layout analysis step. Figure 4 (b) illustrates the generated image of dense text, which is sent to the OCR APIs as a whole to reduce the transaction costs. The flexible coordinate system in LayoutParser is used to transform the OCR results relative to their original positions on the page.", "metadata": { "links": [ { @@ -2993,7 +2993,7 @@ { "type": "NarrativeText", "element_id": "60c2e2147d0b0dbd576d51b71a95a2ef", - "text": "Additionally, it is common for historical documents to use unique fonts with di\ufb00erent glyphs, which signi\ufb01cantly degrades the accuracy of OCR models trained on modern texts. In this document, a special \ufb02at font is used for printing numbers and could not be detected by o\ufb00-the-shelf OCR engines. Using the highly \ufb02exible functionalities from LayoutParser, a pipeline approach is constructed that achieves a high recognition accuracy with minimal e\ufb00ort. As the characters have unique visual structures and are usually clustered together, we train the layout model to identify number regions with a dedicated category. Subsequently, LayoutParser crops images within these regions, and identi\ufb01es characters within them using a self-trained OCR model based on a CNN-RNN [6]. 
The model detects a total of 15 possible categories, and achieves a 0.98 Jaccard score16 and a 0.17 average Levinstein distances17 for token prediction on the test set.", + "text": "Additionally, it is common for historical documents to use unique fonts with different glyphs, which signiï¬cantly degrades the accuracy of OCR models trained on modern texts. In this document, a special flat font is used for printing numbers and could not be detected by off-the-shelf OCR engines. Using the highly flexible functionalities from LayoutParser, a pipeline approach is constructed that achieves a high recognition accuracy with minimal effort. As the characters have unique visual structures and are usually clustered together, we train the layout model to identify number regions with a dedicated category. Subsequently, LayoutParser crops images within these regions, and identiï¬es characters within them using a self-trained OCR model based on a CNN-RNN [6]. The model detects a total of 15 possible categories, and achieves a 0.98 Jaccard score16 and a 0.17 average Levinstein distances17 for token prediction on the test set.", "metadata": { "links": [ { @@ -3032,7 +3032,7 @@ { "type": "NarrativeText", "element_id": "de9e855638523c5f77ed4070813e37a3", - "text": "Overall, it is possible to create an intricate and highly accurate digitization pipeline for large-scale digitization using LayoutParser. The pipeline avoids specifying the complicated rules used in traditional methods, is straightforward to develop, and is robust to outliers. The DL models also generate \ufb01ne-grained results that enable creative approaches like page reorganization for OCR.", + "text": "Overall, it is possible to create an intricate and highly accurate digitization pipeline for large-scale digitization using LayoutParser. The pipeline avoids specifying the complicated rules used in traditional methods, is straightforward to develop, and is robust to outliers. 
The DL models also generate ï¬ne-grained results that enable creative approaches like page reorganization for OCR.", "metadata": { "filetype": "application/pdf", "languages": [ @@ -3098,7 +3098,7 @@ { "type": "ListItem", "element_id": "2b7101f39954d5301166b82906202ea9", - "text": "LayoutParser: A Uni\ufb01ed Toolkit for DL-Based DIA", + "text": "LayoutParser: A Uniï¬ed Toolkit for DL-Based DIA", "metadata": { "filetype": "application/pdf", "languages": [ @@ -3142,7 +3142,7 @@ { "type": "FigureCaption", "element_id": "d35d253341e8b8d837f384ecd6ac410a", - "text": "Fig.6: This lightweight table detector can identify tables (outlined in red) and cells (shaded in blue) in di\ufb00erent locations on a page. In very few cases (d), it might generate minor error predictions, e.g, failing to capture the top text line of a table.", + "text": "Fig.6: This lightweight table detector can identify tables (outlined in red) and cells (shaded in blue) in different locations on a page. In very few cases (d), it might generate minor error predictions, e.g, failing to capture the top text line of a table.", "metadata": { "filetype": "application/pdf", "languages": [ @@ -3186,7 +3186,7 @@ { "type": "NarrativeText", "element_id": "445ad333fa3f7f85d2be634fbdeeb72a", - "text": "Detecting tables and parsing their structures (table extraction) are of central im- portance for many document digitization tasks. Many previous works [26, 30, 27] and tools 18 have been developed to identify and parse table structures. Yet they might require training complicated models from scratch, or are only applicable for born-digital PDF documents. In this section, we show how LayoutParser can help build a light-weight accurate visual table extractor for legal docket tables using the existing resources with minimal e\ufb00ort.", + "text": "Detecting tables and parsing their structures (table extraction) are of central im- portance for many document digitization tasks. 
Many previous works [26, 30, 27] and tools 18 have been developed to identify and parse table structures. Yet they might require training complicated models from scratch, or are only applicable for born-digital PDF documents. In this section, we show how LayoutParser can help build a light-weight accurate visual table extractor for legal docket tables using the existing resources with minimal effort.", "metadata": { "links": [ { @@ -3230,7 +3230,7 @@ { "type": "NarrativeText", "element_id": "923b62eb8550ec49cf6d3f2e6bac7ec8", - "text": "The extractor uses a pre-trained layout detection model for identifying the table regions and some simple rules for pairing the rows and the columns in the PDF image. Mask R-CNN [12] trained on the PubLayNet dataset [38] from the LayoutParser Model Zoo can be used for detecting table regions. By \ufb01ltering out model predictions of low con\ufb01dence and removing overlapping predictions, LayoutParser can identify the tabular regions on each page, which signi\ufb01cantly simpli\ufb01es the subsequent steps. By applying the line detection functions within the tabular segments, provided in the utility module from LayoutParser, the pipeline can identify the three distinct columns in the tables. A row clustering method is then applied via analyzing the y coordinates of token bounding boxes in the left-most column, which are obtained from the OCR engines. A non-maximal suppression algorithm is used to remove duplicated rows with extremely small gaps. Shown in Figure 6, the built pipeline can detect tables at di\ufb00erent positions on a page accurately. Continued tables from di\ufb00erent pages are concatenated, and a structured table representation has been easily created.", + "text": "The extractor uses a pre-trained layout detection model for identifying the table regions and some simple rules for pairing the rows and the columns in the PDF image. 
Mask R-CNN [12] trained on the PubLayNet dataset [38] from the LayoutParser Model Zoo can be used for detecting table regions. By ï¬ltering out model predictions of low conï¬dence and removing overlapping predictions, LayoutParser can identify the tabular regions on each page, which signiï¬cantly simpliï¬es the subsequent steps. By applying the line detection functions within the tabular segments, provided in the utility module from LayoutParser, the pipeline can identify the three distinct columns in the tables. A row clustering method is then applied via analyzing the y coordinates of token bounding boxes in the left-most column, which are obtained from the OCR engines. A non-maximal suppression algorithm is used to remove duplicated rows with extremely small gaps. Shown in Figure 6, the built pipeline can detect tables at different positions on a page accurately. Continued tables from different pages are concatenated, and a structured table representation has been easily created.", "metadata": { "links": [ { @@ -3335,7 +3335,7 @@ { "type": "NarrativeText", "element_id": "e79cef57c86050aa5fc74e5cd3923197", - "text": "LayoutParser provides a comprehensive toolkit for deep learning-based document image analysis. The o\ufb00-the-shelf library is easy to install, and can be used to build \ufb02exible and accurate pipelines for processing documents with complicated structures. It also supports high-level customization and enables easy labeling and training of DL models on unique document image datasets. The LayoutParser community platform facilitates sharing DL models and DIA pipelines, inviting discussion and promoting code reproducibility and reusability. 
The LayoutParser team is committed to keeping the library updated continuously and bringing the most recent advances in DL-based DIA, such as multi-modal document modeling [37, 36, 9] (an upcoming priority), to a diverse audience of end-users.", + "text": "LayoutParser provides a comprehensive toolkit for deep learning-based document image analysis. The off-the-shelf library is easy to install, and can be used to build flexible and accurate pipelines for processing documents with complicated structures. It also supports high-level customization and enables easy labeling and training of DL models on unique document image datasets. The LayoutParser community platform facilitates sharing DL models and DIA pipelines, inviting discussion and promoting code reproducibility and reusability. The LayoutParser team is committed to keeping the library updated continuously and bringing the most recent advances in DL-based DIA, such as multi-modal document modeling [37, 36, 9] (an upcoming priority), to a diverse audience of end-users.", "metadata": { "links": [ { @@ -3418,7 +3418,7 @@ { "type": "ListItem", "element_id": "85e09a5617e58a3a78b22fd12eb29eaf", - "text": "[1] Abadi, M., Agarwal, A., Barham, P., Brevdo, E., Chen, Z., Citro, C., Corrado, G.S., Davis, A., Dean, J., Devin, M., Ghemawat, S., Goodfellow, I., Harp, A., Irving, G., Isard, M., Jia, Y., Jozefowicz, R., Kaiser, L., Kudlur, M., Levenberg, J., Man\u00b4e, D., Monga, R., Moore, S., Murray, D., Olah, C., Schuster, M., Shlens, J., Steiner, B., Sutskever, I., Talwar, K., Tucker, P., Vanhoucke, V., Vasudevan, V., Vi\u00b4egas, F., Vinyals, O., Warden, P., Wattenberg, M., Wicke, M., Yu, Y., Zheng, X.: TensorFlow: Large-scale machine learning on heterogeneous systems (2015), https://www.tensorflow.org/, software available from tensor\ufb02ow.org", + "text": "[1] Abadi, M., Agarwal, A., Barham, P., Brevdo, E., Chen, Z., Citro, C., Corrado, G.S., Davis, A., Dean, J., Devin, M., Ghemawat, S., Goodfellow, I., Harp, A., 
Irving, G., Isard, M., Jia, Y., Jozefowicz, R., Kaiser, L., Kudlur, M., Levenberg, J., Man´e, D., Monga, R., Moore, S., Murray, D., Olah, C., Schuster, M., Shlens, J., Steiner, B., Sutskever, I., Talwar, K., Tucker, P., Vanhoucke, V., Vasudevan, V., Vi´egas, F., Vinyals, O., Warden, P., Wattenberg, M., Wicke, M., Yu, Y., Zheng, X.: TensorFlow: Large-scale machine learning on heterogeneous systems (2015), https://www.tensorflow.org/, software available from tensorflow.org", "metadata": { "links": [ { @@ -3447,7 +3447,7 @@ { "type": "ListItem", "element_id": "ad466edc2a12c9be4bf951fd8b5bf818", - "text": "[2] Alberti, M., Pondenkandath, V., W\u00a8ursch, M., Ingold, R., Liwicki, M.: Deepdiva: a highly-functional python framework for reproducible experiments. In: 2018 16th International Conference on Frontiers in Handwriting Recognition (ICFHR). pp. 423\u2013428. IEEE (2018)", + "text": "[2] Alberti, M., Pondenkandath, V., W¨ursch, M., Ingold, R., Liwicki, M.: Deepdiva: a highly-functional python framework for reproducible experiments. In: 2018 16th International Conference on Frontiers in Handwriting Recognition (ICFHR). pp. 423–428. IEEE (2018)", "metadata": { "filetype": "application/pdf", "languages": [ @@ -3469,7 +3469,7 @@ { "type": "ListItem", "element_id": "217777f3d44620afddc1e27553e81a66", - "text": "[3] Antonacopoulos, A., Bridson, D., Papadopoulos, C., Pletschacher, S.: A realistic dataset for performance evaluation of document layout analysis. In: 2009 10th International Conference on Document Analysis and Recognition. pp. 296\u2013300. IEEE (2009)", + "text": "[3] Antonacopoulos, A., Bridson, D., Papadopoulos, C., Pletschacher, S.: A realistic dataset for performance evaluation of document layout analysis. In: 2009 10th International Conference on Document Analysis and Recognition. pp. 296–300. 
IEEE (2009)", "metadata": { "filetype": "application/pdf", "languages": [ @@ -3491,7 +3491,7 @@ { "type": "ListItem", "element_id": "292dd088dc6a174159395e31be7755d7", - "text": "[4] Baek, Y., Lee, B., Han, D., Yun, S., Lee, H.: Character region awareness for text detection. In: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition. pp. 9365\u20139374 (2019)", + "text": "[4] Baek, Y., Lee, B., Han, D., Yun, S., Lee, H.: Character region awareness for text detection. In: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition. pp. 9365–9374 (2019)", "metadata": { "filetype": "application/pdf", "languages": [ @@ -3535,7 +3535,7 @@ { "type": "ListItem", "element_id": "4e93c51c89970349aa9e0a42cb330c4b", - "text": "[6] Deng, Y., Kanervisto, A., Ling, J., Rush, A.M.: Image-to-markup generation with coarse-to-\ufb01ne attention. In: International Conference on Machine Learning. pp. 980\u2013989. PMLR (2017)", + "text": "[6] Deng, Y., Kanervisto, A., Ling, J., Rush, A.M.: Image-to-markup generation with coarse-to-ï¬ne attention. In: International Conference on Machine Learning. pp. 980–989. PMLR (2017)", "metadata": { "filetype": "application/pdf", "languages": [ @@ -3557,7 +3557,7 @@ { "type": "ListItem", "element_id": "8cfd166d282469f765423faae44271e2", - "text": "[7] Ganin, Y., Lempitsky, V.: Unsupervised domain adaptation by backpropagation. In: International conference on machine learning. pp. 1180\u20131189. PMLR (2015)", + "text": "[7] Ganin, Y., Lempitsky, V.: Unsupervised domain adaptation by backpropagation. In: International conference on machine learning. pp. 1180–1189. 
PMLR (2015)", "metadata": { "filetype": "application/pdf", "languages": [ @@ -3579,7 +3579,7 @@ { "type": "ListItem", "element_id": "8bce49aab693aad97676011688f3f6f3", - "text": "LayoutParser: A Uni\ufb01ed Toolkit for DL-Based DIA", + "text": "LayoutParser: A Uniï¬ed Toolkit for DL-Based DIA", "metadata": { "filetype": "application/pdf", "languages": [ @@ -3645,7 +3645,7 @@ { "type": "ListItem", "element_id": "95bc71fb3542f420dfa50e22eb8c734f", - "text": "[10] Graves, A., Fern\u00b4andez, S., Gomez, F., Schmidhuber, J.: Connectionist temporal classi\ufb01cation: labelling unsegmented sequence data with recurrent neural networks. In: Proceedings of the 23rd international conference on Machine learning. pp. 369\u2013376 (2006)", + "text": "[10] Graves, A., Fern´andez, S., Gomez, F., Schmidhuber, J.: Connectionist temporal classiï¬cation: labelling unsegmented sequence data with recurrent neural networks. In: Proceedings of the 23rd international conference on Machine learning. pp. 369–376 (2006)", "metadata": { "filetype": "application/pdf", "languages": [ @@ -3667,7 +3667,7 @@ { "type": "ListItem", "element_id": "3fab75481d8e6d389ea6034e18f54e00", - "text": "[11] Harley, A.W., Ufkes, A., Derpanis, K.G.: Evaluation of deep convolutional nets for document image classi\ufb01cation and retrieval. In: 2015 13th International Conference on Document Analysis and Recognition (ICDAR). pp. 991\u2013995. IEEE (2015)", + "text": "[11] Harley, A.W., Ufkes, A., Derpanis, K.G.: Evaluation of deep convolutional nets for document image classiï¬cation and retrieval. In: 2015 13th International Conference on Document Analysis and Recognition (ICDAR). pp. 991–995. IEEE (2015)", "metadata": { "filetype": "application/pdf", "languages": [ @@ -3689,7 +3689,7 @@ { "type": "ListItem", "element_id": "8cd8821b71e4bda1a77f6a114ff54f50", - "text": "[12] He, K., Gkioxari, G., Doll\u00b4ar, P., Girshick, R.: Mask r-cnn. In: Proceedings of the IEEE international conference on computer vision. pp. 
2961\u20132969 (2017)", + "text": "[12] He, K., Gkioxari, G., Doll´ar, P., Girshick, R.: Mask r-cnn. In: Proceedings of the IEEE international conference on computer vision. pp. 2961–2969 (2017)", "metadata": { "filetype": "application/pdf", "languages": [ @@ -3711,7 +3711,7 @@ { "type": "ListItem", "element_id": "02c0a0c6c60503798f3894fe244c237d", - "text": "[13] He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: Proceedings of the IEEE conference on computer vision and pattern recognition. pp. 770\u2013778 (2016)", + "text": "[13] He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: Proceedings of the IEEE conference on computer vision and pattern recognition. pp. 770–778 (2016)", "metadata": { "filetype": "application/pdf", "languages": [ @@ -3755,7 +3755,7 @@ { "type": "ListItem", "element_id": "bd2e9f3795d8492cadde716193f62aba", - "text": "[15] Lamiroy, B., Lopresti, D.: An open architecture for end-to-end document analysis benchmarking. In: 2011 International Conference on Document Analysis and Recognition. pp. 42\u201347. IEEE (2011)", + "text": "[15] Lamiroy, B., Lopresti, D.: An open architecture for end-to-end document analysis benchmarking. In: 2011 International Conference on Document Analysis and Recognition. pp. 42–47. IEEE (2011)", "metadata": { "filetype": "application/pdf", "languages": [ @@ -3777,7 +3777,7 @@ { "type": "ListItem", "element_id": "07cef8a161dd1c3f0895c605844d678e", - "text": "[16] Lee, B.C., Weld, D.S.: Newspaper navigator: Open faceted search for 1.5 million images. In: Adjunct Publication of the 33rd Annual ACM Sym- posium on User Interface Software and Technology. p. 120\u2013122. UIST \u201920 Adjunct, Association for Computing Machinery, New York, NY, USA (2020). https://doi.org/10.1145/3379350.3416143, https://doi-org.offcampus. 
lib.washington.edu/10.1145/3379350.3416143", + "text": "[16] Lee, B.C., Weld, D.S.: Newspaper navigator: Open faceted search for 1.5 million images. In: Adjunct Publication of the 33rd Annual ACM Sym- posium on User Interface Software and Technology. p. 120–122. UIST ’20 Adjunct, Association for Computing Machinery, New York, NY, USA (2020). https://doi.org/10.1145/3379350.3416143, https://doi-org.offcampus. lib.washington.edu/10.1145/3379350.3416143", "metadata": { "links": [ { @@ -3816,7 +3816,7 @@ { "type": "ListItem", "element_id": "90ad04faa055039bfd37c1a851878048", - "text": "[17] Lee, B.C.G., Mears, J., Jakeway, E., Ferriter, M., Adams, C., Yarasavage, N., Thomas, D., Zwaard, K., Weld, D.S.: The Newspaper Navigator Dataset: Extracting Headlines and Visual Content from 16 Million Historic Newspaper Pages in Chronicling America, p. 3055\u20133062. Association for Computing Machinery, New York, NY, USA (2020), https://doi.org/10.1145/3340531.3412767", + "text": "[17] Lee, B.C.G., Mears, J., Jakeway, E., Ferriter, M., Adams, C., Yarasavage, N., Thomas, D., Zwaard, K., Weld, D.S.: The Newspaper Navigator Dataset: Extracting Headlines and Visual Content from 16 Million Historic Newspaper Pages in Chronicling America, p. 3055–3062. Association for Computing Machinery, New York, NY, USA (2020), https://doi.org/10.1145/3340531.3412767", "metadata": { "filetype": "application/pdf", "languages": [ @@ -3860,7 +3860,7 @@ { "type": "ListItem", "element_id": "b5e16aae3d43919bb5899fade72c0550", - "text": "[19] Lin, T.Y., Maire, M., Belongie, S., Hays, J., Perona, P., Ramanan, D., Doll\u00b4ar, P., Zitnick, C.L.: Microsoft coco: Common objects in context. In: European conference on computer vision. pp. 740\u2013755. Springer (2014)", + "text": "[19] Lin, T.Y., Maire, M., Belongie, S., Hays, J., Perona, P., Ramanan, D., Doll´ar, P., Zitnick, C.L.: Microsoft coco: Common objects in context. In: European conference on computer vision. pp. 740–755. 
Springer (2014)", "metadata": { "filetype": "application/pdf", "languages": [ @@ -3882,7 +3882,7 @@ { "type": "ListItem", "element_id": "8344e54a6acb25643c83b5ea96c5c593", - "text": "[20] Long, J., Shelhamer, E., Darrell, T.: Fully convolutional networks for semantic segmentation. In: Proceedings of the IEEE conference on computer vision and pattern recognition. pp. 3431\u20133440 (2015)", + "text": "[20] Long, J., Shelhamer, E., Darrell, T.: Fully convolutional networks for semantic segmentation. In: Proceedings of the IEEE conference on computer vision and pattern recognition. pp. 3431–3440 (2015)", "metadata": { "filetype": "application/pdf", "languages": [ @@ -3904,7 +3904,7 @@ { "type": "ListItem", "element_id": "9476b030857c32e55a638928df6d01e8", - "text": "[21] Neudecker, C., Schlarb, S., Dogan, Z.M., Missier, P., Su\ufb01, S., Williams, A., Wolsten- croft, K.: An experimental work\ufb02ow development platform for historical document digitisation and analysis. In: Proceedings of the 2011 workshop on historical document imaging and processing. pp. 161\u2013168 (2011)", + "text": "[21] Neudecker, C., Schlarb, S., Dogan, Z.M., Missier, P., Suï¬, S., Williams, A., Wolsten- croft, K.: An experimental workflow development platform for historical document digitisation and analysis. In: Proceedings of the 2011 workshop on historical document imaging and processing. pp. 161–168 (2011)", "metadata": { "filetype": "application/pdf", "languages": [ @@ -3926,7 +3926,7 @@ { "type": "ListItem", "element_id": "4640c3f33351b994165071b6d872ef56", - "text": "[22] Oliveira, S.A., Seguin, B., Kaplan, F.: dhsegment: A generic deep-learning approach for document segmentation. In: 2018 16th International Conference on Frontiers in Handwriting Recognition (ICFHR). pp. 7\u201312. IEEE (2018)", + "text": "[22] Oliveira, S.A., Seguin, B., Kaplan, F.: dhsegment: A generic deep-learning approach for document segmentation. 
In: 2018 16th International Conference on Frontiers in Handwriting Recognition (ICFHR). pp. 7–12. IEEE (2018)", "metadata": { "filetype": "application/pdf", "languages": [ @@ -3970,7 +3970,7 @@ { "type": "ListItem", "element_id": "048415c6e5fc7bdd5466bf9c877b4a14", - "text": "[23] Paszke, A., Gross, S., Chintala, S., Chanan, G., Yang, E., DeVito, Z., Lin, Z., Desmaison, A., Antiga, L., Lerer, A.: Automatic di\ufb00erentiation in pytorch (2017)", + "text": "[23] Paszke, A., Gross, S., Chintala, S., Chanan, G., Yang, E., DeVito, Z., Lin, Z., Desmaison, A., Antiga, L., Lerer, A.: Automatic differentiation in pytorch (2017)", "metadata": { "filetype": "application/pdf", "languages": [ @@ -4014,7 +4014,7 @@ { "type": "ListItem", "element_id": "a2f34eceb4f6036f105c6319de5450d1", - "text": "[25] Pletschacher, S., Antonacopoulos, A.: The page (page analysis and ground-truth elements) format framework. In: 2010 20th International Conference on Pattern Recognition. pp. 257\u2013260. IEEE (2010)", + "text": "[25] Pletschacher, S., Antonacopoulos, A.: The page (page analysis and ground-truth elements) format framework. In: 2010 20th International Conference on Pattern Recognition. pp. 257–260. IEEE (2010)", "metadata": { "filetype": "application/pdf", "languages": [ @@ -4036,7 +4036,7 @@ { "type": "ListItem", "element_id": "c81432ac5c76b82c1ccd93d0a3ee15b1", - "text": "[26] Prasad, D., Gadpal, A., Kapadni, K., Visave, M., Sultanpure, K.: Cascadetabnet: An approach for end to end table detection and structure recognition from image- based documents. In: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition Workshops. pp. 572\u2013573 (2020)", + "text": "[26] Prasad, D., Gadpal, A., Kapadni, K., Visave, M., Sultanpure, K.: Cascadetabnet: An approach for end to end table detection and structure recognition from image- based documents. In: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition Workshops. pp. 
572–573 (2020)", "metadata": { "filetype": "application/pdf", "languages": [ @@ -4058,7 +4058,7 @@ { "type": "ListItem", "element_id": "0f5cebf6a7661981062a59f24e0b2a3a", - "text": "[27] Qasim, S.R., Mahmood, H., Shafait, F.: Rethinking table recognition using graph neural networks. In: 2019 International Conference on Document Analysis and Recognition (ICDAR). pp. 142\u2013147. IEEE (2019)", + "text": "[27] Qasim, S.R., Mahmood, H., Shafait, F.: Rethinking table recognition using graph neural networks. In: 2019 International Conference on Document Analysis and Recognition (ICDAR). pp. 142–147. IEEE (2019)", "metadata": { "filetype": "application/pdf", "languages": [ @@ -4080,7 +4080,7 @@ { "type": "ListItem", "element_id": "d02327f415141694d5853b57ac0f9e3f", - "text": "[28] Ren, S., He, K., Girshick, R., Sun, J.: Faster r-cnn: Towards real-time object detection with region proposal networks. In: Advances in neural information processing systems. pp. 91\u201399 (2015)", + "text": "[28] Ren, S., He, K., Girshick, R., Sun, J.: Faster r-cnn: Towards real-time object detection with region proposal networks. In: Advances in neural information processing systems. pp. 91–99 (2015)", "metadata": { "filetype": "application/pdf", "languages": [ @@ -4102,7 +4102,7 @@ { "type": "ListItem", "element_id": "d0529ef231eeac2e8ae2083dee416210", - "text": "[29] Scarselli, F., Gori, M., Tsoi, A.C., Hagenbuchner, M., Monfardini, G.: The graph neural network model. IEEE transactions on neural networks 20(1), 61\u201380 (2008)", + "text": "[29] Scarselli, F., Gori, M., Tsoi, A.C., Hagenbuchner, M., Monfardini, G.: The graph neural network model. 
IEEE transactions on neural networks 20(1), 61–80 (2008)", "metadata": { "filetype": "application/pdf", "languages": [ @@ -4124,7 +4124,7 @@ { "type": "ListItem", "element_id": "98fce7a2720ed7eda87a02659071b121", - "text": "[30] Schreiber, S., Agne, S., Wolf, I., Dengel, A., Ahmed, S.: Deepdesrt: Deep learning for detection and structure recognition of tables in document images. In: 2017 14th IAPR international conference on document analysis and recognition (ICDAR). vol. 1, pp. 1162\u20131167. IEEE (2017)", + "text": "[30] Schreiber, S., Agne, S., Wolf, I., Dengel, A., Ahmed, S.: Deepdesrt: Deep learning for detection and structure recognition of tables in document images. In: 2017 14th IAPR international conference on document analysis and recognition (ICDAR). vol. 1, pp. 1162–1167. IEEE (2017)", "metadata": { "filetype": "application/pdf", "languages": [ @@ -4146,7 +4146,7 @@ { "type": "ListItem", "element_id": "e3146a202c282ecab0d87f59d3307983", - "text": "[31] Shen, Z., Zhang, K., Dell, M.: A large dataset of historical japanese documents with complex layouts. In: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition Workshops. pp. 548\u2013549 (2020)", + "text": "[31] Shen, Z., Zhang, K., Dell, M.: A large dataset of historical japanese documents with complex layouts. In: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition Workshops. pp. 548–549 (2020)", "metadata": { "filetype": "application/pdf", "languages": [ @@ -4190,7 +4190,7 @@ { "type": "ListItem", "element_id": "7937fc115bcbbc8c08640587fa5ed827", - "text": "[33] Studer, L., Alberti, M., Pondenkandath, V., Goktepe, P., Kolonko, T., Fischer, A., Liwicki, M., Ingold, R.: A comprehensive study of imagenet pre-training for historical document image analysis. In: 2019 International Conference on Document Analysis and Recognition (ICDAR). pp. 720\u2013725. 
IEEE (2019)", + "text": "[33] Studer, L., Alberti, M., Pondenkandath, V., Goktepe, P., Kolonko, T., Fischer, A., Liwicki, M., Ingold, R.: A comprehensive study of imagenet pre-training for historical document image analysis. In: 2019 International Conference on Document Analysis and Recognition (ICDAR). pp. 720–725. IEEE (2019)", "metadata": { "filetype": "application/pdf", "languages": [ @@ -4212,7 +4212,7 @@ { "type": "ListItem", "element_id": "881f67b82dccc13eaf96e912750c0318", - "text": "[34] Wolf, T., Debut, L., Sanh, V., Chaumond, J., Delangue, C., Moi, A., Cistac, P., Rault, T., Louf, R., Funtowicz, M., et al.: Huggingface\u2019s transformers: State-of- the-art natural language processing. arXiv preprint arXiv:1910.03771 (2019)", + "text": "[34] Wolf, T., Debut, L., Sanh, V., Chaumond, J., Delangue, C., Moi, A., Cistac, P., Rault, T., Louf, R., Funtowicz, M., et al.: Huggingface’s transformers: State-of- the-art natural language processing. arXiv preprint arXiv:1910.03771 (2019)", "metadata": { "filetype": "application/pdf", "languages": [ @@ -4300,7 +4300,7 @@ { "type": "ListItem", "element_id": "3ac304a6df305ec0a0bb9079795b6c2e", - "text": "[38] Zhong, X., Tang, J., Yepes, A.J.: Publaynet: largest dataset ever for doc- ument layout analysis. In: 2019 International Conference on Document Analysis and Recognition (ICDAR). pp. 1015\u20131022. IEEE (Sep 2019). https://doi.org/10.1109/ICDAR.2019.00166", + "text": "[38] Zhong, X., Tang, J., Yepes, A.J.: Publaynet: largest dataset ever for doc- ument layout analysis. In: 2019 International Conference on Document Analysis and Recognition (ICDAR). pp. 1015–1022. IEEE (Sep 2019). 
https://doi.org/10.1109/ICDAR.2019.00166", "metadata": { "filetype": "application/pdf", "languages": [ diff --git a/test_unstructured_ingest/expected-structured-output/local-single-file/UDHR_first_article_all.txt.json b/test_unstructured_ingest/expected-structured-output/local-single-file/UDHR_first_article_all.txt.json index 6ef9f4eb4b..df9b68e769 100644 --- a/test_unstructured_ingest/expected-structured-output/local-single-file/UDHR_first_article_all.txt.json +++ b/test_unstructured_ingest/expected-structured-output/local-single-file/UDHR_first_article_all.txt.json @@ -23,7 +23,7 @@ { "type": "UncategorizedText", "element_id": "f84bbc479d5bebf6b98c016e14d666d1", - "text": "\u00a9 1996 \u2013 2009 The Office of the High Commissioner for Human Rights", + "text": "© 1996 – 2009 The Office of the High Commissioner for Human Rights", "metadata": { "languages": [ "eng" @@ -108,7 +108,7 @@ { "type": "Title", "element_id": "84ce1bd66b09ce990ee385a04144822e", - "text": "\u662f\u4eba\u90fd\u751f\u800c\u81ea\u7531\uff0c\u5728\u5c0a\u4e25\u548c\u6743\u5229\u4e0a\u4e00\u5f8b\u5e73\u7b49\u3002\u4ed6\u4eec\u8d4b\u6709\u7406\u6027\u548c\u826f\u5fc3\uff0c\u5e76\u5e94\u4ee5\u5f1f\u5144\u5173\u7cfb\u7684\u7cbe\u795e\u76f8\u5bf9\u5f85\u3002", + "text": "是人都生而自由,在å°ä¸¥å’Œæƒåˆ©ä¸ä¸€å¾‹å¹³ç­‰ă€‚ä»–ä»¬èµ‹æœ‰ç†æ€§å’Œè‰¯å¿ƒï¼Œå¹¶åº”以弟兄关系ç„ç²¾ç¥ç›¸å¯¹å¾…。", "metadata": { "languages": [ "zho" @@ -266,7 +266,7 @@ { "type": "NarrativeText", "element_id": "dfabb35b82a82e16d7cb50d4de138e6f", - "text": "(Yeonbyeon) \uc0ac\ub78c\ub4e4\uc774 \uc774 \uc138\uacc4\ub85c \uc624\ub2e4\uac00 \ubaa8\ub450 \uc790\uc720\ud558\uace0, \uc874\uc5c4\uacfc \uad8c\ub9ac\uc774 \ud3c9\ub3d9\uc73c\ub85c \uc788\ub294\ub2e4, \uadf8\ub4e4 \ub9ac\uc131\uacfc \uc591\uc2ec\uc774 \uc788\ub208\uace0, \ud615\uc81c\uc758 \uc815\uc2e0\uc73c\ub85c \uc0c1\ud638\ub85c \uce58\ub8cc\ud558 \uc18c.", + "text": "(Yeonbyeon) ́‚¬ëŒë“¤́´ ́´ ́„¸ê³„로 ́˜¤ë‹¤ê°€ ëª¨ë‘ ́́œ í•˜ê³ , ́¡´́—„ê³¼ 권리́´ í‰ë™́œ¼ë¡œ ́ˆë”다, 그들 리́„±ê³¼ ́–‘́‹¬́´ ́ˆëˆˆê³ 
, 형́ œ́˜ ́ •́‹ ́œ¼ë¡œ ́ƒí˜¸ë¡œ ́¹˜ë£Œí•˜ ́†Œ.", "metadata": { "languages": [ "kor" @@ -287,7 +287,7 @@ { "type": "NarrativeText", "element_id": "1f41b7646ca8aebc36e8f5ec392481fb", - "text": "Abkhaz \u0414\u0430\u0440\u0431\u0430\u043d\u0437\u0430\u0430\u043b\u0430\u043a \u0430\u0443\u0430\u04a9\u044b \u0434\u0448\u043e\u0443\u043f \u0438\u0445\u044b \u0434\u0430\u049b\u04d9\u0438\u04ad\u043d\u044b. \u0410\u0443\u0430\u0430 \u0437\u0435\u0433\u044c \u0437\u0438\u043d\u043b\u0435\u0438 \u043f\u0430\u0442\u0443\u043b\u0435\u0438 \u0435\u0438\u049f\u0430\u0440\u043e\u0443\u043f. \u0423\u0440\u04ad \u0438\u0440\u044b\u043c\u043e\u0443\u043f \u0430\u0445\u0448\u044b\u04a9\u0438 \u0430\u043b\u0430\u043c\u044b\u0441\u0438, \u0434\u0430\u0440\u0430 \u0434\u0430\u0440\u0430\u0433\u044c \u0430\u0435\u0448\u044c\u0435\u0438 \u0430\u0435\u0448\u044c\u0435\u0438 \u0440\u0435\u0438\u04a7\u0448 \u0435\u0438\u0437\u044b\u049f\u0430\u0437\u0430\u0440\u043e\u0443\u043f.", + "text": "Abkhaz Đ”Đ°Ñ€Đ±Đ°Đ½Đ·Đ°Đ°Đ»Đ°Đº Đ°ÑƒĐ°̉©Ñ‹ Đ´ÑˆĐ¾ÑƒĐ¿ Đ¸Ñ…Ñ‹ да̉›Ó™Đ¸̉­Đ½Ñ‹. ĐÑƒĐ°Đ° Đ·ĐµĐ³ÑŒ Đ·Đ¸Đ½Đ»ĐµĐ¸ Đ¿Đ°Ñ‚ÑƒĐ»ĐµĐ¸ еи̉ŸĐ°Ñ€Đ¾ÑƒĐ¿. Đ£Ñ€̉­ Đ¸Ñ€Ñ‹Đ¼Đ¾ÑƒĐ¿ Đ°Ñ…ÑˆÑ‹̉©Đ¸ Đ°Đ»Đ°Đ¼Ñ‹Ñи, Đ´Đ°Ñ€Đ° Đ´Đ°Ñ€Đ°Đ³ÑŒ Đ°ĐµÑˆÑŒĐµĐ¸ Đ°ĐµÑˆÑŒĐµĐ¸ Ñ€ĐµĐ¸̉§Ñˆ ĐµĐ¸Đ·Ñ‹̉ŸĐ°Đ·Đ°Ñ€Đ¾ÑƒĐ¿.", "metadata": { "languages": [ "rus" @@ -372,7 +372,7 @@ { "type": "NarrativeText", "element_id": "7691e5f9dd37d6bc38044534196c1e9f", - "text": "Adyghe \u0426\u04cf\u044b\u0444 \u043f\u0441\u0442\u044d\u0443\u0440\u0438 \u0448\u044a\u0445\u044c\u044d\u0444\u0438\u0442\u044d\u0443, \u044f\u043b\u044a\u044b\u0442\u044d\u043d\u044b\u0433\u044a\u044d\u0440\u044d \u044f\u0444\u044d\u0448\u044a\u0443\u0430\u0448\u044d\u0445\u044d\u043c\u0440\u044d\u043a\u04cf\u044d \u0437\u044d\u0444\u044d\u0434\u044d\u0443 \u043a\u044a\u0430\u043b\u044a\u0444\u044b. 
\u0410\u043a\u044a\u044b\u043b\u0440\u044d \u0437\u044d\u0445\u044d\u0448\u04cf\u044b\u043a\u04cf \u0433\u044a\u0443\u0430\u0437\u044d\u0440\u044d \u044f\u04cf\u044d\u0448\u044a\u044b, \u0437\u044b\u0440 \u0437\u044b\u043c \u0437\u044d\u043a\u044a\u043e\u0448 \u0437\u044d\u0445\u0430\u0448\u0406\u044d \u0430\u0437\u0444\u0430\u0433\u0443 \u0434\u044d\u043b\u044a\u044d\u0443 \u0437\u044d\u0444\u044b\u0449\u044b\u0442\u044b\u043d\u0445\u044d \u0444\u0430\u0435.", + "text": "Adyghe ЦÓыф Đ¿ÑÑ‚ÑÑƒÑ€Đ¸ шÑÑ…ÑŒÑÑ„Đ¸Ñ‚Ñу, ÑĐ»ÑытÑĐ½Ñ‹Đ³ÑÑÑ€Ñ ÑÑ„ÑшÑÑƒĐ°ÑˆÑÑ…ÑĐ¼Ñ€ÑĐºÓÑ Đ·ÑÑ„ÑĐ´Ñу ĐºÑалÑфы. ĐĐºÑÑ‹Đ»Ñ€Ñ Đ·ÑÑ…ÑшÓÑ‹ĐºÓ Đ³ÑÑƒĐ°Đ·ÑÑ€Ñ ÑÓÑшÑÑ‹, Đ·Ñ‹Ñ€ Đ·Ñ‹Đ¼ Đ·ÑĐºÑĐ¾Ñˆ Đ·ÑÑ…Đ°ÑˆĐ†Ñ Đ°Đ·Ñ„Đ°Đ³Ñƒ Đ´ÑĐ»ÑÑу Đ·ÑÑ„Ñ‹Ñ‰Ñ‹Ñ‚Ñ‹Đ½Ñ…Ñ Ñ„Đ°Đµ.", "metadata": { "languages": [ "rus" @@ -460,7 +460,7 @@ { "type": "NarrativeText", "element_id": "20509f92f090bb4ecf694ea5b01d0921", - "text": "Aja Agbet\u0254wo ple\u014bu van\u0254 gb\u025bm\u025b ko vovo\u0256eka gbesw\u025bgbesw\u025b, s\u0254to am\u025bnyinyi ko ac\u025bwo gom\u025b; wo x\u0254n\u0254 susunywin ko jim\u025bnywi so esexwe. Wo \u0256o a w\u025b n\u0254vi \u0256a\u0256a wowo n\u0254n\u0254wo gb\u0254.", + "text": "Aja AgbetÉ”wo pleÅ‹u vanÉ” gbÉ›mÉ› ko vovoÉ–eka gbeswÉ›gbeswÉ›, sÉ”to amÉ›nyinyi ko acÉ›wo gomÉ›; wo xÉ”nÉ” susunywin ko jimÉ›nywi so esexwe. Wo É–o a wÉ› nÉ”vi É–aÉ–a wowo nÉ”nÉ”wo gbÉ”.", "metadata": { "languages": [ "afr", @@ -483,7 +483,7 @@ { "type": "NarrativeText", "element_id": "f6e32446c48b0755dfcf243a8142d613", - "text": "Albanian, Tosk T\u00eb gjith\u00eb njer\u00ebzit lindin t\u00eb lir\u00eb dhe t\u00eb barabart\u00eb n\u00eb dinjitet dhe n\u00eb t\u00eb drejta. Ata kan\u00eb arsye dhe nd\u00ebrgjegje dhe duhet t\u00eb sillen ndaj nj\u00ebri tjetrit me frym\u00eb v\u00ebllaz\u00ebrimi.", + "text": "Albanian, Tosk TĂ« gjithĂ« njerĂ«zit lindin tĂ« lirĂ« dhe tĂ« barabartĂ« nĂ« dinjitet dhe nĂ« tĂ« drejta. 
Ata kanĂ« arsye dhe ndĂ«rgjegje dhe duhet tĂ« sillen ndaj njĂ«ri tjetrit me frymĂ« vĂ«llazĂ«rimi.", "metadata": { "languages": [ "sqi" @@ -504,7 +504,7 @@ { "type": "NarrativeText", "element_id": "9a69378bfb3e4825a781de59826eff73", - "text": "Alemannisch (Elsassisch) \u00c0lli Mensche k\u00f9mme m\u00ect de gliche W\u00ecrde \u00f9n Rachte \u00f9ff d\u2019Walt. Sie h\u00e0n \u00e0lli Vern\u00f9nft \u00f9n Gew\u00ecsse \u00f9n selle m\u00ect Br\u00ecederlichkeit de \u00e0ndere gejjen\u00ecwwer h\u00e0ndle.", + "text": "Alemannisch (Elsassisch) Ă€lli Mensche kĂ¹mme mìt de gliche Wìrde Ă¹n Rachte Ă¹ff d’Walt. Sie hĂ n Ă lli VernĂ¹nft Ă¹n Gewìsse Ă¹n selle mìt Brìederlichkeit de Ă ndere gejjenìwwer hĂ ndle.", "metadata": { "languages": [ "deu" @@ -525,7 +525,7 @@ { "type": "NarrativeText", "element_id": "d5de29db1ca19f8ac33afb7049462513", - "text": "Altai, Southern \u041e\u043d\u0447\u043e \u0443\u043b\u0443\u0441 \u0430\u043a\u2010\u0458\u0430\u0440\u044b\u043a\u043a\u0430 \u0458\u0430\u0439\u044b\u043c \u043b\u0430 \u0442\u0435\u04a5\u2010\u0442\u0430\u0439 \u0442\u0430\u043f\u2010\u044d\u0440\u0438\u043a\u0442\u04f1 \u0442\u0443\u0443\u043b\u0430\u0442. \u041e\u043b\u043e\u0440 \u0441\u0430\u043d\u0430\u0430\u0443\u043a\u0430\u0430\u043b\u0443 \u043b\u0430 \u0447\u0435\u043a \u043a\u04f1\u04f1\u043d\u2010\u0442\u0430\u043f\u0442\u0443 \u0431\u043e\u043b\u0443\u043f \u0431\u04f1\u0442\u043a\u0435\u043d \u043b\u0435 \u0431\u043e\u0439\u2010\u0431\u043e\u0439\u044b\u043d \u043a\u0430\u0440\u044b\u043d\u0434\u0430\u0448 \u043a\u0438\u0440\u0435\u0437\u0438 \u043a\u04e7\u0440\u04e7\u0440 \u043b\u04e7 \u0458\u04f1\u0440\u0435\u0440 \u0443\u0447\u0443\u0440\u043b\u0443.", + "text": "Altai, Southern ĐĐ½Ñ‡Đ¾ ÑƒĐ»ÑƒÑ Đ°Đºâ€Ñ˜Đ°Ñ€Ñ‹ĐºĐºĐ° Ñ˜Đ°Đ¹Ñ‹Đ¼ ла Ñ‚Đµ̉¥â€Ñ‚Đ°Đ¹ Ñ‚Đ°Đ¿â€ÑÑ€Đ¸ĐºÑ‚Ó± Ñ‚ÑƒÑƒĐ»Đ°Ñ‚. 
ĐĐ»Đ¾Ñ€ ÑĐ°Đ½Đ°Đ°ÑƒĐºĐ°Đ°Đ»Ñƒ ла Ñ‡ĐµĐº ĐºÓ±Ó±Đ½â€Ñ‚Đ°Đ¿Ñ‚Ñƒ Đ±Đ¾Đ»ÑƒĐ¿ Đ±Ó±Ñ‚ĐºĐµĐ½ ле Đ±Đ¾Đ¹â€Đ±Đ¾Đ¹Ñ‹Đ½ ĐºĐ°Ñ€Ñ‹Đ½Đ´Đ°Ñˆ ĐºĐ¸Ñ€ĐµĐ·Đ¸ ĐºÓ§Ñ€Ó§Ñ€ лӧ Ñ˜Ó±Ñ€ĐµÑ€ ÑƒÑ‡ÑƒÑ€Đ»Ñƒ.", "metadata": { "languages": [ "rus", @@ -570,7 +570,7 @@ { "type": "NarrativeText", "element_id": "d0963c28613cf0e49ccc8378af7f29b7", - "text": "Amarakaeri Aya'da aratbut katepi' eka'ta' on'pakpo ka'dik o\u0331'ne. Nog aratbut huadak o\u0331'nepo ko\u0331nigti opudo\u0331mey huadak mo'e\u0331. Aya'da huadak eka' nopoe\u0331'dik o\u0331'ne kenpa'ti dakhuea' eka' nopoe\u0331'dik o\u0331'ne kenpa'ti ko\u0331nig huama'buytaj o 'tihuapokika' ko\u0331nigti nogo\u0331meytaj tihuapokika 'dik o\u0331'ne.", + "text": "Amarakaeri Aya'da aratbut katepi' eka'ta' on'pakpo ka'dik ò±'ne. Nog aratbut huadak ò±'nepo kò±nigti opudò±mey huadak mo'è±. Aya'da huadak eka' nopoè±'dik ò±'ne kenpa'ti dakhuea' eka' nopoè±'dik ò±'ne kenpa'ti kò±nig huama'buytaj o 'tihuapokika' kò±nigti nogò±meytaj tihuapokika 'dik ò±'ne.", "metadata": { "languages": [ "ind" @@ -612,7 +612,7 @@ { "type": "Title", "element_id": "8c8d0d9098a83b293045f03fbe07358d", - "text": "\u12e8\u1230\u12cd\u1361\u120d\u1305\u1361\u1201\u1209\u1361\u1232\u12c8\u1208\u12f5\u1361\u1290\u133b\u1293\u1361\u1260\u12ad\u1265\u122d\u1293\u1361\u1260\u1218\u1265\u1275\u121d\u1361\u12a5\u12a9\u120d\u1290\u1275\u1361\u12eb\u1208\u12cd\u1361\u1290\u12cd\u1362\u1361\u12e8\u1270\u1348\u1325\u122e\u1361\u121b\u1235\u1270\u12cb\u120d\u1293\u1361\u1215\u120a\u1293\u1361\u1235\u120b\u1208\u12cd\u1361\u12a0\u1295\u12f1\u1361\u120c\u120b\u12cd\u1295\u1361\u1260\u12c8\u1295\u12f5\u121b\u121b\u127d\u1290\u1275\u1361\u1218\u1295\u1348\u1235\u1361\u1218\u1218\u120d\u12a8\u1275\u1361\u12ed\u1308\u1263\u12cb\u120d\u1362", + "text": "የሰá‹á¡áˆáŒ…á¡áˆáˆ‰á¡áˆ²á‹ˆáˆˆá‹µá¡áጻá“á¡á‰ á­á‰¥áˆ­á“á¡á‰ áˆ˜á‰¥á‰µáˆá¡á¥á©áˆáትá¡á‹«áˆˆá‹á¡áá‹á¢á¡á‹¨á‰°áˆáŒ¥áˆ®á¡áˆ›áˆµá‰°á‹‹áˆá“á¡áˆ•áˆá“á¡áˆµáˆ‹áˆˆá‹á¡á á•ዱá¡áˆŒáˆ‹á‹á•á¡á‰ á‹ˆá•ድማማችáትá¡áˆ˜á•áˆáˆµá¡áˆ˜áˆ˜áˆá¨á‰µá¡á‹­áŒˆá‰£á‹‹áˆá¢", "metadata": { 
"filetype": "text/plain", "data_source": { @@ -675,7 +675,7 @@ { "type": "NarrativeText", "element_id": "e1a81a0e10a38df3526fc4432de66ad3", - "text": "Arabic, Standard \u064a\u0648\u0644\u062f \u062c\u0645\u064a\u0639 \u0627\u0644\u0646\u0627\u0633 \u0623\u062d\u0631\u0627\u0631\u064b\u0627 \u0645\u062a\u0633\u0627\u0648\u064a\u0646 \u0641\u064a \u0627\u0644\u0643\u0631\u0627\u0645\u0629 \u0648\u0627\u0644\u062d\u0642\u0648\u0642. \u0648\u0642\u062f \u0648\u0647\u0628\u0648\u0627 \u0639\u0642\u0644\u0627\u064b \u0648\u0636\u0645\u064a\u0631\u064b\u0627 \u0648\u0639\u0644\u064a\u0647\u0645 \u0623\u0646 \u064a\u0639\u0627\u0645\u0644 \u0628\u0639\u0636\u0647\u0645 \u0628\u0639\u0636\u064b\u0627 \u0628\u0631\u0648\u062d \u0627\u0644\u0625\u062e\u0627\u0621.", + "text": "Arabic, Standard Ùولد Ø¬Ù…ÙØ¹ الناس أحرارًا متساوÙÙ† Ù٠الكرامة والحقوق. وقد وهبوا عقلاً ÙˆØ¶Ù…ÙØ±Ù‹Ø§ وعلÙهم أن ÙØ¹Ø§Ù…Ù„ بعضهم بعضًا بروح الإخاء.", "metadata": { "languages": [ "ara" @@ -696,7 +696,7 @@ { "type": "UncategorizedText", "element_id": "72d099b2761f12d204f35cc85600f8dd", - "text": "Armenian \u0532\u0578\u056c\u0578\u0580 \u0574\u0561\u0580\u0564\u056b\u056f \u056e\u0576\u057e\u0578\u0582\u0574 \u0565\u0576 \u0561\u0566\u0561\u057f \u0578\u0582 \u0570\u0561\u057e\u0561\u057d\u0561\u0580 \u056b\u0580\u0565\u0576\u0581 \u0561\u0580\u056a\u0561\u0576\u0561\u057a\u0561\u057f\u057e\u0578\u0582\u0569\u0575\u0561\u0574\u0562 \u0578\u0582 \u056b\u0580\u0561\u057e\u0578\u0582\u0576\u0584\u0576\u0565\u0580\u0578\u057e\u0589 \u0546\u0580\u0561\u0576\u0584 \u0578\u0582\u0576\u0565\u0576 \u0562\u0561\u0576\u0561\u056f\u0561\u0576\u0578\u0582\u0569\u0575\u0578\u0582\u0576 \u0578\u0582 \u056d\u056b\u0572\u0573 \u0587 \u0574\u056b\u0574\u0575\u0561\u0576\u0581 \u057a\u0565\u057f\u0584 \u0567 \u0565\u0572\u0562\u0561\u0575\u0580\u0561\u0562\u0561\u0580 \u057e\u0565\u0580\u0561\u0562\u0565\u0580\u057e\u0565\u0576\u0589", + "text": "Armenian Ô²Ơ¸Ơ¬Ơ¸Ö€ Ơ´Ơ¡Ö€Ơ¤Ơ«Ơ¯ Ơ®Ơ¶Ơ¾Ơ¸Ö‚Ơ´ Ơ¥Ơ¶ Ơ¡Ơ¦Ơ¡Ơ¿ Ơ¸Ö‚ 
Ơ°Ơ¡Ơ¾Ơ¡Ơ½Ơ¡Ö€ Ơ«Ö€Ơ¥Ơ¶Ö Ơ¡Ö€ƠªƠ¡Ơ¶Ơ¡ƠºƠ¡Ơ¿Ơ¾Ơ¸Ö‚Ơ©ƠµƠ¡Ơ´Ơ¢ Ơ¸Ö‚ Ơ«Ö€Ơ¡Ơ¾Ơ¸Ö‚Ơ¶Ö„Ơ¶Ơ¥Ö€Ơ¸Ơ¾Ö‰ Ơ†Ö€Ơ¡Ơ¶Ö„ Ơ¸Ö‚Ơ¶Ơ¥Ơ¶ Ơ¢Ơ¡Ơ¶Ơ¡Ơ¯Ơ¡Ơ¶Ơ¸Ö‚Ơ©ƠµƠ¸Ö‚Ơ¶ Ơ¸Ö‚ Ơ­Ơ«Ơ²Ơ³ Ö‡ Ơ´Ơ«Ơ´ƠµƠ¡Ơ¶Ö ƠºƠ¥Ơ¿Ö„ Ơ§ Ơ¥Ơ²Ơ¢Ơ¡ƠµÖ€Ơ¡Ơ¢Ơ¡Ö€ Ơ¾Ơ¥Ö€Ơ¡Ơ¢Ơ¥Ö€Ơ¾Ơ¥Ơ¶Ö‰", "metadata": { "languages": [ "est" @@ -717,7 +717,7 @@ { "type": "NarrativeText", "element_id": "38291b67d0eaef665797206e43651164", - "text": "Aromanian Tuti iats\u00e2li umineshts\u00e2 s-fac liberi shi egali la n\u00e2muzea shi-ndrepturli. Eali suntu h\u00e2rziti cu fichiri shi sinidisi shi lipseashti un cu alantu sh-si poart\u00e2 tu duhlu-a fr\u00e2ts\u00e2ljiljei.", + "text": "Aromanian Tuti iatsĂ¢li umineshtsĂ¢ s-fac liberi shi egali la nĂ¢muzea shi-ndrepturli. Eali suntu hĂ¢rziti cu fichiri shi sinidisi shi lipseashti un cu alantu sh-si poartĂ¢ tu duhlu-a frĂ¢tsĂ¢ljiljei.", "metadata": { "languages": [ "ron", @@ -739,7 +739,7 @@ { "type": "NarrativeText", "element_id": "6bb51b6b82df3d4800c98e8415754489", - "text": "Ash\u00e1ninka Aquempetavacaajeita maaroni atiri. Timatsi aquenqueshirejeitantari maaroni, timatsi amejeitari, ayojeiti paitarica ocameetsati antajeitiri: te oncameetsateji intsaneapitsajeiteero itsipapee. Te oncameetsateji imperanajeitee, te oncameetsateji iroashinoncaajeitee, irointi ocameetsati aacameetsatavacaajeitea.", + "text": "AshĂ¡ninka Aquempetavacaajeita maaroni atiri. Timatsi aquenqueshirejeitantari maaroni, timatsi amejeitari, ayojeiti paitarica ocameetsati antajeitiri: te oncameetsateji intsaneapitsajeiteero itsipapee. Te oncameetsateji imperanajeitee, te oncameetsateji iroashinoncaajeitee, irointi ocameetsati aacameetsatavacaajeitea.", "metadata": { "languages": [ "fin", @@ -762,7 +762,7 @@ { "type": "NarrativeText", "element_id": "ef818e559e5b9629b3da213d71f6d693", - "text": "Ash\u00e9ninka, Pichis Maaroni atziripayeeni, ovaquera intzimapaaque, eero ocantzi i\u00f1aashitacaavaitaityaari iromperanataityaari. 
Eejatzi oquemitari iro\u00f1aaca te apantyaaro amanitashireteri atziri ancanteri: \"Te pirjiperote eeroca, iriima irinta iriitaque \u00f1aaperori\". Eejatzi oquemitari te oncameethate intzime aparoni atziri antayetashityaarone caari ishinetaacairi pashine irantero. Tema maaroni ayotziro ampampithashirvaayeta, ayotziro tsicarica otzimayetzi cameethatatsiri anteri o tsicarica otzimi caariperotatsiri, irootaque ocovaperotantari iro\u00f1aaca entacotavacaayetya anquemitacaantanaquero arentzitavacaatyeeyaami ocaaquiini.", + "text": "AshĂ©ninka, Pichis Maaroni atziripayeeni, ovaquera intzimapaaque, eero ocantzi iñaashitacaavaitaityaari iromperanataityaari. Eejatzi oquemitari iroñaaca te apantyaaro amanitashireteri atziri ancanteri: \"Te pirjiperote eeroca, iriima irinta iriitaque ñaaperori\". Eejatzi oquemitari te oncameethate intzime aparoni atziri antayetashityaarone caari ishinetaacairi pashine irantero. Tema maaroni ayotziro ampampithashirvaayeta, ayotziro tsicarica otzimayetzi cameethatatsiri anteri o tsicarica otzimi caariperotatsiri, irootaque ocovaperotantari iroñaaca entacotavacaayetya anquemitacaantanaquero arentzitavacaatyeeyaami ocaaquiini.", "metadata": { "languages": [ "ita", @@ -785,7 +785,7 @@ { "type": "NarrativeText", "element_id": "5cb0bb4fdc15b35295973bd4a2247bd1", - "text": "Assyrian Neo-Aramaic \u071f\u0720 \u0712\u072a\u0722\u072b\u0710 \u0712\u072a\u071d\u0720\u0717 \u071a\u0710\u072a\u0710 \u0718\u0712\u072a\u0712\u072a \u0713\u0718 \u0710\u071d\u0729\u072a\u0710 \u0718\u0719\u0715\u0729\u0710. \u0718\u0726\u071d\u072b\u071d\u0720\u0717 \u071d\u0717\u0712\u0710 \u0717\u0718\u0722\u0710 \u0718\u0710\u0722\u071d\u072c. 
\u0712\u0718\u0715 \u0715\u0710\u0717\u0710 \u0713\u072b\u0729\u072c\u071d \u0725\u0720 \u0710\u071a\u072a\u0722\u0710 \u0713\u072a\u0713 \u0717\u0718\u071d\u0710 \u0712\u071a\u0715 \u072a\u0718\u071a\u0710 \u0715\u0710\u071a\u0722\u0718\u072c\u0710.", + "text": "Assyrian Neo-Aramaic ÜŸÜ  Ü’ÜªÜ¢Ü«Ü Ü’ÜªÜÜ Ü— ÜÜÜªÜ Ü˜Ü’ÜªÜ’Üª ܓܘ ÜÜÜ©ÜªÜ Ü˜Ü™Ü•Ü©Ü. ܘܦÜÜ«ÜÜ Ü— ÜÜ—Ü’Ü Ü—Ü˜Ü¢Ü Ü˜ÜÜ¢Üܬ. ܒܘܕ Ü•ÜÜ—Ü Ü“Ü«Ü©Ü¬Ü Ü¥Ü  ÜÜÜªÜ¢Ü Ü“ÜªÜ“ ܗܘÜÜ Ü’ÜÜ• ܪܘÜÜ Ü•ÜÜܢܘܬÜ.", "metadata": { "filetype": "text/plain", "data_source": { @@ -803,7 +803,7 @@ { "type": "NarrativeText", "element_id": "fc37a0c903b4ad45223fa0a367de3b9b", - "text": "Asturian Tolos seres humanos nacen llibres y iguales en dignid\u00e1 y drechos y, pola mor de la raz\u00f3n y la conciencia de so, han comportase hermaniblemente los unos colos otros.", + "text": "Asturian Tolos seres humanos nacen llibres y iguales en dignidĂ¡ y drechos y, pola mor de la razĂ³n y la conciencia de so, han comportase hermaniblemente los unos colos otros.", "metadata": { "languages": [ "spa" @@ -866,7 +866,7 @@ { "type": "UncategorizedText", "element_id": "4e13c433d775a93f0bb6c40cbb2d5a03", - "text": "Aymara, Central Taqpach jaqejh khuskat u\u00f1jatat\u00e4pjhewa muna\u00f1apansa, lura\u00f1apansa, amuyasi\u00f1apansa, ukatwa jilani sullkan\u00edpjhaspas ukham u\u00f1jasipjha\u00f1apawa.", + "text": "Aymara, Central Taqpach jaqejh khuskat uñjatatäpjhewa munañapansa, lurañapansa, amuyasiñapansa, ukatwa jilani sullkanĂ­pjhaspas ukham uñjasipjhañapawa.", "metadata": { "languages": [ "swa", @@ -889,7 +889,7 @@ { "type": "NarrativeText", "element_id": "8afc3caab3e458628b6f2efdb46fc6d1", - "text": "Azerbaijani, North (Cyrillic) \u0411\u04af\u0442\u04af\u043d \u0438\u043d\u0441\u0430\u043d\u043b\u0430\u0440 \u043b\u04d9\u0458\u0430\u0433\u04d9\u0442 \u0432\u04d9 \u04bb\u04af\u0433\u0443\u0433\u043b\u0430\u0440\u044b\u043d\u0430 \u049d\u04e9\u0440\u04d9 \u0430\u0437\u0430\u0434 \u0432\u04d9 \u0431\u04d9\u0440\u0430\u0431\u04d9\u0440 
\u0434\u043e\u0493\u0443\u043b\u0443\u0440\u043b\u0430\u0440. \u041e\u043d\u043b\u0430\u0440\u044b\u043d \u0448\u04af\u0443\u0440\u043b\u0430\u0440\u044b \u0432\u04d9 \u0432\u0438\u04b9\u0434\u0430\u043d\u043b\u0430\u0440\u044b \u0432\u0430\u0440 \u0432\u04d9 \u0431\u0438\u0440-\u0431\u0438\u0440\u043b\u04d9\u0440\u0438\u043d\u04d9 \u043c\u04af\u043d\u0430\u0441\u0438\u0431\u04d9\u0442\u0434\u04d9 \u0433\u0430\u0440\u0434\u0430\u0448\u043b\u044b\u0433 \u0440\u0443\u04bb\u0443\u043d\u0434\u0430 \u0434\u0430\u0432\u0440\u0430\u043d\u043c\u0430\u043b\u044b\u0434\u044b\u0440\u043b\u0430\u0440.", + "text": "Azerbaijani, North (Cyrillic) Đ‘̉¯Ñ‚̉¯Đ½ Đ¸Đ½ÑĐ°Đ½Đ»Đ°Ñ€ Đ»Ó™Ñ˜Đ°Đ³Ó™Ñ‚ Đ²Ó™ ̉»̉¯Đ³ÑƒĐ³Đ»Đ°Ñ€Ñ‹Đ½Đ° ̉Ó©Ñ€Ó™ азад Đ²Ó™ Đ±Ó™Ñ€Đ°Đ±Ó™Ñ€ Đ´Đ¾̉“ÑƒĐ»ÑƒÑ€Đ»Đ°Ñ€. ĐĐ½Đ»Đ°Ñ€Ñ‹Đ½ ш̉¯ÑƒÑ€Đ»Đ°Ñ€Ñ‹ Đ²Ó™ Đ²Đ¸̉¹Đ´Đ°Đ½Đ»Đ°Ñ€Ñ‹ Đ²Đ°Ñ€ Đ²Ó™ Đ±Đ¸Ñ€-Đ±Đ¸Ñ€Đ»Ó™Ñ€Đ¸Đ½Ó™ Đ¼̉¯Đ½Đ°ÑĐ¸Đ±Ó™Ñ‚Đ´Ó™ Đ³Đ°Ñ€Đ´Đ°ÑˆĐ»Ñ‹Đ³ ру̉»ÑƒĐ½Đ´Đ° Đ´Đ°Đ²Ñ€Đ°Đ½Đ¼Đ°Đ»Ñ‹Đ´Ñ‹Ñ€Đ»Đ°Ñ€.", "metadata": { "languages": [ "rus", @@ -911,7 +911,7 @@ { "type": "NarrativeText", "element_id": "6d9f8766b1812e209f1a59654443299c", - "text": "Azerbaijani, North (Latin) B\u00fct\u00fcn insanlar l\u0259yaq\u0259t v\u0259 h\u00fcquqlar\u0131na g\u00f6r\u0259 azad v\u0259 b\u0259rab\u0259r do\u011fulurlar. Onlar\u0131n \u015f\u00fcurlar\u0131 v\u0259 vicdanlar\u0131 var v\u0259 bir-birl\u0259rin\u0259 m\u00fcnasib\u0259td\u0259 qarda\u015fl\u0131q ruhunda davranmal\u0131d\u0131rlar.", + "text": "Azerbaijani, North (Latin) BĂ¼tĂ¼n insanlar lÉ™yaqÉ™t vÉ™ hĂ¼quqlarına görÉ™ azad vÉ™ bÉ™rabÉ™r doÄŸulurlar. Onların ÅŸĂ¼urları vÉ™ vicdanları var vÉ™ bir-birlÉ™rinÉ™ mĂ¼nasibÉ™tdÉ™ qardaÅŸlıq ruhunda davranmalıdırlar.", "metadata": { "languages": [ "tur" @@ -932,7 +932,7 @@ { "type": "NarrativeText", "element_id": "3681d23b771b9cf26263ab194af3430d", - "text": "Baatonum Ba t\u0254mbu kpuro marawa ba tii m\u0254, ba n\u025b, girima ka saria s\u0254\u0254. 
Ba ra bwisiku, ba dasabu m\u0254, ma n weene ba n waasin\u025b m\u025brobisiru s\u0254\u0254.", + "text": "Baatonum Ba tÉ”mbu kpuro marawa ba tii mÉ”, ba nÉ›, girima ka saria sɔɔ. Ba ra bwisiku, ba dasabu mÉ”, ma n weene ba n waasinÉ› mÉ›robisiru sɔɔ.", "metadata": { "languages": [ "som", @@ -975,7 +975,7 @@ { "type": "NarrativeText", "element_id": "394114d333ed34e0add89b5e9079d474", - "text": "Bamanankan Hadamaden b\u025b\u025b danmak\u025b\u0272\u025bnen b\u025b bange, danbe ni josira la. Hakili ni taasi b\u2019u b\u025b\u025b la, wa u ka kan ka baden\u0272asira de waleya u ni \u0272\u0254g\u0254n c\u025b.", + "text": "Bamanankan Hadamaden bɛɛ danmakɛɲɛnen bÉ› bange, danbe ni josira la. Hakili ni taasi b’u bɛɛ la, wa u ka kan ka badenɲasira de waleya u ni ɲɔgÉ”n cÉ›.", "metadata": { "languages": [ "som", @@ -998,7 +998,7 @@ { "type": "NarrativeText", "element_id": "31e2922fd7a67918fa2a09744965a970", - "text": "Bamun Pe na\u0302 mve\u0301 gu\u0301 puen nyu\u0308tu po\u0302 te mbe ku\u0301 ghe\u0301t ngam pua ngu\u0301enengu\u0301e mbe te wu\u0302me nsebe pua pa mfe\u0301e\u0301ke\u0302t. Pen a\u0302 ntu\u0301m te mbe ku\u0301 rem ngam pua fabshe ngam, a nshi nji\u0302r\u2019ap ne yi nsha\u0302ne nge\u0301tne nga shap po\u0302 te wupme ponta\u0302.", + "text": "Bamun Pe nà‚ mvè gù puen nyùˆtu pò‚ te mbe kù ghèt ngam pua ngùenengùe mbe te wù‚me nsebe pua pa mfèèkè‚t. Pen à‚ ntùm te mbe kù rem ngam pua fabshe ngam, a nshi njì‚r’ap ne yi nshà‚ne ngètne nga shap pò‚ te wupme pontà‚.", "metadata": { "languages": [ "sqi" @@ -1019,7 +1019,7 @@ { "type": "NarrativeText", "element_id": "c5815bd56d9b0f7114cfa825514698ca", - "text": "Baoul\u00e9 Sran mun be ngba, k\u025b be wu be \u0254, be ngba be s\u025b, f\u0254ndi nun, sran-mmala nun. Be si akundanbu, be si su \u0254 fata k\u025b sran mun be tran'n, be tran aniaan nun tranl\u025b.", + "text": "BaoulĂ© Sran mun be ngba, kÉ› be wu be É”, be ngba be sÉ›, fÉ”ndi nun, sran-mmala nun. 
Be si akundanbu, be si su É” fata kÉ› sran mun be tran'n, be tran aniaan nun tranlÉ›.", "metadata": { "languages": [ "ind" @@ -1040,7 +1040,7 @@ { "type": "NarrativeText", "element_id": "f937bd218ac832a520fee7be14b4e89c", - "text": "Bari \u014autu li\u014b a yu\u014bwe kana, jojo i to\u010firi ko \u010fekesi ko ti se tokitaki ko \u2018b\u00f6rik ko mul\u00f6k\u00f6tyo lo tolu\u014baseran. Se a \u010foka ko denet na kulya na\u2019but ko narok.", + "text": "Bari Åutu liÅ‹ a yuÅ‹we kana, jojo i toÄiri ko Äekesi ko ti se tokitaki ko ‘börik ko mulökötyo lo toluÅ‹aseran. Se a Äoka ko denet na kulya na’but ko narok.", "metadata": { "languages": [ "hrv", @@ -1085,7 +1085,7 @@ { "type": "NarrativeText", "element_id": "5ce714cfa1def0c0d951bf7bff485500", - "text": "Belarusan \u0423\u0441\u0435 \u043b\u044e\u0434\u0437\u0456 \u043d\u0430\u0440\u0430\u0434\u0436\u0430\u044e\u0446\u0446\u0430 \u0441\u0432\u0430\u0431\u043e\u0434\u043d\u044b\u043c\u0456 \u0456 \u0440\u043e\u045e\u043d\u044b\u043c\u0456 \u045e \u0441\u0432\u0430\u0451\u0439 \u0433\u043e\u0434\u043d\u0430\u0441\u0446\u0456 \u0456 \u043f\u0440\u0430\u0432\u0430\u0445. \u042f\u043d\u044b \u043d\u0430\u0434\u0437\u0435\u043b\u0435\u043d\u044b \u0440\u043e\u0437\u0443\u043c\u0430\u043c \u0456 \u0441\u0443\u043c\u043b\u0435\u043d\u043d\u0435\u043c \u0456 \u043f\u0430\u0432\u0456\u043d\u043d\u044b \u0441\u0442\u0430\u0432\u0456\u0446\u0446\u0430 \u0430\u0434\u0437\u0456\u043d \u0434\u0430 \u0430\u0434\u043d\u0430\u0433\u043e \u045e \u0434\u0443\u0445\u0443 \u0431\u0440\u0430\u0446\u0442\u0432\u0430.", + "text": "Belarusan Đ£Ñе Đ»ÑĐ´Đ·Ñ– Đ½Đ°Ñ€Đ°Đ´Đ¶Đ°ÑÑ†Ñ†Đ° ÑĐ²Đ°Đ±Đ¾Đ´Đ½Ñ‹Đ¼Ñ– Ñ– Ñ€Đ¾ÑĐ½Ñ‹Đ¼Ñ– Ñ ÑĐ²Đ°Ñ‘Đ¹ Đ³Đ¾Đ´Đ½Đ°Ñці Ñ– Đ¿Ñ€Đ°Đ²Đ°Ñ…. 
Đ¯Đ½Ñ‹ Đ½Đ°Đ´Đ·ĐµĐ»ĐµĐ½Ñ‹ Ñ€Đ¾Đ·ÑƒĐ¼Đ°Đ¼ Ñ– ÑÑƒĐ¼Đ»ĐµĐ½Đ½ĐµĐ¼ Ñ– Đ¿Đ°Đ²Ñ–Đ½Đ½Ñ‹ ÑÑ‚Đ°Đ²Ñ–Ñ†Ñ†Đ° Đ°Đ´Đ·Ñ–Đ½ да Đ°Đ´Đ½Đ°Đ³Đ¾ Ñ Đ´ÑƒÑ…Ñƒ Đ±Ñ€Đ°Ñ†Ñ‚Đ²Đ°.", "metadata": { "languages": [ "ukr", @@ -1128,7 +1128,7 @@ { "type": "UncategorizedText", "element_id": "bb5acaee87121a890d36cb7afd3ad15a", - "text": "Bengali \u09b8\u09ae\u09b8\u09cd\u09a4 \u09ae\u09be\u09a8\u09c1\u09b7 \u09b8\u09cd\u09ac\u09be\u09a7\u09c0\u09a8\u09ad\u09be\u09ac\u09c7 \u09b8\u09ae\u09be\u09a8 \u09ae\u09b0\u09cd\u09af\u09be\u09a6\u09be \u098f\u09ac\u0982 \u0985\u09a7\u09bf\u0995\u09be\u09b0 \u09a8\u09bf\u09af\u09bc\u09c7 \u099c\u09a8\u09cd\u09ae\u0997\u09cd\u09b0\u09b9\u09a3 \u0995\u09b0\u09c7\u0964 \u09a4\u09be\u0981\u09a6\u09c7\u09b0 \u09ac\u09bf\u09ac\u09c7\u0995 \u098f\u09ac\u0982 \u09ac\u09c1\u09a6\u09cd\u09a7\u09bf \u0986\u099b\u09c7; \u09b8\u09c1\u09a4\u09b0\u09be\u0982 \u09b8\u0995\u09b2\u09c7\u09b0\u0987 \u098f\u0995\u09c7 \u0985\u09aa\u09b0\u09c7\u09b0 \u09aa\u09cd\u09b0\u09a4\u09bf \u09ad\u09cd\u09b0\u09be\u09a4\u09c3\u09a4\u09cd\u09ac\u09b8\u09c1\u09b2\u09ad \u09ae\u09a8\u09cb\u09ad\u09be\u09ac \u09a8\u09bf\u09df\u09c7 \u0986\u099a\u09b0\u09a3 \u0995\u09b0\u09be \u0989\u099a\u09bf\u09a4\u0964", + "text": "Bengali সমসà§à¦¤ মানà§à¦· সà§à¦¬à¦¾à¦§à§€à¦¨à¦­à¦¾à¦¬à§‡ সমান মরà§à¦¯à¦¾à¦¦à¦¾ à¦à¦¬à¦‚ অধিকার নিয়ে জনà§à¦®à¦—à§à¦°à¦¹à¦£ করে। তাà¦à¦¦à§‡à¦° বিবেক à¦à¦¬à¦‚ বà§à¦¦à§à¦§à¦¿ আছে; সà§à¦¤à¦°à¦¾à¦‚ সকলেরই à¦à¦•ে অপরের পà§à¦°à¦¤à¦¿ ভà§à¦°à¦¾à¦¤à§ƒà¦¤à§à¦¬à¦¸à§à¦²à¦­ মনোভাব নিয়ে আà¦à¦°à¦£ করা উà¦à¦¿à¦¤à¥¤", "metadata": { "languages": [ "ben" @@ -1149,7 +1149,7 @@ { "type": "UncategorizedText", "element_id": "d5919948b12c6b7e2c5179487170dd51", - "text": "Bhojpuri \u0938\u092c\u0939\u093f \u0932\u094b\u0915\u093e\u0928\u093f \u0906\u091c\u093e\u0926\u0947 \u091c\u092e\u094d\u092e\u0947\u0932\u093e \u0906\u0913\u0930 \u0913\u0916\u093f\u0928\u093f\u092f\u094b \u0915\u0947 \u092c\u0930\u093e\u092c\u0930 \u0938\u092e\u094d\u092e\u093e\u0928 \u0906\u0913\u0930 \u0905\u0927\u093f\u0915\u093e\u0930 
\u092a\u094d\u0930\u093e\u092a\u094d\u0924 \u0939\u0935\u0947\u0964 \u0913\u0916\u093f\u0928\u093f\u092f\u094b \u0915\u0947 \u092a\u093e\u0938 \u0938\u092e\u091d-\u092c\u0942\u091d \u0906\u0913\u0930 \u0905\u0902\u0924:\u0915\u0930\u0923 \u0915\u0947 \u0906\u0935\u093e\u091c \u0939\u094b\u0916\u0924\u093e \u0906\u0913\u0930 \u0939\u0941\u0928\u0915\u094b \u0915\u0947 \u0926\u094b\u0938\u0930\u093e \u0915\u0947 \u0938\u093e\u0925 \u092d\u093e\u0908\u091a\u093e\u0930\u093e \u0915\u0947 \u092c\u0947\u0935\u0939\u093e\u0930 \u0915\u0930\u0947 \u0915\u0947 \u0939\u094b\u0916\u0932\u093e\u0964", + "text": "Bhojpuri सबहि लोकानि आजादे जमà¥à¤®à¥‡à¤²à¤¾ आओर ओखिनियो के बराबर समà¥à¤®à¤¾à¤¨ आओर अधिकार पà¥à¤°à¤¾à¤ªà¥à¤¤ हवे। ओखिनियो के पास समà¤-बूठआओर अंत:करण के आवाज होखता आओर हà¥à¤¨à¤•ो के दोसरा के साथ भाईà¤à¤¾à¤°à¤¾ के बेवहार करे के होखला।", "metadata": { "languages": [ "hin" @@ -1214,7 +1214,7 @@ { "type": "NarrativeText", "element_id": "09176e19ded6b0ff879ead0799cc2302", - "text": "Bora P\u00e1meere \u00ed\u00ed\u00f1\u00faj\u0268ri me\u00edjcyame ts\u00e1 m\u00fah\u00f3j\u0268\u0301s\u0268\u0301 pa\u00f1\u00e9 \u0268\u0301cub\u00e1hr\u00e1d\u00fa me\u00edjcy\u00e1\u00edtyur\u00f3ne. P\u00e1meere tsahd\u00far\u00e9 im\u00ed me\u00edjcyame mew\u00e1jy\u00fajcats\u00ed\u00f1e m\u00e9p\u0268\u0301\u00e1\u00e1b\u00f3jcats\u00ediy\u00e1 tsaat\u00e9k\u00e9 \u00e9hd\u0268\u0301\u0208\u0301v\u00e1llet\u00fam\u00e9 \u00e9hne m\u00fau m\u00e9pa\u00f1\u00e9t\u00fa\u00e9n\u00e9 nahb\u00e9muma me\u00edjcyadu.", + "text": "Bora PĂ¡meere Ă­Ă­Ă±Ăºjɨri meĂ­jcyame tsĂ¡ mĂºhĂ³jɨ̀sɨ̀ pañé ɨ̀cubĂ¡hrĂ¡dĂº meĂ­jcyĂ¡Ă­tyurĂ³ne. 
PĂ¡meere tsahdĂºrĂ© imĂ­ meĂ­jcyame mewĂ¡jyĂºjcatsíñe mĂ©pɨ̀Ă¡Ă¡bĂ³jcatsĂ­iyĂ¡ tsaatĂ©kĂ© Ă©hdɨ̀Ȉ̀vĂ¡lletĂºmĂ© Ă©hne mĂºu mĂ©pañétĂºĂ©nĂ© nahbĂ©muma meĂ­jcyadu.", "metadata": { "languages": [ "hun" @@ -1235,7 +1235,7 @@ { "type": "NarrativeText", "element_id": "5e3ff47fa6202cd3f10a179ea2b898e3", - "text": "Bosnian (Cyrillic) \u0421\u0432\u0430 \u0459\u0443\u0434\u0441\u043a\u0430 \u0431\u0438\u045b\u0430 \u0440\u0430\u045b\u0430\u0458\u0443 \u0441\u0435 \u0441\u043b\u043e\u0431\u043e\u0434\u043d\u0430 \u0438 \u0458\u0435\u0434\u043d\u0430\u043a\u0430 \u0443 \u0434\u043e\u0441\u0442\u043e\u0458\u0430\u043d\u0441\u0442\u0432\u0443 \u0438 \u043f\u0440\u0430\u0432\u0438\u043c\u0430. \u041e\u043d\u0430 \u0441\u0443 \u043e\u0431\u0434\u0430\u0440\u0435\u043d\u0430 \u0440\u0430\u0437\u0443\u043c\u043e\u043c \u0438 \u0441\u0432\u0438\u0458\u0435\u0448\u045b\u0443 \u0438 \u0442\u0440\u0435\u0431\u0430 \u0434\u0430 \u0458\u0435\u0434\u043d\u043e \u043f\u0440\u0435\u043c\u0430 \u0434\u0440\u0443\u0433\u043e\u043c\u0435 \u043f\u043e\u0441\u0442\u0443\u043f\u0430\u0458\u0443 \u0443 \u0434\u0443\u0445\u0443 \u0431\u0440\u0430\u0442\u0441\u0442\u0432\u0430.", + "text": "Bosnian (Cyrillic) Đ¡Đ²Đ° Ñ™ÑƒĐ´ÑĐºĐ° Đ±Đ¸Ñ›Đ° Ñ€Đ°Ñ›Đ°Ñ˜Ñƒ Ñе ÑĐ»Đ¾Đ±Đ¾Đ´Đ½Đ° и Ñ˜ĐµĐ´Đ½Đ°ĐºĐ° у Đ´Đ¾ÑÑ‚Đ¾Ñ˜Đ°Đ½ÑÑ‚Đ²Ñƒ и Đ¿Ñ€Đ°Đ²Đ¸Đ¼Đ°. ĐĐ½Đ° Ñу Đ¾Đ±Đ´Đ°Ñ€ĐµĐ½Đ° Ñ€Đ°Đ·ÑƒĐ¼Đ¾Đ¼ и ÑĐ²Đ¸Ñ˜ĐµÑˆÑ›Ñƒ и Ñ‚Ñ€ĐµĐ±Đ° да Ñ˜ĐµĐ´Đ½Đ¾ Đ¿Ñ€ĐµĐ¼Đ° Đ´Ñ€ÑƒĐ³Đ¾Đ¼Đµ Đ¿Đ¾ÑÑ‚ÑƒĐ¿Đ°Ñ˜Ñƒ у Đ´ÑƒÑ…Ñƒ Đ±Ñ€Đ°Ñ‚ÑÑ‚Đ²Đ°.", "metadata": { "languages": [ "mkd" @@ -1256,7 +1256,7 @@ { "type": "NarrativeText", "element_id": "8918cf337af35db75c0b7e3a98572814", - "text": "Bosnian (Latin) Sva ljudska bi\u0107a ra\u0111aju se slobodna i jednaka u dostojanstvu i pravima. Ona su obdarena razumom i svije\u0161\u0107u i treba da jedno prema drugome postupaju u duhu bratstva.", + "text": "Bosnian (Latin) Sva ljudska bića raÄ‘aju se slobodna i jednaka u dostojanstvu i pravima. 
Ona su obdarena razumom i sviješću i treba da jedno prema drugome postupaju u duhu bratstva.", "metadata": { "languages": [ "hrv" @@ -1277,7 +1277,7 @@ { "type": "NarrativeText", "element_id": "4f74a58266d23d68a787e2a91434a97d", - "text": "Breton Dieub ha par en o dellezegezh hag o gwirio\u00f9 eo ganet an holl dud. Poell ha skiant zo dezho ha dleout a reont beva\u00f1 an eil gant egile en ur spered a genvreudeuriezh.", + "text": "Breton Dieub ha par en o dellezegezh hag o gwirioĂ¹ eo ganet an holl dud. Poell ha skiant zo dezho ha dleout a reont bevañ an eil gant egile en ur spered a genvreudeuriezh.", "metadata": { "languages": [ "nld", @@ -1321,7 +1321,7 @@ { "type": "NarrativeText", "element_id": "24a3cf3bd02d17e2f2b065bab51c8e70", - "text": "Bulgarian \u0412\u0441\u0438\u0447\u043a\u0438 \u0445\u043e\u0440\u0430 \u0441\u0435 \u0440\u0430\u0436\u0434\u0430\u0442 \u0441\u0432\u043e\u0431\u043e\u0434\u043d\u0438 \u0438 \u0440\u0430\u0432\u043d\u0438 \u043f\u043e \u0434\u043e\u0441\u0442\u043e\u0439\u043d\u0441\u0442\u0432\u043e \u0438 \u043f\u0440\u0430\u0432\u0430. \u0422\u0435 \u0441\u0430 \u043d\u0430\u0434\u0430\u0440\u0435\u043d\u0438 \u0441 \u0440\u0430\u0437\u0443\u043c \u0438 \u0441\u044a\u0432\u0435\u0441\u0442 \u0438 \u0441\u043b\u0435\u0434\u0432\u0430 \u0434\u0430 \u0441\u0435 \u043e\u0442\u043d\u0430\u0441\u044f\u0442 \u043f\u043e\u043c\u0435\u0436\u0434\u0443 \u0441\u0438 \u0432 \u0434\u0443\u0445 \u043d\u0430 \u0431\u0440\u0430\u0442\u0441\u0442\u0432\u043e.", + "text": "Bulgarian Đ’ÑĐ¸Ñ‡ĐºĐ¸ Ñ…Đ¾Ñ€Đ° Ñе Ñ€Đ°Đ¶Đ´Đ°Ñ‚ ÑĐ²Đ¾Đ±Đ¾Đ´Đ½Đ¸ и Ñ€Đ°Đ²Đ½Đ¸ Đ¿Đ¾ Đ´Đ¾ÑÑ‚Đ¾Đ¹Đ½ÑÑ‚Đ²Đ¾ и Đ¿Ñ€Đ°Đ²Đ°. 
Đ¢Đµ Ñа Đ½Đ°Đ´Đ°Ñ€ĐµĐ½Đ¸ Ñ Ñ€Đ°Đ·ÑƒĐ¼ и ÑÑĐ²ĐµÑÑ‚ и ÑĐ»ĐµĐ´Đ²Đ° да Ñе Đ¾Ñ‚Đ½Đ°ÑÑÑ‚ Đ¿Đ¾Đ¼ĐµĐ¶Đ´Ñƒ Ñи Đ² Đ´ÑƒÑ… Đ½Đ° Đ±Ñ€Đ°Ñ‚ÑÑ‚Đ²Đ¾.", "metadata": { "languages": [ "bul" @@ -1342,7 +1342,7 @@ { "type": "NarrativeText", "element_id": "61589cb2ca0346e6af7f49a73b4125b3", - "text": "Bulu Abiali bod bese, tege ai sesala, bene etie dzia a mis memvende y'enyi\u00f1, dzom dzia etu fili nk\u00f3b\u00f3, fili ntsogan, fili mboan. Ve abiali te, mod ose ayem dze ene abe, dze ene mbe\u00f1 asu e mod mbog antoa ai mfi na enyi\u00f1 ewulu mezen mene sosoo.", + "text": "Bulu Abiali bod bese, tege ai sesala, bene etie dzia a mis memvende y'enyiñ, dzom dzia etu fili nkĂ³bĂ³, fili ntsogan, fili mboan. Ve abiali te, mod ose ayem dze ene abe, dze ene mbeñ asu e mod mbog antoa ai mfi na enyiñ ewulu mezen mene sosoo.", "metadata": { "languages": [ "ron", @@ -1366,7 +1366,7 @@ { "type": "UncategorizedText", "element_id": "6dbacafdbc68b6ba0689b2d27b2ede49", - "text": "Burmese \u101c\u1030\u1010\u102d\u102f\u1004\u103a\u1038\u101e\u100a\u103a \u1010\u1030\u100a\u102e \u101c\u103d\u1010\u103a\u101c\u1015\u103a\u101e\u1031\u102c \u1002\u102f\u100f\u103a\u101e\u102d\u1000\u1039\u1001\u102c\u1016\u103c\u1004\u1037\u103a \u101c\u100a\u103a\u1038\u1000\u1031\u102c\u1004\u103a\u1038\u104a \u1010\u1030\u100a\u102e\u101c\u103d\u1010\u103a\u101c\u1015\u103a\u101e\u1031\u102c \u1021\u1001\u103d\u1004\u1037\u103a\u1021\u101b\u1031\u1038\u1019\u103b\u102c\u1038\u1016\u103c\u1004\u1037\u103a \u101c\u100a\u103a\u1038\u1000\u1031\u102c\u1004\u103a\u1038\u104a \u1019\u103d\u1031\u1038\u1016\u103d\u102c\u1038\u101c\u102c\u101e\u1030\u1019\u103b\u102c\u1038 \u1016\u103c\u1005\u103a\u101e\u100a\u103a\u104b \u1011\u102d\u102f\u101e\u1030\u1010\u102d\u102f\u1037\u104c \u1015\u102d\u102f\u1004\u103a\u1038\u1001\u103c\u102c\u1038 \u101d\u1031\u1016\u1014\u103a\u1010\u1010\u103a\u101e\u1031\u102c \u1009\u102c\u100f\u103a\u1014\u103e\u1004\u1037\u103a \u1000\u103b\u1004\u1037\u103a\u101d\u1010\u103a 
\u101e\u102d\u1010\u1010\u103a\u101e\u1031\u102c \u1005\u102d\u1010\u103a\u1010\u102d\u102f\u1037\u101b\u103e\u102d\u1000\u103c\u104d \u1011\u102d\u102f\u101e\u1030\u1010\u102d\u102f\u1037\u101e\u100a\u103a \u1021\u1001\u103b\u1004\u103a\u1038\u1001\u103b\u1004\u103a\u1038 \u1019\u1031\u1010\u1039\u1010\u102c\u1011\u102c\u1038\u104d \u1006\u1000\u103a\u1006\u1036\u1000\u103b\u1004\u1037\u103a\u101e\u102f\u1036\u1038\u101e\u1004\u1037\u103a\u104f\u104b", + "text": "Burmese လူá€á€­á€¯á€„်းá€á€á€º á€á€°á€á€® လွá€á€ºá€œá€•်á€á€±á€¬ ဂုá€á€ºá€á€­á€€á€¹á€á€¬á€–ြင့် လá€á€ºá€¸á€€á€±á€¬á€„်းá á€á€°á€á€®á€œá€½á€á€ºá€œá€•်á€á€±á€¬ အá€á€½á€„့်အရေးများဖြင့် လá€á€ºá€¸á€€á€±á€¬á€„်းá မွေးဖွားလာá€á€°á€™á€»á€¬á€¸ ဖြစ်á€á€á€ºá‹ ထိုá€á€°á€á€­á€¯á€·áŒ ပိုင်းá€á€¼á€¬á€¸ á€á€±á€–န်á€á€á€ºá€á€±á€¬ ဉာá€á€ºá€”ှင့် ကျင့်á€á€á€º á€á€­á€á€á€ºá€á€±á€¬ စိá€á€ºá€á€­á€¯á€·á€›á€¾á€­á€€á€¼á ထိုá€á€°á€á€­á€¯á€·á€á€á€º အá€á€»á€„်းá€á€»á€„်း မေá€á€¹á€á€¬á€‘ားá ဆက်ဆံကျင့်á€á€¯á€¶á€¸á€á€„့်áá‹", "metadata": { "filetype": "text/plain", "data_source": { @@ -1384,7 +1384,7 @@ { "type": "NarrativeText", "element_id": "7b5c1459fc45a2821c0d05cd98c1996f", - "text": "Bushi \u0181inadamu djabi nitirahinyi an-nafasi, reu bokeu mira\u014ba amin\u2019ni usheu ndreka haki. Reu teraka ndreka \u00e3kili ndreka hikima, amin\u2019ni zenyi, reu nikulazimu nisi twera\u014ba nin-fihava\u014ba reu sambi reu.", + "text": "Bushi Æinadamu djabi nitirahinyi an-nafasi, reu bokeu miraÅ‹a amin’ni usheu ndreka haki. Reu teraka ndreka Ă£kili ndreka hikima, amin’ni zenyi, reu nikulazimu nisi tweraÅ‹a nin-fihavaÅ‹a reu sambi reu.", "metadata": { "languages": [ "swa" @@ -1448,7 +1448,7 @@ { "type": "NarrativeText", "element_id": "296f3e08ce32c544b7ce3922abf32c6c", - "text": "Cashibo-Cacataibo Ui uni cara 'iti ic\u00eb axbi ca b\u00ebtsi unib\u00eb gobiernon\u00ebn isc\u00ebx s\u00ebn\u00ebn it\u00ed ic\u00ebn. 
Ui cara ain tita ain papa 'iaxa quixun sinanquinma ca gobiernon\u00ebn sinanc\u00ebx ax b\u00ebtsib\u00eb s\u00ebn\u00ebn 'ic\u00ebn. Camaxunbi ca sinanti 'unanin. Camaxunbi ca a\u00f1u \u00f1u ati cara as\u00e1bi 'ic\u00ebn, a\u00f1u \u00f1u 'ati cara 'aisama 'ic\u00eb quixun 'unanti 'ic\u00ebn. Usa 'ain ca camaxbi ain xuc\u00ebnb\u00eb 'ic\u00ebsaribiti nuiananti 'ic\u00ebn.", + "text": "Cashibo-Cacataibo Ui uni cara 'iti icĂ« axbi ca bĂ«tsi unibĂ« gobiernonĂ«n iscĂ«x sĂ«nĂ«n itĂ­ icĂ«n. Ui cara ain tita ain papa 'iaxa quixun sinanquinma ca gobiernonĂ«n sinancĂ«x ax bĂ«tsibĂ« sĂ«nĂ«n 'icĂ«n. Camaxunbi ca sinanti 'unanin. Camaxunbi ca añu ñu ati cara asĂ¡bi 'icĂ«n, añu ñu 'ati cara 'aisama 'icĂ« quixun 'unanti 'icĂ«n. Usa 'ain ca camaxbi ain xucĂ«nbĂ« 'icĂ«saribiti nuiananti 'icĂ«n.", "metadata": { "languages": [ "sqi", @@ -1492,7 +1492,7 @@ { "type": "NarrativeText", "element_id": "75c025da4f4c95d2f428dc459b739bef", - "text": "Catalan-Valencian-Balear Tots els \u00e9ssers humans neixen lliures i iguals en dignitat i en drets. S\u00f3n dotats de ra\u00f3 i de consci\u00e8ncia, i han de comportar-se fraternalment els uns amb els altres.", + "text": "Catalan-Valencian-Balear Tots els Ă©ssers humans neixen lliures i iguals en dignitat i en drets. SĂ³n dotats de raĂ³ i de consciència, i han de comportar-se fraternalment els uns amb els altres.", "metadata": { "languages": [ "cat" @@ -1534,7 +1534,7 @@ { "type": "NarrativeText", "element_id": "346a128271cb055071a9b9d4548d0488", - "text": "Chachi Naaju chachilla bain mu' chachilla bain na kayatu tichiba bulla jutyu naakendya'ba kenu deechu taa na kayamu deju, tsenminya,naaju ju\u00f1u bain ne tsaave ti', uukavinu jutyu naa, tideechu juuchi bain, mubain mubain tsaren dejuve, tsenmin shilli pensangenu pude deju'. 
mitya, tsenr)1in ura' kendu bain ura' kendyu' bain mide' mitya muba mu bain veta' veta' ura' keewaawaa kenuu dejuve.", + "text": "Chachi Naaju chachilla bain mu' chachilla bain na kayatu tichiba bulla jutyu naakendya'ba kenu deechu taa na kayamu deju, tsenminya,naaju juñu bain ne tsaave ti', uukavinu jutyu naa, tideechu juuchi bain, mubain mubain tsaren dejuve, tsenmin shilli pensangenu pude deju'. mitya, tsenr)1in ura' kendu bain ura' kendyu' bain mide' mitya muba mu bain veta' veta' ura' keewaawaa kenuu dejuve.", "metadata": { "languages": [ "ind", @@ -1556,7 +1556,7 @@ { "type": "UncategorizedText", "element_id": "0b1ae7cf56e3557ef9acecc99806172b", - "text": "Chakma \ud804\udd1d\ud804\udd2c\ud804\udd07\ud804\udd34 \ud804\udd1f\ud804\udd1a\ud804\udd2a\ud804\udd0c\ud804\udd34 \ud804\udd1a\ud804\udd28\ud804\udd22\ud804\udd28\ud804\udd1e\ud804\udd28\ud804\udd23\ud804\udd28 \ud804\udd25\ud804\udd27\ud804\udd01 \ud804\udd03\ud804\udd28\ud804\udd0c\ud804\udd34\ud804\udd0e\ud804\udd2e\ud804\udd16\ud804\udd34 \ud804\udd03\ud804\udd33\ud804\udd03 \ud804\udd03\ud804\udd07\ud804\udd34\ud804\udd07\ud804\udd25\ud804\udd01 \ud804\udd1a\ud804\udd28\ud804\udd1a\ud804\udd2c\ud804\udd2d \ud804\udd0e\ud804\udd27\ud804\udd1a\ud804\udd34\ud804\udd1f\ud804\udd1a\ud804\udd34\ud804\udd41 \ud804\udd16\ud804\udd22\ud804\udd22\ud804\udd34 \ud804\udd03\ud804\udd2c\ud804\udd18 \ud804\udd03\ud804\udd33\ud804\udd03 \ud804\udd1d\ud804\udd2a\ud804\udd16\ud804\udd34\ud804\udd19\ud804\udd28 \ud804\udd03\ud804\udd0a\ud804\udd2c; \ud804\udd25\ud804\udd2c\ud804\udd1a\ud804\udd27\ud804\udd16\ud804\udd33\ud804\udd20\ud804\udd34 \ud804\udd1d\ud804\udd2c\ud804\udd07\ud804\udd34\ud804\udd05\ud804\udd1a\ud804\udd27\ud804\udd22\ud804\udd34 \ud804\udd03\ud804\udd2c\ud804\udd07\ud804\udd34\ud804\udd0e\ud804\udd27\ud804\udd1a\ud804\udd34 \ud804\udd03\ud804\udd22\ud804\udd2c\ud804\udd07\ud804\udd34 \ud804\udd0e\ud804\udd27\ud804\udd1a\ud804\udd27\ud804\udd22\ud804\udd34 
\ud804\udd1b\ud804\udd33\ud804\udd22\ud804\udd27\ud804\udd16\ud804\udd28 \ud804\udd09\ud804\udd27\ud804\udd1f\ud804\udd34 \ud804\udd18\ud804\udd2e\ud804\udd23\ud804\udd34 \ud804\udd0c\ud804\udd28\ud804\udd18\ud804\udd33\ud804\udd20\ud804\udd2c \ud804\udd1a\ud804\udd28\ud804\udd1a\ud804\udd2c\ud804\udd2d \ud804\udd0c\ud804\udd27\ud804\udd23\ud804\udd1a \ud804\udd05\ud804\udd2a\ud804\udd0c\ud804\udd28\ud804\udd16\ud804\udd34\ud804\udd41", + "text": "Chakma đ‘„𑄬𑄇𑄴 đ‘„Ÿđ‘„đ‘„ªđ‘„Œđ‘„´ đ‘„đ‘„¨đ‘„¢đ‘„¨đ‘„đ‘„¨đ‘„£đ‘„¨ đ‘„¥đ‘„§đ‘„ đ‘„ƒđ‘„¨đ‘„Œđ‘„´đ‘„𑄮𑄖𑄴 đ‘„ƒđ‘„³đ‘„ƒ đ‘„ƒđ‘„‡đ‘„´đ‘„‡đ‘„¥đ‘„ đ‘„𑄨đ‘„𑄬𑄭 đ‘„đ‘„§đ‘„đ‘„´đ‘„Ÿđ‘„đ‘„´đ‘… đ‘„–đ‘„¢đ‘„¢đ‘„´ đ‘„ƒđ‘„¬đ‘„˜ đ‘„ƒđ‘„³đ‘„ƒ đ‘„đ‘„ªđ‘„–đ‘„´đ‘„™đ‘„¨ đ‘„ƒđ‘„𑄬; đ‘„¥đ‘„¬đ‘„đ‘„§đ‘„–đ‘„³đ‘„ đ‘„´ đ‘„𑄬𑄇𑄴𑄅đ‘„đ‘„§đ‘„¢đ‘„´ đ‘„ƒđ‘„¬đ‘„‡đ‘„´đ‘„đ‘„§đ‘„đ‘„´ đ‘„ƒđ‘„¢đ‘„¬đ‘„‡đ‘„´ đ‘„đ‘„§đ‘„đ‘„§đ‘„¢đ‘„´ đ‘„›đ‘„³đ‘„¢đ‘„§đ‘„–đ‘„¨ đ‘„‰đ‘„§đ‘„Ÿđ‘„´ đ‘„˜đ‘„®đ‘„£đ‘„´ đ‘„Œđ‘„¨đ‘„˜đ‘„³đ‘„ đ‘„¬ đ‘„𑄨đ‘„𑄬𑄭 đ‘„Œđ‘„§đ‘„£đ‘„ đ‘„…đ‘„ªđ‘„Œđ‘„¨đ‘„–đ‘„´đ‘…", "metadata": { "filetype": "text/plain", "data_source": { @@ -1596,7 +1596,7 @@ { "type": "NarrativeText", "element_id": "87e7fb3e75a3a124c8e4bce8573a5dd1", - "text": "Chayahuita Ya'ipi piyapinpoa' capini noya ninosorocaso' ya'hu\u00ebrin. Ya'ipinpoa' yonquir\u00ebhua'. Noya nicacaso' nitot\u00ebr\u00ebhua'. Napoaton iyanpoa pochin ninosorocaso' ya 'hu\u00ebrin.", + "text": "Chayahuita Ya'ipi piyapinpoa' capini noya ninosorocaso' ya'huĂ«rin. Ya'ipinpoa' yonquirĂ«hua'. Noya nicacaso' nitotĂ«rĂ«hua'. Napoaton iyanpoa pochin ninosorocaso' ya 'huĂ«rin.", "metadata": { "languages": [ "tgl", @@ -1619,7 +1619,7 @@ { "type": "NarrativeText", "element_id": "03ea2a4dd341c6cdd4c3ddd814721290", - "text": "Cherokee (cased) \u13c2\uab76\uaba3 \uab70\uab92\u13fc\uabbb \uab74\uab8e\uaba5\uab95\uab72 \uab74\uab8e\uabaa\uaba3\uab84\uaba3 \uab70\uab84 \uab71\uabb7\uab83\uab7d\uab99 \uab8e\uab72 \uab70\uabb2\uab99\uaba9\uaba7 \uab70\uab84 \uab74\uab92\uab82 \uab72\u13fb\uab8e\uabab\uaba7\uab72. 
\u13be\uab9d\uab79\uab8e\uab93 \uab74\uab85\uab9d\uab7a\uab88\uaba4\uab95\uab79 \uab74\uabb0\uabbf\uab9d\uaba7 \uab95\u13f8\uab85\uabab\uab79 \uab70\uab84 \uab70\uaba3\uab95\uaba6\uabaf\uaba3\uab9d\uaba7 \uab70\uab84 \uab71\uab85\uab9d\uaba7 \uab9f\u13fc\uabbb\uab7d \uab92\uabaa\uab8e\uaba3\uabab\uab8e\uaba5\uab7c\uab79 \uab8e \uaba7\uab8e\uaba3\uab95\uabaf \uab70\uaba3\uab95\uaba9 \uab7c\uaba7.", + "text": "Cherokee (cased) á‚ꭶꮣ ê­°ê®’á¼ê®» ê­´ê®ê®¥ê®•ê­² ê­´ê®ê®ªê®£ê®„ꮣ ꭰꮄ ꭱꮷꮃꭽꮙ ê®ê­² ꭰꮲꮙꮩꮧ ꭰꮄ ꭴꮒꮂ ê­²á»ê®ê®«ê®§ê­². á¾ê®ê­¹ê®ê®“ ê­´ê®…ê®ê­ºê®ˆê®¤ê®•ê­¹ ꭴꮰꮿê®ê®§ ꮕá¸ê®…ꮫꭹ ꭰꮄ ꭰꮣꮕꮦꮯꮣê®ê®§ ꭰꮄ ꭱꮅê®ê®§ ꮟá¼ê®»ê­½ ꮒꮪê®ê®£ê®«ê®ê®¥ê­¼ê­¹ ê® ê®§ê®ê®£ê®•ꮯ ꭰꮣꮕꮩ ꭼꮧ.", "metadata": { "filetype": "text/plain", "data_source": { @@ -1637,7 +1637,7 @@ { "type": "NarrativeText", "element_id": "09009508dba31db1f130bf24d409614e", - "text": "Cherokee (uppercase) \u13c2\u13a6\u13d3 \u13a0\u13c2\u13f4\u13eb \u13a4\u13be\u13d5\u13c5\u13a2 \u13a4\u13be\u13da\u13d3\u13b4\u13d3 \u13a0\u13b4 \u13a1\u13e7\u13b3\u13ad\u13c9 \u13be\u13a2 \u13a0\u13e2\u13c9\u13d9\u13d7 \u13a0\u13b4 \u13a4\u13c2\u13b2 \u13a2\u13f3\u13be\u13db\u13d7\u13a2. \u13be\u13cd\u13a9\u13be\u13c3 \u13a4\u13b5\u13cd\u13aa\u13b8\u13d4\u13c5\u13a9 \u13a4\u13e0\u13ef\u13cd\u13d7 \u13c5\u13f0\u13b5\u13db\u13a9 \u13a0\u13b4 \u13a0\u13d3\u13c5\u13d6\u13df\u13d3\u13cd\u13d7 \u13a0\u13b4 \u13a1\u13b5\u13cd\u13d7 \u13cf\u13f4\u13eb\u13ad \u13c2\u13da\u13be\u13d3\u13db\u13be\u13d5\u13ac\u13a9 \u13be \u13d7\u13be\u13d3\u13c5\u13df \u13a0\u13d3\u13c5\u13d9 \u13ac\u13d7.", + "text": "Cherokee (uppercase) á‚á¦á“ á á‚á´á« á¤á¾á•á…ᢠá¤á¾áá“á´á“ á á´ á¡á§á³á­á‰ á¾á¢ á á¢á‰á™á— á á´ á¤á‚á² á¢á³á¾á›á—á¢. 
á¾áá©á¾áƒ á¤áµááªá¸á”á…á© á¤á á¯áá— á…á°áµá›á© á á´ á á“á…á–áŸá“áá— á á´ á¡áµáá— áá´á«á­ á‚áá¾á“á›á¾á•á¬á© á¾ á—á¾á“á…០á á“á…á™ á¬á—.", "metadata": { "filetype": "text/plain", "data_source": { @@ -1655,7 +1655,7 @@ { "type": "NarrativeText", "element_id": "ca845e694f20fb1947def444cd1f59f9", - "text": "Chickasaw Himmaka' nittakookano hattak yokasht toksalicha'nikat ki'yo. Hattak m\u00f3\u0331makat itt\u00edllawwi b\u00edyyi'kacha nanna m\u00f3\u0331maka\u0331 ittibaachaffa'hitok.", + "text": "Chickasaw Himmaka' nittakookano hattak yokasht toksalicha'nikat ki'yo. Hattak mĂ³̀±makat ittĂ­llawwi bĂ­yyi'kacha nanna mĂ³̀±makà± ittibaachaffa'hitok.", "metadata": { "languages": [ "swa", @@ -1720,7 +1720,7 @@ { "type": "NarrativeText", "element_id": "2dc80f80340d36e85a551642585e592a", - "text": "Chin, Matu Thlangboeih he rhimomna, vanpitna, yalpona hamhmoel ka tawn thlang la cuun la ng\u2019om u. Thlanghing he athae-then paekboe thaina neh yakming thaina moeiboe ka tawn thlang la n\u2019om u dong ah khat neh khat lungvat na neh thloehlan voekhlak u thae ham om.", + "text": "Chin, Matu Thlangboeih he rhimomna, vanpitna, yalpona hamhmoel ka tawn thlang la cuun la ng’om u. 
Thlanghing he athae-then paekboe thaina neh yakming thaina moeiboe ka tawn thlang la n’om u dong ah khat neh khat lungvat na neh thloehlan voekhlak u thae ham om.", "metadata": { "languages": [ "tgl", @@ -1763,7 +1763,7 @@ { "type": "NarrativeText", "element_id": "66e7bb8d8db209646cecea79ecf23f89", - "text": "Chinantec, Chiltepec Lej\u0268\u0308 ni sou tsa lisia\u0331 ija\u0331a sia ikou' ne kojo\u0331 j\u00ef ne juso\u0331 ne jmo' re ju i s\u0268' jmo' n\u00f6 sala\u0331 ne sasno.", + "text": "Chinantec, Chiltepec Lejɨ̀ˆ ni sou tsa lisià± ijà±a sia ikou' ne kojò± jĂ¯ ne jusò± ne jmo' re ju i sɨ' jmo' nö salà± ne sasno.", "metadata": { "languages": [ "hrv" @@ -1784,7 +1784,7 @@ { "type": "NarrativeText", "element_id": "b29e38dc8292efa10880271bbb145f07", - "text": "Chinantec, Ojitl\u00e1n La juu dsa lu si\u00e4 \u2013Dsa k\u00f6 \u00f1i ba dsa, n\u00eda k\u00f6 ni' ba na lu' dsa e dsa t\u00ef \u00e9 li jnia' ro\u00f6'.", + "text": "Chinantec, OjitlĂ¡n La juu dsa lu siä –Dsa kö ñi ba dsa, nĂ­a kö ni' ba na lu' dsa e dsa tĂ¯ Ă© li jnia' roö'.", "metadata": { "languages": [ "fin", @@ -1827,7 +1827,7 @@ { "type": "Title", "element_id": "be604439089a8fedd5abdc4d81187599", - "text": "\u4eba\u4eba\u751f\u800c\u81ea\u7531\uff0c\u5728\u5fd7\u5411\u8ddf\u6743\u5229\u4e0a\u4e00\u5f8b\u5e73\u7b49\u3002\u6e20\u4eec\u8d4b\u6709\u7406\u6027\u8ddf\u826f\u5fc3\uff0c\u5e76\u7406\u5f53\u4ee5\u5f1f\u5144\u4e49\u6c14\u76f8\u5bf9\u5f85\u3002", + "text": "人人生而自由,在志å‘è·Ÿæƒåˆ©ä¸ä¸€å¾‹å¹³ç­‰ă€‚æ¸ ä»¬èµ‹æœ‰ç†æ€§è·Ÿè‰¯å¿ƒï¼Œå¹¶ç†å½“ä»¥å¼Ÿå…„ä¹‰æ°”ç›¸å¯¹å¾…ă€‚", "metadata": { "languages": [ "zho" @@ -1869,7 +1869,7 @@ { "type": "Title", "element_id": "05e53430ff030465078e511efc0de0b2", - "text": "\u4eba\u4eba\u751f\u800c\u81ea\u7531\uff0c\u5728\u5c0a\u4e25\u540c\u6743\u5229\u4e0a\u4e00\u5f8b\u5e73\u7b49\u3002\u4f62\u4e01\u4eba\u8d4b\u6709\u7406\u6027\u540c\u597d\u5fc3\u7530\uff0c\u5e76\u5e94\u4ee5\u5144\u5f1f\u5173\u7cfb\u4e2a\u7cbe\u795e\u76f8\u5bf9\u5f85\u3002", + "text": 
"人人生而自由,在å°ä¸¥åŒæƒåˆ©ä¸ä¸€å¾‹å¹³ç­‰ă€‚ä½¢ä¸äººèµ‹æœ‰ç†æ€§åŒå¥½å¿ƒç”°ï¼Œå¹¶åº”以兄弟关系个精ç¥ç›¸å¯¹å¾…。", "metadata": { "languages": [ "zho", @@ -1912,7 +1912,7 @@ { "type": "Title", "element_id": "549cb1628fe3e0cafb78cd92f08f0554", - "text": "\u4eba\u4eba\u751f\u800c\u81ea\u7531\uff0c\u5728\u5c0a\u4e25\u548c\u6743\u5229\u4e0a\u4e00\u5f8b\u5e73\u7b49\u3002\u4ed6\u4eec\u8d4b\u6709\u7406\u6027\u548c\u826f\u5fc3\uff0c\u5e76\u5e94\u4ee5\u5f1f\u5144\u5173\u7cfb\u7684\u7cbe\u795e\u76f8\u5bf9\u5f85\u3002", + "text": "人人生而自由,在å°ä¸¥å’Œæƒåˆ©ä¸ä¸€å¾‹å¹³ç­‰ă€‚ä»–ä»¬èµ‹æœ‰ç†æ€§å’Œè‰¯å¿ƒï¼Œå¹¶åº”以弟兄关系ç„ç²¾ç¥ç›¸å¯¹å¾…。", "metadata": { "languages": [ "zho", @@ -1955,7 +1955,7 @@ { "type": "Title", "element_id": "bf0df306ed131c2adf4243ded3865e6a", - "text": "\u4eba\u4eba\u751f\u800c\u81ea\u7531\uff0c\u6328\u5c0a\u4e25\u548c\u6743\u5229\u4e0a\u4e00\u522c\u5e73\u7b49\u3002\u4ed6\u4eec\u8d81\u7406\u6027\u548c\u826f\u5fc3\uff0c\u5e76\u5e94\u4ee5\u4e00\u4e2a\u5ea7\u513f\u7684\u7cbe\u795e\u76f8\u5bf9\u5f85\u3002", + "text": "人人生而自由,挨å°ä¸¥å’Œæƒåˆ©ä¸ä¸€åˆ¬å¹³ç­‰ă€‚他们è¶ç†æ€§å’Œè‰¯å¿ƒï¼Œå¹¶åº”以一个座儿ç„ç²¾ç¥ç›¸å¯¹å¾…。", "metadata": { "languages": [ "zho", @@ -1998,7 +1998,7 @@ { "type": "Title", "element_id": "ba1e57780fc9d286c63be7e8e73e3c2e", - "text": "\u4eba\u4eba\u751f\u800c\u81ea\u7531\uff0c\u5728\u5c0a\u4e25\u548c\u6743\u5229\u4e0a\u4e00\u5f8b\u662f\u5e73\u7b49\u7684\u3002\u4ed6\u4eec\u8d4b\u6709\u7406\u6027\u548c\u826f\u5fc3\uff0c\u5e76\u5e94\u4ee5\u5144\u5f1f\u5173\u7cfb\u7684\u7cbe\u795e\u76f8\u4e92\u5bf9\u5f85\u3002", + "text": "人人生而自由,在å°ä¸¥å’Œæƒåˆ©ä¸ä¸€å¾‹æ˜¯å¹³ç­‰ç„ă€‚ä»–ä»¬èµ‹æœ‰ç†æ€§å’Œè‰¯å¿ƒï¼Œå¹¶åº”以兄弟关系ç„ç²¾ç¥ç›¸äº’å¯¹å¾…ă€‚", "metadata": { "languages": [ "zho" @@ -2040,7 +2040,7 @@ { "type": "Title", "element_id": "bdf44eafec897495cf404ac895e41ee3", - "text": 
"\u4eba\u4eba\u751f\u800c\u81ea\u7531\uff0c\u5728\u5c0a\u4e25\u548c\u6743\u5229\u4e4b\u4e0a\u4e00\u5f8b\u5e73\u7b49\u3002\u4ed6\u4eec\u8d4b\u6709\u7406\u6027\u548c\u826f\u5fc3\uff0c\u5e76\u5e94\u4ee5\u54e5\u4eec\u5f1f\u5144\u7684\u7cbe\u795e\u76f8\u5bf9\u5f85\u3002", + "text": "人人生而自由,在å°ä¸¥å’Œæƒåˆ©ä¹‹ä¸ä¸€å¾‹å¹³ç­‰ă€‚ä»–ä»¬èµ‹æœ‰ç†æ€§å’Œè‰¯å¿ƒï¼Œå¹¶åº”以哥们弟兄ç„ç²¾ç¥ç›¸å¯¹å¾…。", "metadata": { "languages": [ "zho" @@ -2082,7 +2082,7 @@ { "type": "Title", "element_id": "a96206ba057e6ac6c0fdb4c87d21a1c9", - "text": "\u5927\u5bb6\u751f\u800c\u81ea\u7531\uff0c\u5728\u5c0a\u4e25\u544a\u6743\u5229\u4e0a\u5934\u4e00\u5f8b\u5e73\u7b49\u3002\u4ed6\u4eec\u8d4b\u6709\u7406\u6027\u544a\u826f\u5fc3\uff0c\u5e76\u8be5\u6d3e\u4ee5\u5144\u5f1f\u5173\u7cfb\u7684\u7cbe\u795e\u76f8\u5bf9\u5f85\u3002", + "text": "大家生而自由,在å°ä¸¥å‘æƒåˆ©ä¸å¤´ä¸€å¾‹å¹³ç­‰ă€‚ä»–ä»¬èµ‹æœ‰ç†æ€§å‘良心,并该派以兄弟关系ç„ç²¾ç¥ç›¸å¯¹å¾…。", "metadata": { "languages": [ "zho", @@ -2125,7 +2125,7 @@ { "type": "Title", "element_id": "c185fc727614ade15888d1e8c9a00c4d", - "text": "\u4eba\u4eba\u751f\u800c\u81ea\u7531,\u5728\u5c0a\u4e25\u548c\u6743\u5229\u4e0a\u4e00\u5f8b\u5e73\u7b49\u3002\u4ed6\u4eec\u8d4b\u6709\u7406\u6027\u548c\u826f\u5fc3,\u5e76\u5e94\u4ee5\u5144\u5f1f\u5173\u7cfb\u7684\u7cbe\u795e\u76f8\u5bf9\u5f85\u3002", + "text": "人人生而自由,在å°ä¸¥å’Œæƒåˆ©ä¸ä¸€å¾‹å¹³ç­‰ă€‚ä»–ä»¬èµ‹æœ‰ç†æ€§å’Œè‰¯å¿ƒ,并应以兄弟关系ç„ç²¾ç¥ç›¸å¯¹å¾…。", "metadata": { "languages": [ "zho", @@ -2168,7 +2168,7 @@ { "type": "Title", "element_id": "9e8a7703ae5139a2870b236cfa54cfd6", - "text": "\u4eba\u4e2a\u9876\u4e2a\u751f\u800c\u81ea\u7531\uff0c\u5728\u5c0a\u4e25\u548c\u6743\u5229\u4e0a\u822c\u513f\u822c\u513f\u5927\u3002\u4ed6\u4eec\u8d81\u7406\u6027\u548c\u826f\u5fc3\uff0c\u5e76\u5e94\u4ee5\u5144\u5f1f\u5173\u7cfb\u7684\u7cbe\u795e\u76f8\u5bf9\u5f85\u3002", + "text": "人个顶个生而自由,在å°ä¸¥å’Œæƒåˆ©ä¸èˆ¬å„¿èˆ¬å„¿å¤§ă€‚他们è¶ç†æ€§å’Œè‰¯å¿ƒï¼Œå¹¶åº”以兄弟关系ç„ç²¾ç¥ç›¸å¯¹å¾…。", "metadata": { "languages": [ "zho" @@ -2210,7 +2210,7 @@ { "type": "Title", "element_id": 
"0e1d6539c2001d2ba8e3188f43b83f7f", - "text": "\u4eba\u4eba\u751f\u800c\u81ea\u7531\uff0c\u5728\u5c0a\u56b4\u548c\u6b0a\u5229\u4e0a\u4e00\u5f8b\u5e73\u7b49\u3002\u4ed6\u5011\u8ce6\u6709\u7406\u6027\u548c\u826f\u5fc3\uff0c\u4e26\u61c9\u4ee5\u5144\u5f1f\u95dc\u4fc2\u7684\u7cbe\u795e\u76f8\u5c0d\u5f85\u3002", + "text": "人人生而自由,在å°å´å’Œæ¬åˆ©ä¸ä¸€å¾‹å¹³ç­‰ă€‚ä»–å€‘è³¦æœ‰ç†æ€§å’Œè‰¯å¿ƒï¼Œä¸¦æ‡‰ä»¥å…„弟關係ç„ç²¾ç¥ç›¸å°å¾…。", "metadata": { "languages": [ "kor", @@ -2253,7 +2253,7 @@ { "type": "Title", "element_id": "48659e28c3b04b69caeaa16aded28f58", - "text": "\u4eba\u4eba\u751f\u800c\u81ea\u7531\uff0c\u5728\u5c0a\u4e25\u5408\u6743\u5229\u4e0a\u4e00\u5f8b\u5e73\u7b49\u3002\u56e0\u8d4b\u6709\u813e\u80c3\u5408\u9053\u884c\uff0c\u5e76\u7740\u4ee5\u5144\u5f1f\u5173\u7cfb\u7684\u7cbe\u795e\u76f8\u5bf9\u5f85\u3002", + "text": "人人生而自由,在å°ä¸¥åˆæƒåˆ©ä¸ä¸€å¾‹å¹³ç­‰ă€‚因赋有脾胃åˆé“行,并ç€ä»¥å…„弟关系ç„ç²¾ç¥ç›¸å¯¹å¾…。", "metadata": { "languages": [ "zho", @@ -2296,7 +2296,7 @@ { "type": "Title", "element_id": "c8272c39e78f413c6902b423da92287d", - "text": "\u4eba\u4eba\u751f\u800c\u81ea\u7531\uff0c\u62c9\u5c0a\u4e25\u8131\u4ed4\u6743\u5229\u4e0a\u4e00\u5f8b\u5e73\u7b49\u3002\u4f0a\u62c9\u6709\u7406\u6027\u8131\u4ed4\u826f\u5fc3\uff0c\u5e76\u5e94\u4ee5\u5144\u5f1f\u5173\u7cfb\u4e2a\u7cbe\u795e\u76f8\u5bf9\u5f85\u3002", + "text": "人人生而自由,拉å°ä¸¥è„±ä»”æƒåˆ©ä¸ä¸€å¾‹å¹³ç­‰ă€‚伿‹‰æœ‰ç†æ€§è„±ä»”良心,并应以兄弟关系个精ç¥ç›¸å¯¹å¾…。", "metadata": { "languages": [ "zho", @@ -2339,7 +2339,7 @@ { "type": "Title", "element_id": "7d70d884e74db8b4302ba0589166c634", - "text": "\u4eba\u4eba\u751f\u800c\u81ea\u7531\uff0c\u5728\u5c0a\u4e25\u548c\u6743\u5229\u4e0a\u4e00\u5f8b\u5e73\u7b49\u3002\u4ed6\u4eec\u8d4b\u6709\u7406\u6027\u548c\u826f\u5fc3\uff0c\u5728\u5f97\u4ee5\u5144\u5f1f\u5173\u7cfb\u7684\u7cbe\u795e\u76f8\u5bf9\u5f85\u3002", + "text": "人人生而自由,在å°ä¸¥å’Œæƒåˆ©ä¸ä¸€å¾‹å¹³ç­‰ă€‚ä»–ä»¬èµ‹æœ‰ç†æ€§å’Œè‰¯å¿ƒï¼Œåœ¨å¾—以兄弟关系ç„ç²¾ç¥ç›¸å¯¹å¾…。", "metadata": { "languages": [ "zho" @@ -2381,7 +2381,7 @@ { "type": "Title", 
"element_id": "932a20508f1be7b3c6fa54b0f9e46f14", - "text": "\u4eba\u4eba\u751f\u800c\u5e73\u7b49\uff0c\u55ba\u5c0a\u4e25\u540c\u57cb\u6743\u5229\u4e0a\u4e00\u5f8b\u5e73\u7b49\u3002\u4f62\u54cb\u6709\u7406\u6027\u540c\u57cb\u826f\u5fc3\uff0c\u800c\u4e14\u5e94\u5f53\u4ee5\u5144\u5f1f\u5173\u7cfb\u5605\u7cbe\u795e\u76f8\u5bf9\u5f85\u3002", + "text": "人人生而平等,喺å°ä¸¥åŒåŸ‹æƒåˆ©ä¸ä¸€å¾‹å¹³ç­‰ă€‚ä½¢å“‹æœ‰ç†æ€§åŒåŸ‹è‰¯å¿ƒï¼Œè€Œä¸”应当以兄弟关系嘅精ç¥ç›¸å¯¹å¾…。", "metadata": { "languages": [ "kor", @@ -2467,7 +2467,7 @@ { "type": "NarrativeText", "element_id": "93683f443b25a57d05bfb3b2ab1533a8", - "text": "Chuvash \u041f\u0443\u0440 \u0445\u0430\u043b\u04d1\u0445 \u0442\u0430 \u0443\u0439\u0440\u04d1\u043c \u043f\u0443\u0440\u04d1\u043d\u043c\u0430 \u043f\u04d7\u0440 \u0442\u0430\u043d \u043f\u0440\u0430\u0432\u0430\u043b\u043b\u04d1. \u04aa\u0430\u043a \u043f\u0440\u0430\u0432\u0430\u043f\u0430 \u0443\u0441\u04d1 \u043a\u0443\u0440\u0441\u0430 \u0432\u04d7\u0441\u0435\u043c \u0445\u04d1\u0439\u0441\u0435\u043d \u043f\u043e\u043b\u0438\u0442\u0438\u043a\u0430 \u0441\u0442\u0430\u0442\u0443\u0441\u043d\u0435 \u0438\u0440\u04d7\u043a\u043b\u04d7\u043d \u0442\u0443\u0441\u0430 \u0445\u0443\u0440\u0430\u04ab\u04ab\u04d7, \u044d\u043a\u043e\u043d\u043e\u043c\u0438\u043a\u0430, \u043e\u0431\u0449\u0435\u0441\u0442\u0432\u043e \u0442\u0430\u0442\u0430 \u043a\u0443\u043b\u044c\u0442\u0443\u0440\u0430 \u0435\u043d\u04d7\u043f\u0435 \u0438\u0440\u04d7\u043a\u043b\u04d7\u043d \u0430\u0442\u0430\u043b\u0430\u043d\u0430\u04ab\u04ab\u04d7. 
\u041f\u0430\u0442\u0448\u0430\u043b\u04d1\u0445\u0441\u0435\u043d \u04ab\u0430\u043a \u043f\u0440\u0430\u0432\u04d1\u043d\u0430 \u0445\u0438\u0441\u0435\u043f\u043b\u0435\u043c\u0435\u043b\u043b\u0435, \u0442\u0435\u0440\u0440\u0438\u0442\u043e\u0440\u0438 \u043f\u04d7\u0440 \u043f\u04d7\u0442\u04d7\u043c\u043b\u04d7\u0445\u04d7\u043d \u043f\u0440\u0438\u043d\u0446\u0438\u043f\u04d7 \u0443\u043d\u043f\u0430 \u0443\u0441\u04d1 \u043a\u0443\u0440\u043c\u0430 \u043f\u04d7\u0440 \u0435\u043d\u043b\u04d7\u043d \u0447\u0430\u0440\u0441\u0430 \u0442\u04d1\u0440\u0430\u043a\u0430\u043d \u0447\u04d1\u0440\u043c\u0430\u0432 \u043f\u0443\u043b\u043c\u0430\u043b\u043b\u0430 \u043c\u0430\u0440.", + "text": "Chuvash ĐŸÑƒÑ€ Ñ…Đ°Đ»Ó‘Ñ… Ñ‚Đ° ÑƒĐ¹Ñ€Ó‘Đ¼ Đ¿ÑƒÑ€Ó‘Đ½Đ¼Đ° Đ¿Ó—Ñ€ Ñ‚Đ°Đ½ Đ¿Ñ€Đ°Đ²Đ°Đ»Đ»Ó‘. ̉ªĐ°Đº Đ¿Ñ€Đ°Đ²Đ°Đ¿Đ° уÑÓ‘ ĐºÑƒÑ€Ñа Đ²Ó—ÑĐµĐ¼ Ñ…Ó‘Đ¹ÑĐµĐ½ Đ¿Đ¾Đ»Đ¸Ñ‚Đ¸ĐºĐ° ÑÑ‚Đ°Ñ‚ÑƒÑĐ½Đµ Đ¸Ñ€Ó—ĐºĐ»Ó—Đ½ туÑа Ñ…ÑƒÑ€Đ°̉«̉«Ó—, ÑĐºĐ¾Đ½Đ¾Đ¼Đ¸ĐºĐ°, Đ¾Đ±Ñ‰ĐµÑÑ‚Đ²Đ¾ Ñ‚Đ°Ñ‚Đ° ĐºÑƒĐ»ÑŒÑ‚ÑƒÑ€Đ° ĐµĐ½Ó—Đ¿Đµ Đ¸Ñ€Ó—ĐºĐ»Ó—Đ½ Đ°Ñ‚Đ°Đ»Đ°Đ½Đ°̉«̉«Ó—. ĐŸĐ°Ñ‚ÑˆĐ°Đ»Ó‘Ñ…ÑĐµĐ½ ̉«Đ°Đº Đ¿Ñ€Đ°Đ²Ó‘Đ½Đ° Ñ…Đ¸ÑĐµĐ¿Đ»ĐµĐ¼ĐµĐ»Đ»Đµ, Ñ‚ĐµÑ€Ñ€Đ¸Ñ‚Đ¾Ñ€Đ¸ Đ¿Ó—Ñ€ Đ¿Ó—Ñ‚Ó—Đ¼Đ»Ó—Ñ…Ó—Đ½ Đ¿Ñ€Đ¸Đ½Ñ†Đ¸Đ¿Ó— ÑƒĐ½Đ¿Đ° уÑÓ‘ ĐºÑƒÑ€Đ¼Đ° Đ¿Ó—Ñ€ ĐµĐ½Đ»Ó—Đ½ Ñ‡Đ°Ñ€Ñа Ñ‚Ó‘Ñ€Đ°ĐºĐ°Đ½ Ñ‡Ó‘Ñ€Đ¼Đ°Đ² Đ¿ÑƒĐ»Đ¼Đ°Đ»Đ»Đ° Đ¼Đ°Ñ€.", "metadata": { "languages": [ "rus", @@ -2511,7 +2511,7 @@ { "type": "NarrativeText", "element_id": "7829c582fafb0be79ca15885a9ffe253", - "text": "Comorian, Maore Wanadamu piya udzalwa huru tsena sawa ha ufahari na ha haki. Na wawo wana \u00e3kili na hisi, esa ilazimu wadzivhinge na wanyao ha fikira ya unanya.", + "text": "Comorian, Maore Wanadamu piya udzalwa huru tsena sawa ha ufahari na ha haki. Na wawo wana Ă£kili na hisi, esa ilazimu wadzivhinge na wanyao ha fikira ya unanya.", "metadata": { "languages": [ "swa" @@ -2553,7 +2553,7 @@ { "type": "NarrativeText", "element_id": "8aea2ff9710269cb8bdfd811de62b8cd", - "text": "Corsican Nascinu tutti l\u2019omi libari \u00e8 pari di dignit\u00e0 \u00e8 di diritti. 
Pussedinu a raghjoni \u00e8 a cuscenza \u00e8 li tocca ad agiscia tr\u00e0 elli di modu fraternu.", + "text": "Corsican Nascinu tutti l’omi libari è pari di dignitĂ  è di diritti. Pussedinu a raghjoni è a cuscenza è li tocca ad agiscia trĂ  elli di modu fraternu.", "metadata": { "languages": [ "ita" @@ -2574,7 +2574,7 @@ { "type": "UncategorizedText", "element_id": "7174e554bd11372c5e339ba08b9881ab", - "text": "Cree, Swampy \u14a5\u14ef\u140c \u1403\u14c2\u14c2\u1424 \u144e\u142f\u14c2\u14a5\u144e\u14f1\u140e\u14c2\u1420 \u1401\u1511 \u14c2\u1455\u140e\u146d\u141f \u14c0\u1422\u1455 \u142f\u152d\u147e\u1423 \u146d\u148b \u1403\u1511 \u1472\u14c7\u1417\u1438\u14a5\u146f\u140e\u14ef\u141f \u146d\u1422\u144c\u14c2\u14a5\u144e\u14f1\u140e\u14c2\u1420 \u14c0\u1422\u1455 \u14a5\u14c2\u146f\u140e\u14ef\u140e\u14c7\u166e \u1401 \u1438\u146d\u144e\u14c7\u14aa\u148b\u1420 \u1472\u146b\u1455\u140c\u14c2\u1455\u14a7\u140e\u14c2\u14c2\u1424 \u14c0\u1422\u1455 \u14a5\u1450\u14c0\u14c2\u148b\u1472\u14c2\u14c2\u1424 \u14c0\u1422\u1455 \u140e\u148b\u1474\u14ef\u1450\u140e\u14c2\u1420 \u146d\u148b \u1403\u1511 \u1472\u14c7\u1417\u1438\u14a5\u1450\u148b\u1420\u166e", + "text": "Cree, Swampy ᒥᓯጠáƒá“‚ᓂᤠá‘á¯á“‚á’¥á‘ᓱáá“‚á  áᔑ á“‚á‘•áᑭ០ᓀá¢á‘• á¯á”­á‘¾á£ á‘­á’‹ áƒá”‘ ᑲᓇá—á¸á’¥á‘¯áᓯ០ᑭá¢á‘Œá“‚á’¥á‘ᓱáá“‚á  á“€á¢á‘• ᒥᓂᑯáᓯáᓇ᙮ á á¸á‘­á‘á“‡á’ªá’‹á  á‘²á‘«á‘•áŒá“‚á‘•á’§áᓂᓂᤠᓀá¢á‘• á’¥á‘ᓀᓂᒋᑲᓂᓂᤠᓀá¢á‘• áᒋᑴᓯá‘áá“‚á  á‘­á’‹ áƒá”‘ ᑲᓇá—á¸á’¥á‘á’‹á á™®", "metadata": { "filetype": "text/plain", "data_source": { @@ -2592,7 +2592,7 @@ { "type": "NarrativeText", "element_id": "952f38639569c0ef489cc6ebb4e809a7", - "text": "Crimean Tatar B\u00fct\u00fcn insanlar serbestlik, menlik ve uquqlarda musaviy ol\u0131p d\u00fcnya\u011fa keleler. Olar aq\u0131l ve vicdan saibidirler ve biri-birilerinen qarda\u015f\u00e7as\u0131na munasebette bulunmal\u0131d\u0131rlar", + "text": "Crimean Tatar BĂ¼tĂ¼n insanlar serbestlik, menlik ve uquqlarda musaviy olıp dĂ¼nyaÄŸa keleler. 
Olar aqıl ve vicdan saibidirler ve biri-birilerinen qardaÅŸĂ§asına munasebette bulunmalıdırlar", "metadata": { "languages": [ "tur" @@ -2613,7 +2613,7 @@ { "type": "NarrativeText", "element_id": "2ed33ba01de24e402f5963e9b2b56328", - "text": "Crioulo, Upper Guinea Tudu pekaduris ta padidu libri i igual na balur suma na diritus. Suma e dadu kapasidadi di pensa, e tene tambi konsiensia, e dibi di trata \u00f1utru suma ermons.", + "text": "Crioulo, Upper Guinea Tudu pekaduris ta padidu libri i igual na balur suma na diritus. Suma e dadu kapasidadi di pensa, e tene tambi konsiensia, e dibi di trata ñutru suma ermons.", "metadata": { "languages": [ "ind", @@ -2635,7 +2635,7 @@ { "type": "NarrativeText", "element_id": "8eb33fe9d9a2a68e6a146718f7b97d24", - "text": "Crioulo, Upper Guinea (008) Tudu pecadur padidu livre, ninguin ca m\u00e1s ninguin, tudu djusta, tudu tem mesmu diritu. Tudu quin qui padidu, tem si ro\u00e7on, cu si manera di pensa. Na metadi di utrus I d\u00edbidi fassi cussas cu ermondadi.", + "text": "Crioulo, Upper Guinea (008) Tudu pecadur padidu livre, ninguin ca mĂ¡s ninguin, tudu djusta, tudu tem mesmu diritu. Tudu quin qui padidu, tem si roçon, cu si manera di pensa. Na metadi di utrus I dĂ­bidi fassi cussas cu ermondadi.", "metadata": { "languages": [ "ita", @@ -2659,7 +2659,7 @@ { "type": "NarrativeText", "element_id": "9a87923b32ddc3eb20ab733920e58198", - "text": "Croatian Sva ljudska bi\u0107a ra\u0111aju se slobodna i jednaka u dostojanstvu i pravima. Ona su obdarena razumom i svije\u0161\u0107u i treba da jedno prema drugome postupaju u duhu bratstva.", + "text": "Croatian Sva ljudska bića raÄ‘aju se slobodna i jednaka u dostojanstvu i pravima. 
Ona su obdarena razumom i sviješću i treba da jedno prema drugome postupaju u duhu bratstva.", "metadata": { "languages": [ "hrv" @@ -2680,7 +2680,7 @@ { "type": "NarrativeText", "element_id": "0666ab63ad7ac65ec7290cb18d27749d", - "text": "Czech V\u0161ichni lid\u00e9 rod\u00ed se svobodn\u00ed a sob\u011b rovn\u00ed co do d\u016fstojnosti a pr\u00e1v. Jsou nad\u00e1ni rozumem a sv\u011bdom\u00edm a maj\u00ed spolu jednat v duchu bratrstv\u00ed.", + "text": "Czech VÅ¡ichni lidĂ© rodĂ­ se svobodnĂ­ a sobÄ› rovnĂ­ co do důstojnosti a prĂ¡v. Jsou nadĂ¡ni rozumem a svÄ›domĂ­m a majĂ­ spolu jednat v duchu bratrstvĂ­.", "metadata": { "languages": [ "ces" @@ -2701,7 +2701,7 @@ { "type": "NarrativeText", "element_id": "cb7b177025447a197e5f95166eeb0282", - "text": "Dagaare, Southern Nengsaala zaa ba nang d\u0254ge so la o menga, ka o ne o taaba zaa sengtaa noba emmo ane y\u025bl\u025bsoobo sobic po\u0254. Ba d\u0254g\u025b\u025b ba zaa ne y\u025bng ane y\u025bl\u025b-iruu k'a da seng ka ba er\u025b y\u025bl\u025b kor\u0254 taa a nga y\u0254\u0254mine.", + "text": "Dagaare, Southern Nengsaala zaa ba nang dÉ”ge so la o menga, ka o ne o taaba zaa sengtaa noba emmo ane yÉ›lÉ›soobo sobic poÉ”. Ba dÉ”gɛɛ ba zaa ne yÉ›ng ane yÉ›lÉ›-iruu k'a da seng ka ba erÉ› yÉ›lÉ› korÉ” taa a nga yɔɔmine.", "metadata": { "languages": [ "tgl", @@ -2723,7 +2723,7 @@ { "type": "NarrativeText", "element_id": "8e66c9e0bff4a344e85d8767b43fd67a", - "text": "Dagbani Sal' la sala. B\u025bhig' be sokam sanimi, din pa la amii. Suhiz\u0254bo be sokam sani; ka namb\u0254\u0263u beni. Suhub\u0254hibo mi bi lan k\u0254\u014b yigunaadam kam sani. Dinzu\u0263u dimb\u0254\u014b\u0254 zaa wuhiya ka dama di tu kamaata ka ti zaa yu tab' hali ni ti puuni.", + "text": "Dagbani Sal' la sala. BÉ›hig' be sokam sanimi, din pa la amii. SuhizÉ”bo be sokam sani; ka nambɔɣu beni. SuhubÉ”hibo mi bi lan kɔŋ yigunaadam kam sani. 
DinzuÉ£u dimbɔŋɔ zaa wuhiya ka dama di tu kamaata ka ti zaa yu tab' hali ni ti puuni.", "metadata": { "languages": [ "swa", @@ -2746,7 +2746,7 @@ { "type": "NarrativeText", "element_id": "b90d9e9d9c05b4f6982b37bbe3c37e9f", - "text": "Dangme Adesahi tsuo \u0254, a b\u0254 m\u025b n\u025b n\u0254 f\u025b\u025b n\u0254 e ye e he, n\u025b n\u0254 tsuaa n\u0254s\u0254 ng\u025b odehe si himi k\u025b he bl\u0254hi a bl\u0254 fa mi. A b\u0254 m\u025b k\u025b n\u0254\u0301 se k\u0254mi k\u025b he nule ju\u025bmi, n\u025b e hia kaa n\u0254 f\u025b\u025b n\u0254 n\u025b e na ny\u025bmi su\u0254mi k\u025b ha n\u0254 tsuaa n\u0254.", + "text": "Dangme Adesahi tsuo É”, a bÉ” mÉ› nÉ› nÉ” fɛɛ nÉ” e ye e he, nÉ› nÉ” tsuaa nÉ”sÉ” ngÉ› odehe si himi kÉ› he blÉ”hi a blÉ” fa mi. A bÉ” mÉ› kÉ› nÉ”̀ se kÉ”mi kÉ› he nule juÉ›mi, nÉ› e hia kaa nÉ” fɛɛ nÉ” nÉ› e na nyÉ›mi suÉ”mi kÉ› ha nÉ” tsuaa nÉ”.", "metadata": { "languages": [ "sqi", @@ -2769,7 +2769,7 @@ { "type": "NarrativeText", "element_id": "334d7844545ea360de232426f24cc228", - "text": "Danish Alle mennesker er f\u00f8dt frie og lige i v\u00e6rdighed og rettigheder. De er udstyret med fornuft og samvittighed, og de b\u00f8r handle mod hverandre i en broderskabets \u00e5nd.", + "text": "Danish Alle mennesker er født frie og lige i værdighed og rettigheder. De er udstyret med fornuft og samvittighed, og de bør handle mod hverandre i en broderskabets Ă¥nd.", "metadata": { "languages": [ "dan" @@ -2790,7 +2790,7 @@ { "type": "NarrativeText", "element_id": "12deb838666ab6083a3dba9696b9fba1", - "text": "Dari \u062a\u0645\u0627\u0645 \u0627\u0641\u0631\u0627\u062f \u0628\u0634\u0631 \u0622\u0632\u0627\u062f \u0628\u0647 \u062f\u0646\u06cc\u0627 \u0645\u06cc\u200c\u0622\u06cc\u0646\u062f \u0648 \u0627\u0632 \u0644\u062d\u0627\u0638 \u062d\u06cc\u062b\u06cc\u062a \u0648 \u062d\u0642\u0648\u0642 \u0628\u0627 \u0647\u0645 \u0628\u0631\u0627\u0628\u0631\u0646\u062f. 
\u0647\u0645\u0647 \u062f\u0627\u0631\u0627\u06cc \u0639\u0642\u0644 \u0648 \u0648\u062c\u062f\u0627\u0646 \u0647\u0633\u062a\u0646\u062f \u0648 \u0628\u0627\u06cc\u062f \u0646\u0633\u0628\u062a \u0628\u0647 \u06cc\u06a9\u062f\u06cc\u06af\u0631 \u0628\u0627 \u0631\u0648\u062d \u0628\u0631\u0627\u062f\u0631\u06cc \u0631\u0641\u062a\u0627\u0631 \u06a9\u0646\u0646\u062f.", + "text": "Dari تمام Ø§ÙØ±Ø§Ø¯ بشر آزاد به دنیا می‌آیند Ùˆ از لحاظ حیثیت Ùˆ حقوق با هم برابرند. همه دارای عقل Ùˆ وجدان هستند Ùˆ باید نسبت به یکدیگر با روح برادری Ø±ÙØªØ§Ø± کنند.", "metadata": { "languages": [ "fas" @@ -2811,7 +2811,7 @@ { "type": "NarrativeText", "element_id": "3551715d069482f6ec4dba0cd2418882", - "text": "Dendi Aduniya kuna n gu ibuna damayo h\u025bi n\u0254 dei-dei nn daama nna n burucinit\u025br\u025b f\u0254, n lasabu nna laakari ya nam nn m\u0254 huro c\u025br\u025b kuna nyanze t\u025br\u025b b\u0254\u014b\u0254\u0254.", + "text": "Dendi Aduniya kuna n gu ibuna damayo hÉ›i nÉ” dei-dei nn daama nna n burucinitÉ›rÉ› fÉ”, n lasabu nna laakari ya nam nn mÉ” huro cÉ›rÉ› kuna nyanze tÉ›rÉ› bɔŋɔɔ.", "metadata": { "languages": [ "swa", @@ -2834,7 +2834,7 @@ { "type": "NarrativeText", "element_id": "ac128efe598097cdb68a483b1ea1f22c", - "text": "Dinka, Northeastern Raan th\u00f6k eben aye dh\u00eb\u00ebth ka lau nh\u00f6m kua th\u00f6\u014b nhiim eyithiic, kua th\u025b\u0308kic, kua ci y\u00ebknhiethku puou, ku bik c\u00eb\u014b ka ke ye mith etik.", + "text": "Dinka, Northeastern Raan thök eben aye dhëëth ka lau nhöm kua thĂ¶Å‹ nhiim eyithiic, kua thÉ›̀ˆkic, kua ci yĂ«knhiethku puou, ku bik cĂ«Å‹ ka ke ye mith etik.", "metadata": { "languages": [ "sqi", @@ -2856,7 +2856,7 @@ { "type": "NarrativeText", "element_id": "377f3dff94511f4733f9a8fa47685f8a", - "text": "Ditammari Oniti ti p\u025bi n\u0256\u025b om\u0254\u0169 yi kpaatri ot\u0254u, k\u025b y\u025b\u0303 oniti ba we, o yi \u0256o nn\u025b f\u025bh\u0254\u0303f\u025b; o m\u0254k\u025bmu m\u025bcii k\u025bh\u00e3 
m\u025by\u025bmm\u025b. Ti t\u00fa n\u025b \u0256o kenyari ti t\u0254b\u025b mb\u025b k\u025b yie mii ba nkwu\u0254 ko ot\u0254u \u0256au.", + "text": "Ditammari Oniti ti pÉ›i nÉ–É› omɔũ yi kpaatri otÉ”u, kÉ› yÉ›̀ƒ oniti ba we, o yi É–o nnÉ› fÉ›hÉ”̀ƒfÉ›; o mÉ”kÉ›mu mÉ›cii kÉ›hĂ£ mÉ›yÉ›mmÉ›. Ti tĂº nÉ› É–o kenyari ti tÉ”bÉ› mbÉ› kÉ› yie mii ba nkwuÉ” ko otÉ”u É–au.", "metadata": { "languages": [ "swa", @@ -2922,7 +2922,7 @@ { "type": "Title", "element_id": "58343bf1070d7f16553f03d984ab9241", - "text": "Dzongkha \u0f60\u0f42\u0fb2\u0f7c\u0f0b\u0f56\u0f0b\u0f58\u0f72\u0f0b\u0f5a\u0f74\u0f0b\u0f42\u0f0b\u0f62\u0f0b\u0f51\u0f63\u0f0b\u0f51\u0f56\u0f44\u0f0b\u0f42\u0f72\u0f0b\u0f50\u0f7c\u0f42\u0f0b\u0f63\u0f66\u0f0b\u0f66\u0f90\u0fb1\u0f7a\u0f66\u0f0b\u0f4f\u0f7a\u0f0b\u0f61\u0f7c\u0f51\u0f54\u0f0b\u0f63\u0f66\u0f0b \u0f42\u0f0b\u0f62\u0f0b\u0f63\u0f74\u0f0b\u0f56\u0f62\u0fa9\u0f72\u0f0b\u0f58\u0f50\u0f7c\u0f44\u0f0b\u0f51\u0f44\u0f0b\u0f50\u0f7c\u0f56\u0f0b\u0f51\u0f56\u0f44\u0f0b\u0f60\u0f51\u0fb2\u0f0b\u0f58\u0f49\u0f58\u0f0b\u0f66\u0fa6\u0f7a\u0f0b\u0f61\u0f7c\u0f51\u0f0d \u0f58\u0f72\u0f0b\u0f5a\u0f74\u0f0b\u0f42\u0f0b\u0f62\u0f0b\u0f66\u0fa8\u0fb2\u0f0b\u0f64\u0f7a\u0f66\u0f0b\u0f51\u0f7c\u0f53\u0f0b\u0f42\u0f7c\u0f0b\u0f56\u0f60\u0f72\u0f0b\u0f58\u0f5a\u0f53\u0f0b\u0f49\u0f72\u0f51\u0f0b\u0f51\u0f44\u0f0b\u0f63\u0fa1\u0f53\u0f58\u0f0b\u0f63\u0f66\u0f0b \u0f42\u0f0b\u0f62\u0f0b\u0f42\u0f72\u0f66\u0f0b\u0f63\u0f71\u0f0b\u0f42\u0f0b\u0f45\u0f72\u0f0b\u0f62\u0f0b\u0f60\u0f56\u0f51\u0f0b\u0f62\u0f74\u0f44\u0f0b \u0f42\u0f45\u0f72\u0f42\u0f0b\u0f42\u0f72\u0f66\u0f0b\u0f42\u0f45\u0f72\u0f42\u0f0b\u0f63\u0f74\u0f0b\u0f66\u0fa4\u0f74\u0f53\u0f0b\u0f46\u0f60\u0f72\u0f0b\u0f60\u0f51\u0f74\u0f0b\u0f64\u0f7a\u0f66\u0f0b\u0f56\u0f66\u0f90\u0fb1\u0f7a\u0f51\u0f0b\u0f50\u0f7c\u0f42\u0f0b\u0f63\u0f66\u0f0b\u0f63\u0f71\u0f0b\u0f60\u0f56\u0f51\u0f0b\u0f51\u0f42\u0f7c\u0f0d", + "text": "Dzongkha འགྲོ་བ་མི་à½à½´à¼‹à½‚་ར་དལ་དབང་གི་à½à½¼à½‚་ལས་སà¾à¾±à½ºà½¦à¼‹à½à½ºà¼‹à½¡à½¼à½‘པ་ལས་ 
ག་ར་ལུ་བརྩི་མà½à½¼à½„་དང་à½à½¼à½–་དབང་འདྲ་མཉམ་སྦེ་ཡོད༠མི་à½à½´à¼‹à½‚་ར་སྨྲ་ཤེས་དོན་གོ་བའི་མà½à½“་ཉིད་དང་ལྡནམ་ལས་ ག་ར་གིས་ལཱ་ག་ཅི་ར་འབད་རུང་ གཅིག་གིས་གཅིག་ལུ་སྤུན་ཆའི་འདུ་ཤེས་བསà¾à¾±à½ºà½‘་à½à½¼à½‚་ལས་ལཱ་འབད་དགོà¼", "metadata": { "filetype": "text/plain", "data_source": { @@ -2982,7 +2982,7 @@ { "type": "NarrativeText", "element_id": "7d5794631564e8ff8a2bf245087903a4", - "text": "Ese Ejja Ojja\u00f1a esejja ojja\u00f1a oyaja yojjaya cuayani quiapame oyajayojjaya quiapame ojja\u00f1a eseya quiapame quia tai jjashauabataiquiani ecueya epejji jayo jjaya ojja\u00f1a jajji ojja\u00f1ajaassi eseyajayojja.", + "text": "Ese Ejja Ojjaña esejja ojjaña oyaja yojjaya cuayani quiapame oyajayojjaya quiapame ojjaña eseya quiapame quia tai jjashauabataiquiani ecueya epejji jayo jjaya ojjaña jajji ojjañajaassi eseyajayojja.", "metadata": { "languages": [ "swa", @@ -3006,7 +3006,7 @@ { "type": "NarrativeText", "element_id": "5f8fd43155bbf931b71069f21ba6a609", - "text": "Esperanto \u0108iuj homoj estas denaske liberaj kaj egalaj la\u016d digno kaj rajtoj. Ili posedas racion kaj konsciencon, kaj devus konduti unu al alia en spirito de frateco.", + "text": "Esperanto Ĉiuj homoj estas denaske liberaj kaj egalaj laÅ­ digno kaj rajtoj. Ili posedas racion kaj konsciencon, kaj devus konduti unu al alia en spirito de frateco.", "metadata": { "languages": [ "slv", @@ -3028,7 +3028,7 @@ { "type": "NarrativeText", "element_id": "e59c6075ee4dbde4faa66c2bdc180029", - "text": "Estonian K\u00f5ik inimesed s\u00fcnnivad vabadena ja v\u00f5rdsetena oma v\u00e4\u00e4rikuselt ja \u00f5igustelt. Neile on antud m\u00f5istus ja s\u00fcdametunnistus ja nende suhtumist \u00fcksteisesse peab kandma vendluse vaim.", + "text": "Estonian Kõik inimesed sĂ¼nnivad vabadena ja võrdsetena oma väärikuselt ja õigustelt. 
Neile on antud mõistus ja sĂ¼dametunnistus ja nende suhtumist Ă¼ksteisesse peab kandma vendluse vaim.", "metadata": { "languages": [ "est" @@ -3049,7 +3049,7 @@ { "type": "NarrativeText", "element_id": "699838930374f69143263bd99d88883e", - "text": "Even \u0411\u044d\u0439\u0438\u043b \u0431\u043e\u043a\u044d\u0442\u0447\u0443\u0440 \u043e\u043c\u044d\u043d \u0445\u0438\u043b\u043a\u0438\u0447 \u043d\u044f\u043d \u0443\u0440\u0443\u043c\u043a\u044d\u0440 \u0431\u0430\u043b\u0434\u0430\u0440\u0438\u0442\u043d\u043e, \u0442\u0435\u043c\u0438 \u043d\u043e\u04a5\u0430\u0440\u0434\u0443\u043a \u044d\u0433\u0434\u044c\u044d\u043d \u04a5\u0438\u2010\u0434\u0430 \u0430\u0447\u0447\u0430. \u0411\u044d\u0439\u0438\u043b \u0431\u04e9\u043a\u044d\u0442\u0447\u0443\u0440 \u043c\u044d\u043d \u0434\u043e\u043b\u0430\u043d \u0430\u043a\u0430\u0433\u0447\u0438\u043c\u0443\u0440 \u0431\u0438\u043d\u043d\u044d\u0442\u044b\u043d.", + "text": "Even Đ‘ÑĐ¹Đ¸Đ» Đ±Đ¾ĐºÑтчур Đ¾Đ¼ÑĐ½ Ñ…Đ¸Đ»ĐºĐ¸Ñ‡ Đ½ÑĐ½ ÑƒÑ€ÑƒĐ¼ĐºÑÑ€ Đ±Đ°Đ»Đ´Đ°Ñ€Đ¸Ñ‚Đ½Đ¾, Ñ‚ĐµĐ¼Đ¸ Đ½Đ¾̉¥Đ°Ñ€Đ´ÑƒĐº ÑĐ³Đ´ÑŒÑĐ½ ̉¥Đ¸â€Đ´Đ° Đ°Ñ‡Ñ‡Đ°. Đ‘ÑĐ¹Đ¸Đ» Đ±Ó©ĐºÑтчур Đ¼ÑĐ½ Đ´Đ¾Đ»Đ°Đ½ Đ°ĐºĐ°Đ³Ñ‡Đ¸Đ¼ÑƒÑ€ Đ±Đ¸Đ½Đ½ÑÑ‚Ñ‹Đ½.", "metadata": { "languages": [ "rus" @@ -3070,7 +3070,7 @@ { "type": "NarrativeText", "element_id": "8164afd787069e69d3a6bed633cfdb21", - "text": "Evenki \u0423\u043f\u043a\u0430\u0442 \u0438\u043b\u044d\u043b \u0442\u044b\u0304\u043d\u043c\u0443\u043a\u0438\u0440\u0434\u0438, \u0443\u0440\u044d\u0304\u043b\u0434\u0438 \u043c\u044d\u0304\u043d\u04a3\u0438 \u0441\u0430\u0304\u0440\u0438\u0447\u0430\u0304\u0434\u0438 \u0431\u0430\u043b\u0434\u044b\u0434\u044f\u0440\u0430. 
\u041d\u0443\u04a3\u0430\u0440\u0442\u044b\u043d \u0434\u044f\u043b\u0438\u0442\u0432\u0438, \u04bb\u0430\u043b\u0434\u044f\u043d\u0434\u044b\u0432\u0438 \u0431\u0438\u0441\u0438, \u043c\u044d\u043c\u044d\u0433\u0438\u0304\u043b\u0432\u044d\u0440 \u0430\u044f\u0440\u0430\u043b\u0434\u044b\u0304\u0434\u044f\u043d\u0430 \u0442\u044d\u0434\u0435\u0442 \u043e\u0304\u043c\u0430\u043c\u0430\u0447\u0438\u0442\u044b\u043d.", + "text": "Evenki Đ£Đ¿ĐºĐ°Ñ‚ илÑĐ» ты̀„Đ½Đ¼ÑƒĐºĐ¸Ñ€Đ´Đ¸, урÑ̀„лди Đ¼Ñ̀„Đ½̉£Đ¸ Ñа̀„Ñ€Đ¸Ñ‡Đ°̀„ди Đ±Đ°Đ»Đ´Ñ‹Đ´ÑÑ€Đ°. Đу̉£Đ°Ñ€Ñ‚Ñ‹Đ½ Đ´ÑĐ»Đ¸Ñ‚Đ²Đ¸, ̉»Đ°Đ»Đ´ÑĐ½Đ´Ñ‹Đ²Đ¸ биÑи, Đ¼ÑĐ¼ÑĐ³Đ¸̀„Đ»Đ²ÑÑ€ аÑÑ€Đ°Đ»Đ´Ñ‹̀„Đ´ÑĐ½Đ° Ñ‚ÑĐ´ĐµÑ‚ Đ¾̀„Đ¼Đ°Đ¼Đ°Ñ‡Đ¸Ñ‚Ñ‹Đ½.", "metadata": { "languages": [ "rus" @@ -3091,7 +3091,7 @@ { "type": "NarrativeText", "element_id": "8ba9631d337f32fb2b5a0049718f7162", - "text": "\u00c9w\u00e9 Wodzi amegbet\u0254wo kata\u0303 abl\u0254\u0256eviwoe eye wodzena bubu kple gomekp\u0254kp\u0254 s\u0254s\u0254e. Susu kple dzitsinya le wo domet\u0254 \u0256esia\u0256e si eyata wodze be woan\u0254 anyi le \u0256ekaw\u0254w\u0254 blibo me.", + "text": "ÉwĂ© Wodzi amegbetÉ”wo katàƒ ablɔɖeviwoe eye wodzena bubu kple gomekpÉ”kpÉ” sÉ”sÉ”e. Susu kple dzitsinya le wo dometÉ” É–esiaÉ–e si eyata wodze be woanÉ” anyi le É–ekawÉ”wÉ” blibo me.", "metadata": { "languages": [ "pol" @@ -3112,7 +3112,7 @@ { "type": "NarrativeText", "element_id": "4dad8f50be71b880b8d1cd3aa2083177", - "text": "Fante W\u0254wo adasa nyina to fahodzi mu, na h\u0254n nyina y\u025b p\u025br w\u0254 enyimnyam na ndzinoa mu. W\u0254maa h\u0254n nyina adwen na tsibowa, na \u0254w\u0254 d\u025b h\u0254n nkitahodzi mu ndzey\u025b\u025b da no edzi d\u025b w\u0254y\u025b enuanom.", + "text": "Fante WÉ”wo adasa nyina to fahodzi mu, na hÉ”n nyina yÉ› pÉ›r wÉ” enyimnyam na ndzinoa mu. 
WÉ”maa hÉ”n nyina adwen na tsibowa, na É”wÉ” dÉ› hÉ”n nkitahodzi mu ndzeyɛɛ da no edzi dÉ› wÉ”yÉ› enuanom.", "metadata": { "languages": [ "swa", @@ -3134,7 +3134,7 @@ { "type": "NarrativeText", "element_id": "f8e68d4590ad494f5d3039e113c1ac46", - "text": "Faroese \u00d8ll menniskju eru f\u00f8dd fr\u00e6ls og j\u00f8vn til vir\u00f0ingar og mannar\u00e6ttindi. Tey hava skil og samvitsku og eiga at fara hv\u00f8rt um anna\u00f0 \u00ed br\u00f3\u00f0uranda.", + "text": "Faroese Ă˜ll menniskju eru fødd fræls og jøvn til virðingar og mannarættindi. Tey hava skil og samvitsku og eiga at fara hvørt um annað Ă­ brĂ³Ă°uranda.", "metadata": { "languages": [ "nor" @@ -3155,7 +3155,7 @@ { "type": "NarrativeText", "element_id": "2f3af719eba5f3392f87df0894e56c42", - "text": "Farsi, Western \u062a\u0645\u0627\u0645 \u0627\u0641\u0631\u0627\u062f \u0628\u0634\u0631 \u0622\u0632\u0627\u062f \u0628\u062f\u0646\u06cc\u0627 \u0645\u06cc\u0627\u06cc\u0646\u062f \u0648 \u0627\u0632 \u0644\u062d\u0627\u0638 \u062d\u06cc\u062b\u06cc\u062a \u0648 \u062d\u0642\u0648\u0642 \u0628\u0627 \u0647\u0645 \u0628\u0631\u0627\u0628\u0631\u0646\u062f. \u0647\u0645\u0647 \u062f\u0627\u0631\u0627\u06cc \u0639\u0642\u0644 \u0648 \u0648\u062c\u062f\u0627\u0646 \u0645\u06cc\u0628\u0627\u0634\u0646\u062f \u0648 \u0628\u0627\u06cc\u062f \u0646\u0633\u0628\u062a \u0628\u06cc\u06a9\u062f\u06cc\u06af\u0631 \u0628\u0627 \u0631\u0648\u062d \u0628\u0631\u0627\u062f\u0631\u06cc \u0631\u0641\u062a\u0627\u0631 \u06a9\u0646\u0646\u062f.", + "text": "Farsi, Western تمام Ø§ÙØ±Ø§Ø¯ بشر آزاد بدنیا میایند Ùˆ از لحاظ حیثیت Ùˆ حقوق با هم برابرند. همه دارای عقل Ùˆ وجدان میباشند Ùˆ باید نسبت بیکدیگر با روح برادری Ø±ÙØªØ§Ø± کنند.", "metadata": { "languages": [ "fas" @@ -3198,7 +3198,7 @@ { "type": "NarrativeText", "element_id": "b70785870cc673f7dcbb24c8464d43fc", - "text": "Finnish Kaikki ihmiset syntyv\u00e4t vapaina ja tasavertaisina arvoltaan ja oikeuksiltaan. 
Heille on annettu j\u00e4rki ja omatunto, ja heid\u00e4n on toimittava toisiaan kohtaan veljeyden hengess\u00e4.", + "text": "Finnish Kaikki ihmiset syntyvät vapaina ja tasavertaisina arvoltaan ja oikeuksiltaan. Heille on annettu järki ja omatunto, ja heidän on toimittava toisiaan kohtaan veljeyden hengessä.", "metadata": { "languages": [ "fin" @@ -3219,7 +3219,7 @@ { "type": "NarrativeText", "element_id": "ecc193afbaf5bf317c868860f5dfc5ec", - "text": "Finnish, Kven Kaikki ihmiset synnyth\u00e4\u00e4n vaphaina, ja heil\u00e4 kaikila oon sama ihmisarvo ja samat ihmisoikkeuet. Het oon saanheet j\u00e4rjen ja omatunnon, ja het pieth\u00e4\u00e4n ell\u00e4\u00e4t toinen toisen kans niin ko veljet keskenh\u00e4\u00e4n.", + "text": "Finnish, Kven Kaikki ihmiset synnythään vaphaina, ja heilä kaikila oon sama ihmisarvo ja samat ihmisoikkeuet. Het oon saanheet järjen ja omatunnon, ja het piethään elläät toinen toisen kans niin ko veljet keskenhään.", "metadata": { "languages": [ "fin" @@ -3240,7 +3240,7 @@ { "type": "NarrativeText", "element_id": "e2a252e076d508cd7e312c25eaf70331", - "text": "Fon Ac\u025b, susu kpo sisi \u0256okpo \u0254 kpo w\u025b gb\u025bt\u0254 bi \u0256o \u0256\u00f2 gb\u025bwiwa t\u0254n hwenu; ye \u0256o linkp\u0254n b\u0254 ayi yet\u0254n m\u025b kpe lo b\u0254 ye \u0256o na do al\u0254 ye\u0256ee \u0256i n\u0254vin\u0254vi \u0256\u0254hun.", + "text": "Fon AcÉ›, susu kpo sisi É–okpo É” kpo wÉ› gbÉ›tÉ” bi É–o É–Ă² gbÉ›wiwa tÉ”n hwenu; ye É–o linkpÉ”n bÉ” ayi yetÉ”n mÉ› kpe lo bÉ” ye É–o na do alÉ” yeÉ–ee É–i nÉ”vinÉ”vi É–É”hun.", "metadata": { "languages": [ "swa", @@ -3262,7 +3262,7 @@ { "type": "NarrativeText", "element_id": "d26195c0225bad321fc98f526b1fb27b", - "text": "French Tous les \u00eatres humains naissent libres et \u00e9gaux en dignit\u00e9 et en droits. 
Ils sont dou\u00e9s de raison et de conscience et doivent agir les uns envers les autres dans un esprit de fraternit\u00e9.", + "text": "French Tous les Ăªtres humains naissent libres et Ă©gaux en dignitĂ© et en droits. Ils sont douĂ©s de raison et de conscience et doivent agir les uns envers les autres dans un esprit de fraternitĂ©.", "metadata": { "languages": [ "fra" @@ -3283,7 +3283,7 @@ { "type": "NarrativeText", "element_id": "f5ce0eb3d199445ab33436a396fca8cb", - "text": "Frisian, Western Alle minsken wurde frij en gelyk yn weardigens en rjochten berne. Hja hawwe ferst\u00e2n en gewisse meikrigen en hearre har foar inoar oer yn in geast fan bruorskip te h\u00e2lden en te dragen.", + "text": "Frisian, Western Alle minsken wurde frij en gelyk yn weardigens en rjochten berne. Hja hawwe ferstĂ¢n en gewisse meikrigen en hearre har foar inoar oer yn in geast fan bruorskip te hĂ¢lden en te dragen.", "metadata": { "languages": [ "nld", @@ -3305,7 +3305,7 @@ { "type": "NarrativeText", "element_id": "0da991393fa9f40d78c4143c3a25b02a", - "text": "Friulian Ducj i oms a nassin libars e compagns come dignit\u00e2t e derits. A an sintiment e cussience e bisugne che si tratin un culaltri come fradis.", + "text": "Friulian Ducj i oms a nassin libars e compagns come dignitĂ¢t e derits. A an sintiment e cussience e bisugne che si tratin un culaltri come fradis.", "metadata": { "languages": [ "ita" @@ -3326,7 +3326,7 @@ { "type": "NarrativeText", "element_id": "216db5a1011f211d9206a47a9e0e4839", - "text": "Fulfulde, Nigerian Innama aadeeji fof poti, ndim\u0257idi e jibinannde to bannge hakkeeji. E\u0253e ngoodi miijo e hakkilantaagal ete e\u0253e poti huufo ndirde e nder \u0253 iynguyummaagu.", + "text": "Fulfulde, Nigerian Innama aadeeji fof poti, ndimÉ—idi e jibinannde to bannge hakkeeji. 
EÉ“e ngoodi miijo e hakkilantaagal ete eÉ“e poti huufo ndirde e nder É“ iynguyummaagu.", "metadata": { "languages": [ "est", @@ -3348,7 +3348,7 @@ { "type": "NarrativeText", "element_id": "d245ad5ed3e4ee8727b8152745ffdba6", - "text": "Fulfulde, Nigerian (2) \u0181i-aadama fuu dimo danyete/jibinte o fotan be koomoye e ne\u0257\u0257aaku be hakkeeji. \u0253e ndokkaa\u0253e hakkiilo ngaandi nden bo \u0253e kuutindiray hakkunde ma\u0253\u0253e nder yi\u0257yi\u0257\u0257irki mbandiraagu.", + "text": "Fulfulde, Nigerian (2) Æi-aadama fuu dimo danyete/jibinte o fotan be koomoye e neÉ—É—aaku be hakkeeji. É“e ndokkaaÉ“e hakkiilo ngaandi nden bo É“e kuutindiray hakkunde maɓɓe nder yiÉ—yiÉ—É—irki mbandiraagu.", "metadata": { "languages": [ "som", @@ -3371,7 +3371,7 @@ { "type": "NarrativeText", "element_id": "71e526a7453aa9c044c6f695d1fe4c78", - "text": "Fur kwa-s\u00ed ny\u00e9tti\u014b baajt\u00f3l\u00e1 kereli n\u00e1s nisila na ta\u0331g\u0268d\u0268\u014b arr\u00e1 ka\u0331\u0268\u014b, Na\u014b-s\u00ed ugola na kilma\u014b\u00e1 arr\u00e1 ka\u0331\u0268\u014b nam\u00e1 in l\u00f3\u014b \u00e1l\u00e1\u014b s\u01d4r\u014b\u00e2-s\u00ed k\u00ed jai\u014ba in k\u00e9\u00e9l n\u00e1 s\u01d4r\u014b\u00e2 suur\ua78c\u00ed\u014b b\u00e2r\u014ba.", + "text": "Fur kwa-sĂ­ nyĂ©ttiÅ‹ baajtĂ³lĂ¡ kereli nĂ¡s nisila na tà±gɨdɨŋ arrĂ¡ kà±É¨Å‹, NaÅ‹-sĂ­ ugola na kilmaÅ‹Ă¡ arrĂ¡ kà±É¨Å‹ namĂ¡ in lĂ³Å‹ Ă¡lĂ¡Å‹ sÇ”rÅ‹Ă¢-sĂ­ kĂ­ jaiÅ‹a in kéél nĂ¡ sÇ”rÅ‹Ă¢ suurêŒĂ­Å‹ bĂ¢rÅ‹a.", "metadata": { "languages": [ "hun" @@ -3392,7 +3392,7 @@ { "type": "NarrativeText", "element_id": "dfd804850bd4d6daab5db7227283c3ab", - "text": "Ga Af\u0254 gb\u0254m\u0254 f\u025b\u025b gb\u0254m\u0254 y\u025b agbojee mli, k\u025b hegb\u025b ko ni dam\u0254 \u014b\u025bl\u025b koome n\u0254. 
Gb\u0254m\u025bi f\u025b\u025b y\u025b jw\u025b\u014bm\u0254 k\u025b henilee, ni no hew\u0254 l\u025b esa ak\u025b am\u025bhe ahi shi y\u025b ny\u025bmi su\u0254m\u0254 mli.", + "text": "Ga AfÉ” gbÉ”mÉ” fɛɛ gbÉ”mÉ” yÉ› agbojee mli, kÉ› hegbÉ› ko ni damÉ” ŋɛlÉ› koome nÉ”. GbÉ”mÉ›i fɛɛ yÉ› jwɛŋmÉ” kÉ› henilee, ni no hewÉ” lÉ› esa akÉ› amÉ›he ahi shi yÉ› nyÉ›mi suÉ”mÉ” mli.", "metadata": { "languages": [ "swa", @@ -3414,7 +3414,7 @@ { "type": "NarrativeText", "element_id": "38140682ca9cf0b5c7f1cf203b331589", - "text": "Gaelic, Irish Saol\u00e1itear na daoine uile saor agus comhionann ina nd\u00ednit agus ina gcearta. T\u00e1 bauidh an r\u00e9as\u00fain agus an choinsiasa acu agus dl\u00edd iad f\u00e9in d'iompar de mheon bhrthreachais i leith a ch\u00e9ile.", + "text": "Gaelic, Irish SaolĂ¡itear na daoine uile saor agus comhionann ina ndĂ­nit agus ina gcearta. TĂ¡ bauidh an rĂ©asĂºin agus an choinsiasa acu agus dlĂ­d iad fĂ©in d'iompar de mheon bhrthreachais i leith a chĂ©ile.", "metadata": { "languages": [ "eng", @@ -3436,7 +3436,7 @@ { "type": "NarrativeText", "element_id": "c74c5c12c1d20c63c0512bda5ec488ee", - "text": "Gaelic, Scottish Tha gach uile dhuine air a bhreth saor agus co-ionnan ann an urram 's ann an c\u00f2irichean. Tha iad air am breth le reusan is le cogais agus mar sin bu ch\u00f2ir dhaibh a bhith be\u00f2 nam measg fhein ann an spiorad br\u00e0thaireil,", + "text": "Gaelic, Scottish Tha gach uile dhuine air a bhreth saor agus co-ionnan ann an urram 's ann an cĂ²irichean. Tha iad air am breth le reusan is le cogais agus mar sin bu chĂ²ir dhaibh a bhith beĂ² nam measg fhein ann an spiorad brĂ thaireil,", "metadata": { "languages": [ "eng" @@ -3457,7 +3457,7 @@ { "type": "NarrativeText", "element_id": "adb7eafcda17469d6dffe53ac281b9e7", - "text": "Gagauz Insannar hepsi duu\u00earlar serbest hem birtak\u0131m kendi k\u0131ymetind\u00e4 hem haklar\u0131nda. 
Onnara verilmi\u015f ak\u0131l hem \u00fcz da l\u00e4az\u0131m biri-birin\u00e4 davrans\u0131nnar karda\u015fl\u0131k ruhuna uygun.", + "text": "Gagauz Insannar hepsi duuĂªrlar serbest hem birtakım kendi kıymetindä hem haklarında. Onnara verilmiÅŸ akıl hem Ă¼z da läazım biri-birinä davransınnar kardaÅŸlık ruhuna uygun.", "metadata": { "languages": [ "tur" @@ -3478,7 +3478,7 @@ { "type": "NarrativeText", "element_id": "d838922d035c343059a70e88f83100af", - "text": "Galician T\u00f3dolos seres humanos nacen libres e iguais en dignidade e dereitos e, dotados como est\u00e1n de raz\u00f3n e conciencia, d\u00edbense comportar fraternalmente uns cos outros.", + "text": "Galician TĂ³dolos seres humanos nacen libres e iguais en dignidade e dereitos e, dotados como estĂ¡n de razĂ³n e conciencia, dĂ­bense comportar fraternalmente uns cos outros.", "metadata": { "languages": [ "spa" @@ -3522,7 +3522,7 @@ { "type": "UncategorizedText", "element_id": "ec7ace2c582cd24ef64d447f5e1e7a08", - "text": "Garifuna Sun g\u00fcrigia nas\u00edruati yuti lun, lidan \u00faarani, lawiwandun\u00ed lib\u00e1gari kai le aubai lab\u00fasienra, gatu gi\u00f1e lanagun lungua buidu hadan l\u00edbegu.", + "text": "Garifuna Sun gĂ¼rigia nasĂ­ruati yuti lun, lidan Ăºarani, lawiwandunĂ­ libĂ¡gari kai le aubai labĂºsienra, gatu giñe lanagun lungua buidu hadan lĂ­begu.", "metadata": { "languages": [ "ind" @@ -3543,7 +3543,7 @@ { "type": "NarrativeText", "element_id": "3db8c991f134adb8e84617cd84e56d43", - "text": "Gen Agbet\u0254wo kpata le jijim\u025ba, \u0256o vosin\u0254n\u0254, nyi gb\u00e8s\u0254\u025b\u0301m\u025b\u0301w\u00f3 le nuj\u0254nunnyi ku go\u0256oejisewo, am\u025bbusewo m\u025b. Tagb\u0254 le woa si, eye w\u0254nawo s\u0254doda woan\u0254n\u0254wo gb\u0254a la nyi n\u0254\u0301visil\u00e9l\u00e9.", + "text": "Gen AgbetÉ”wo kpata le jijimÉ›a, É–o vosinÉ”nÉ”, nyi gbèsɔɛ̀mÉ›̀wĂ³ le nujÉ”nunnyi ku goÉ–oejisewo, amÉ›busewo mÉ›. 
TagbÉ” le woa si, eye wÉ”nawo sÉ”doda woanÉ”nÉ”wo gbÉ”a la nyi nÉ”̀visilĂ©lĂ©.", "metadata": { "languages": [ "swa", @@ -3565,7 +3565,7 @@ { "type": "NarrativeText", "element_id": "cb7127a24ce99f60f18c47121fcbe3cb", - "text": "Georgian \u10e7\u10dd\u10d5\u10d4\u10da\u10d8 \u10d0\u10d3\u10d0\u10db\u10d8\u10d0\u10dc\u10d8 \u10d8\u10d1\u10d0\u10d3\u10d4\u10d1\u10d0 \u10d7\u10d0\u10d5\u10d8\u10e1\u10e3\u10e4\u10d0\u10da\u10d8 \u10d3\u10d0 \u10d7\u10d0\u10dc\u10d0\u10e1\u10ec\u10dd\u10e0\u10d8 \u10d7\u10d0\u10d5\u10d8\u10e1\u10d8 \u10e6\u10d8\u10e0\u10e1\u10d4\u10d1\u10d8\u10d7\u10d0 \u10d3\u10d0 \u10e3\u10e4\u10da\u10d4\u10d1\u10d4\u10d1\u10d8\u10d7. \u10db\u10d0\u10d7 \u10db\u10d8\u10dc\u10d8\u10ed\u10d4\u10d1\u10e3\u10da\u10d8 \u10d0\u10e5\u10d5\u10d7 \u10d2\u10dd\u10dc\u10d4\u10d1\u10d0 \u10d3\u10d0 \u10e1\u10d8\u10dc\u10d3\u10d8\u10e1\u10d8 \u10d3\u10d0 \u10d4\u10e0\u10d7\u10db\u10d0\u10dc\u10d4\u10d7\u10d8\u10e1 \u10db\u10d8\u10db\u10d0\u10e0\u10d7 \u10e3\u10dc\u10d3\u10d0 \u10d4\u10e5\u10ea\u10d4\u10dd\u10d3\u10dc\u10d4\u10dc \u10eb\u10db\u10dd\u10d1\u10d8\u10e1 \u10e1\u10e3\u10da\u10d8\u10e1\u10d9\u10d5\u10d4\u10d7\u10d4\u10d1\u10d8\u10d7.", + "text": "Georgian ყáƒáƒ•ეáƒáƒ˜ áƒáƒ“áƒáƒ›áƒ˜áƒáƒœáƒ˜ იბáƒáƒ“ებრთáƒáƒ•ისუფáƒáƒáƒ˜ დრთáƒáƒœáƒáƒ¡áƒ¬áƒáƒ áƒ˜ თáƒáƒ•ისი ღირსებითრდრუფáƒáƒ”ბებით. მáƒáƒ— მინიჭებუáƒáƒ˜ áƒáƒ¥áƒ•თ გáƒáƒœáƒ”ბრდრსინდისი დრერთმáƒáƒœáƒ”თის მიმáƒáƒ áƒ— უნდრექცეáƒáƒ“ნენ ძმáƒáƒ‘ის სუáƒáƒ˜áƒ¡áƒ™áƒ•ეთებით.", "metadata": { "languages": [ "est" @@ -3586,7 +3586,7 @@ { "type": "NarrativeText", "element_id": "60e95060440c3ac89b53764c839a9658", - "text": "German, Standard (1901) Alle Menschen sind frei und gleich an W\u00fcrde und Rechten geboren. Sie sind mit Vernunft und Gewissen begabt und sollen einander im Geist der Br\u00fcderlichkeit begegnen.", + "text": "German, Standard (1901) Alle Menschen sind frei und gleich an WĂ¼rde und Rechten geboren. 
Sie sind mit Vernunft und Gewissen begabt und sollen einander im Geist der BrĂ¼derlichkeit begegnen.", "metadata": { "languages": [ "deu" @@ -3607,7 +3607,7 @@ { "type": "NarrativeText", "element_id": "d9454188531f323f4587d2668a35dce4", - "text": "German, Standard (1996) Alle Menschen sind frei und gleich an W\u00fcrde und Rechten geboren. Sie sind mit Vernunft und Gewissen begabt und sollen einander im Geist der Br\u00fcderlichkeit begegnen.", + "text": "German, Standard (1996) Alle Menschen sind frei und gleich an WĂ¼rde und Rechten geboren. Sie sind mit Vernunft und Gewissen begabt und sollen einander im Geist der BrĂ¼derlichkeit begegnen.", "metadata": { "languages": [ "deu" @@ -3628,7 +3628,7 @@ { "type": "Title", "element_id": "82bf90db0534cabdc2efe2971f9bb4c6", - "text": "Gilyak \u0421\u0438\u043a \u043d\u0438\u0432\u0433\u0443\u043d \u043a\u0443\u0493\u044b\u0442\u04fb\u0430\u0440\u0442\u0430, \u043f\u02bc\u0438\u043d\u0430\u043c\u0430\u0434 \u044f\u0439\u043c\u0442\u0430 \u0430\u0434\u044f\u0439 \u043f\u0440\u0430\u0432\u043e\u0493\u0438\u0440\u030c \u043f\u02bc\u04ca\u0430\u0444\u049b-\u04ca\u0430\u0444\u049b\u0493\u0438\u0440\u030c \u0441\u0430\u043b\u04fb\u0430\u0442\u0430 \u04ff\u0430\u0442 \u043f\u0430\u043d\u0442\u0430\u0434\u0493\u0443\u043d.", + "text": "Gilyak Đ¡Đ¸Đº Đ½Đ¸Đ²Đ³ÑƒĐ½ ĐºÑƒ̉“Ñ‹Ñ‚Ó»Đ°Ñ€Ñ‚Đ°, Đ¿Ê¼Đ¸Đ½Đ°Đ¼Đ°Đ´ ÑĐ¹Đ¼Ñ‚Đ° адÑĐ¹ Đ¿Ñ€Đ°Đ²Đ¾̉“Đ¸Ñ€̀Œ Đ¿Ê¼ÓĐ°Ñ„̉›-ÓĐ°Ñ„̉›̉“Đ¸Ñ€̀Œ ÑĐ°Đ»Ó»Đ°Ñ‚Đ° Ó¿Đ°Ñ‚ Đ¿Đ°Đ½Ñ‚Đ°Đ´̉“ÑƒĐ½.", "metadata": { "languages": [ "bul", @@ -3650,7 +3650,7 @@ { "type": "NarrativeText", "element_id": "d61fdd2d22e77149dff43d70d62d722f", - "text": "Gonja Bu kurwe dimedi kik\u025b mobe kumu so, n\u025b mobe, eyilikpa, keshe\u014b n\u025b kashinte\u014b ma\u014b k\u0254r eko pey\u025b to. Nyinpela sa dimedi kik\u025b lakal n\u025b mf\u025bra fan\u025b bu chena abarso kelepo so.", + "text": "Gonja Bu kurwe dimedi kikÉ› mobe kumu so, nÉ› mobe, eyilikpa, kesheÅ‹ nÉ› kashinteÅ‹ maÅ‹ kÉ”r eko peyÉ› to. 
Nyinpela sa dimedi kikÉ› lakal nÉ› mfÉ›ra fanÉ› bu chena abarso kelepo so.", "metadata": { "languages": [ "swa", @@ -3673,7 +3673,7 @@ { "type": "NarrativeText", "element_id": "0361867eb371916c85e13fcc3dde7f4b", - "text": "Greek (monotonic) \u038c\u03bb\u03bf\u03b9 \u03bf\u03b9 \u03ac\u03bd\u03b8\u03c1\u03c9\u03c0\u03bf\u03b9 \u03b3\u03b5\u03bd\u03bd\u03b9\u03bf\u03cd\u03bd\u03c4\u03b1\u03b9 \u03b5\u03bb\u03b5\u03cd\u03b8\u03b5\u03c1\u03bf\u03b9 \u03ba\u03b1\u03b9 \u03af\u03c3\u03bf\u03b9 \u03c3\u03c4\u03b7\u03bd \u03b1\u03be\u03b9\u03bf\u03c0\u03c1\u03ad\u03c0\u03b5\u03b9\u03b1 \u03ba\u03b1\u03b9 \u03c4\u03b1 \u03b4\u03b9\u03ba\u03b1\u03b9\u03ce\u03bc\u03b1\u03c4\u03b1. \u0395\u03af\u03bd\u03b1\u03b9 \u03c0\u03c1\u03bf\u03b9\u03ba\u03b9\u03c3\u03bc\u03ad\u03bd\u03bf\u03b9 \u03bc\u03b5 \u03bb\u03bf\u03b3\u03b9\u03ba\u03ae \u03ba\u03b1\u03b9 \u03c3\u03c5\u03bd\u03b5\u03af\u03b4\u03b7\u03c3\u03b7, \u03ba\u03b1\u03b9 \u03bf\u03c6\u03b5\u03af\u03bb\u03bf\u03c5\u03bd \u03bd\u03b1 \u03c3\u03c5\u03bc\u03c0\u03b5\u03c1\u03b9\u03c6\u03ad\u03c1\u03bf\u03bd\u03c4\u03b1\u03b9 \u03bc\u03b5\u03c4\u03b1\u03be\u03cd \u03c4\u03bf\u03c5\u03c2 \u03bc\u03b5 \u03c0\u03bd\u03b5\u03cd\u03bc\u03b1 \u03b1\u03b4\u03b5\u03bb\u03c6\u03bf\u03c3\u03cd\u03bd\u03b7\u03c2.", + "text": "Greek (monotonic) Όλοι οι άνθÏωποι γεννιοÏνται ελεÏθεÏοι και ίσοι στην αξιοπÏέπεια και τα δικαιÏματα. 
Είναι Ï€Ïοικισμένοι με λογική και συνείδηση, και οφείλουν να συμπεÏιφέÏονται Î¼ÎµÏ„Î±Î¾Ï Ï„Î¿Ï…Ï‚ με πνεÏμα αδελφοσÏνης.", "metadata": { "languages": [ "ell" @@ -3694,7 +3694,7 @@ { "type": "NarrativeText", "element_id": "ef30df67b6cbf4e05af379e61e529561", - "text": "Greek (polytonic) \u1f4d\u03bb\u03bf\u03b9 \u03bf\u1f31 \u1f04\u03bd\u03b8\u03c1\u03c9\u03c0\u03bf\u03b9 \u03b3\u03b5\u03bd\u03bd\u03b9\u03bf\u1fe6\u03bd\u03c4\u03b1\u03b9 \u1f10\u03bb\u03b5\u1f7b\u03b8\u03b5\u03c1\u03bf\u03b9 \u03ba\u03b1\u1f76 \u1f34\u03c3\u03bf\u03b9 \u03c3\u03c4\u1f74\u03bd \u1f00\u03be\u03b9\u03bf\u03c0\u03c1\u1f73\u03c0\u03b5\u03b9\u03b1 \u03ba\u03b1\u1f76 \u03c4\u1f70 \u03b4\u03b9\u03ba\u03b1\u03b9\u1f7d\u03bc\u03b1\u03c4\u03b1. \u0395\u1f36\u03bd\u03b1\u03b9 \u03c0\u03c1\u03bf\u03b9\u03ba\u03b9\u03c3\u03bc\u1f73\u03bd\u03bf\u03b9 \u03bc\u1f72 \u03bb\u03bf\u03b3\u03b9\u03ba\u1f74 \u03ba\u03b1\u1f76 \u03c3\u03c5\u03bd\u03b5\u1f77\u03b4\u03b7\u03c3\u03b7, \u03ba\u03b1\u1f76 \u1f40\u03c6\u03b5\u1f77\u03bb\u03bf\u03c5\u03bd \u03bd\u1f70 \u03c3\u03c5\u03bc\u03c0\u03b5\u03c1\u03b9\u03c6\u1f73\u03c1\u03bf\u03bd\u03c4\u03b1\u03b9 \u03bc\u03b5\u03c4\u03b1\u03be\u1f7b \u03c4\u03bf\u03c5\u03c2 \u03bc\u1f72 \u03c0\u03bd\u03b5\u1fe6\u03bc\u03b1 \u1f00\u03b4\u03b5\u03bb\u03c6\u03bf\u03c3\u1f7b\u03bd\u03b7\u03c2.", + "text": "Greek (polytonic) á½Î»Î¿Î¹ οἱ ἄνθÏωποι γεννιοῦνται á¼Î»Îµá½»Î¸ÎµÏοι καὶ ἴσοι στὴν ἀξιοπÏέπεια καὶ τὰ δικαιώματα. 
Εἶναι Ï€Ïοικισμένοι μὲ λογικὴ καὶ συνείδηση, καὶ ὀφείλουν νὰ συμπεÏιφέÏονται μεταξύ τους μὲ πνεῦμα ἀδελφοσύνης.", "metadata": { "languages": [ "ell" @@ -3715,7 +3715,7 @@ { "type": "NarrativeText", "element_id": "a8aaedf9144ce4af4a672873d93945c2", - "text": "Guaran\u00ed, Paraguayan Mayma yvyp\u00f3ra ou ko yvy \u00e1ri i\u00f1apytl\u02bcyre ha ete\u0129cha dignidad ha derecho jeguerek\u00f3pe; ha ikatu rupi oikuaa a\u00f1et\u00e9va ha a\u00f1ete\u02bcyva, ipor\u00e3va ha iva\u00edva, tekotev\u1ebd pehengu\u00e9icha oiko o\u00f1ondiveku\u00e9ra.", + "text": "GuaranĂ­, Paraguayan Mayma yvypĂ³ra ou ko yvy Ă¡ri iñapytlʼyre ha eteÄ©cha dignidad ha derecho jeguerekĂ³pe; ha ikatu rupi oikuaa añetĂ©va ha añeteʼyva, iporĂ£va ha ivaĂ­va, tekotevẽ pehenguĂ©icha oiko oñondivekuĂ©ra.", "metadata": { "languages": [ "slk", @@ -3739,7 +3739,7 @@ { "type": "NarrativeText", "element_id": "1a8dccbb2225da58c6c32c944346a88f", - "text": "Guarayu Opakatu ava yoro\u2019a nda\u2019ei tembigwaigwa oyoyatupri, sekotupri, va\u00ebra, imboeteisara, oikatu ipi\u2019a yemo\u00f1eta, imbaekua, ndiyai yurekorairai \u00f1ep\u00ebi p\u00ebi ambua rese.", + "text": "Guarayu Opakatu ava yoro’a nda’ei tembigwaigwa oyoyatupri, sekotupri, vaĂ«ra, imboeteisara, oikatu ipi’a yemoñeta, imbaekua, ndiyai yurekorairai ñepĂ«i pĂ«i ambua rese.", "metadata": { "languages": [ "ind", @@ -3761,7 +3761,7 @@ { "type": "NarrativeText", "element_id": "2aff799c80d0ba06e344f3b917c6aa5a", - "text": "Gujarati \u0aaa\u0acd\u0ab0\u0aa4\u0abf\u0ab7\u0acd\u0aa0\u0abe \u0a85\u0aa8\u0ac7 \u0a85\u0aa7\u0abf\u0a95\u0abe\u0ab0\u0acb\u0aa8\u0ac0 \u0aa6\u0ac3\u0ab7\u0acd\u0a9f\u0abf\u0a8f \u0ab8\u0ab0\u0acd\u0ab5 \u0aae\u0abe\u0aa8\u0ab5\u0acb \u0a9c\u0aa8\u0acd\u0aae\u0aa5\u0ac0 \u0ab8\u0acd\u0ab5\u0aa4\u0a82\u0aa4\u0acd\u0ab0 \u0a85\u0aa8\u0ac7 \u0ab8\u0aae\u0abe\u0aa8 \u0ab9\u0acb\u0aaf \u0a9b\u0ac7. 
\u0aa4\u0ac7\u0aae\u0aa8\u0abe\u0aae\u0abe\u0a82 \u0ab5\u0abf\u0a9a\u0abe\u0ab0\u0ab6\u0a95\u0acd\u0aa4\u0abf \u0a85\u0aa8\u0ac7 \u0a85\u0a82\u0aa4\u0a83\u0a95\u0ab0\u0aa3 \u0ab9\u0acb\u0aaf \u0a9b\u0ac7 \u0a85\u0aa8\u0ac7 \u0aa4\u0ac7\u0aae\u0aa3\u0ac7 \u0aaa\u0ab0\u0ab8\u0acd\u0aaa\u0ab0 \u0aac\u0a82\u0aa7\u0ac1\u0aa4\u0acd\u0ab5\u0aa8\u0ac0 \u0aad\u0abe\u0ab5\u0aa8\u0abe\u0aa5\u0ac0 \u0ab5\u0ab0\u0acd\u0aa4\u0ab5\u0ac1\u0a82 \u0a9c\u0acb\u0a87\u0a8f.", + "text": "Gujarati પà«àª°àª¤àª¿àª·à«àª àª¾ અને અધિકારોની દૃષà«àªŸàª¿àª સરà«àªµ માનવો જનà«àª®àª¥à«€ સà«àªµàª¤àª‚તà«àª° અને સમાન હોય છે. તેમનામાં વિàªàª¾àª°àª¶àª•à«àª¤àª¿ અને અંતઃકરણ હોય છે અને તેમણે પરસà«àªªàª° બંધà«àª¤à«àªµàª¨à«€ ભાવનાથી વરà«àª¤àªµà«àª‚ જોઇàª.", "metadata": { "languages": [ "guj" @@ -3782,7 +3782,7 @@ { "type": "NarrativeText", "element_id": "7c7879f1335e2e8f7c0ca4a80cb6d9fc", - "text": "Gumuz Dub\ua78caga b\ua78caga metaam metaam alamaam kamaanzaak\ua78coma kas\ua78ce bipok\ua78coga kamad\ua78cab maafuc\ua78cak\ua78cwa haaga bac\ua78caga tso. Ka\u0301b\ua78caga jajanda kwa jala etigafalagash ma\ua78ciiya nago metaagwa eyaal yida-eba bic\ua78caga tso.", + "text": "Gumuz DubêŒaga bêŒaga metaam metaam alamaam kamaanzaakêŒoma kasêŒe bipokêŒoga kamadêŒab maafucêŒakêŒwa haaga bacêŒaga tso. KàbêŒaga jajanda kwa jala etigafalagash maêŒiiya nago metaagwa eyaal yida-eba bicêŒaga tso.", "metadata": { "languages": [ "som" @@ -3803,7 +3803,7 @@ { "type": "NarrativeText", "element_id": "c591dbcd933d69898871c75fc9b2c5b8", - "text": "Haitian Creole French (Kreyol) Tout moun f\u00e8t lib, egal ego pou diyite kou w\u00e8 dwa. Nou gen la rezon ak la konsyans epi nou f\u00e8t pou nou aji youn ak lot ak yon lespri fwat\u00e8nite.", + "text": "Haitian Creole French (Kreyol) Tout moun fèt lib, egal ego pou diyite kou wè dwa. 
Nou gen la rezon ak la konsyans epi nou fèt pou nou aji youn ak lot ak yon lespri fwatènite.", "metadata": { "languages": [ "fra" @@ -3824,7 +3824,7 @@ { "type": "NarrativeText", "element_id": "1caef318c81d61c240de817182b5b56b", - "text": "Haitian Creole French (Popular) Tout moun sou t\u00e8 a f\u00e8t tou lib. Tout gen menm val\u00e8 (nan je lasosyete), tout moun gen menm dwa devan Lalwa. Tout moun f\u00e8t ak yon bonsans, tout f\u00e8t ak yon konsyans epi youn f\u00e8t pou trete l\u00f2t tankou fr\u00e8 ak s\u00e8.", + "text": "Haitian Creole French (Popular) Tout moun sou tè a fèt tou lib. Tout gen menm valè (nan je lasosyete), tout moun gen menm dwa devan Lalwa. Tout moun fèt ak yon bonsans, tout fèt ak yon konsyans epi youn fèt pou trete lĂ²t tankou frè ak sè.", "metadata": { "languages": [ "fra", @@ -3869,7 +3869,7 @@ { "type": "NarrativeText", "element_id": "100bdd3a0bc9a25394f34018b95871fe", - "text": "Hausa Duk \u2018yan\u2019adan ana haihuwarsu ne a matsayin \u2018yantattun \u2018ya\u2019ya, kuma mutuncinsu da haqqoqinsu daidai yake da na kowa. Suna da tunani da cikakken hankali, saboda haka ake son duk mu\u2019amalar da za su yi, ta kasance akwai \u2018yan\u2019uwantaka a tsakani.", + "text": "Hausa Duk ‘yan’adan ana haihuwarsu ne a matsayin ‘yantattun ‘ya’ya, kuma mutuncinsu da haqqoqinsu daidai yake da na kowa. Suna da tunani da cikakken hankali, saboda haka ake son duk mu’amalar da za su yi, ta kasance akwai ‘yan’uwantaka a tsakani.", "metadata": { "languages": [ "ind", @@ -3891,7 +3891,7 @@ { "type": "NarrativeText", "element_id": "19ff46e13339eab9d9fce6566dad6102", - "text": "Hausa (Niger) Su dai \u01b4an\u2010adam, ana haifuwarsu ne duka \u01b4antattu, kuma kowannensu na da mutunci da hakkoki daidai da na kowa. 
Suna da hankali da tunani, saboda haka duk abin da za su aikata wa juna, ya kamata su yi shi a cikin \u01b4an\u2010uwanci.", + "text": "Hausa (Niger) Su dai Æ´anâ€adam, ana haifuwarsu ne duka Æ´antattu, kuma kowannensu na da mutunci da hakkoki daidai da na kowa. Suna da hankali da tunani, saboda haka duk abin da za su aikata wa juna, ya kamata su yi shi a cikin Æ´anâ€uwanci.", "metadata": { "languages": [ "swa", @@ -3913,7 +3913,7 @@ { "type": "NarrativeText", "element_id": "39fce89f870171ba68c60c4aaaeb5509", - "text": "Hausa (Nigeria) Su dai \u2018yan-adam, ana haifuwarsu ne duka \u2018yantattu, kuma kowannensu na da mutunci da hakkoki daidai da na kowa. Suna da hankali da tunani, saboda haka duk abin da za su aikata wa juna, ya kamata su yi shi a cikin \u2018yan-uwanci.", + "text": "Hausa (Nigeria) Su dai ‘yan-adam, ana haifuwarsu ne duka ‘yantattu, kuma kowannensu na da mutunci da hakkoki daidai da na kowa. Suna da hankali da tunani, saboda haka duk abin da za su aikata wa juna, ya kamata su yi shi a cikin ‘yan-uwanci.", "metadata": { "languages": [ "ind", @@ -3935,7 +3935,7 @@ { "type": "NarrativeText", "element_id": "5a888adab3cc776c69ebb4b588db4bfb", - "text": "Hawaiian H\u0101nau k\u016b\u2019oko\u2019a \u2018ia n\u0101 k\u0101naka apau loa, a ua kau like ka hanohano a me n\u0101 pono k\u012bvila ma luna o k\u0101kou p\u0101kahi. Ua ku\u2019u mai ka no\u2019ono\u2019o pono a me ka \u2018ike pono ma luna o k\u0101kou, no laila, e aloha k\u0101kou kekahi i kekahi.", + "text": "Hawaiian HÄnau kū’oko’a ‘ia nÄ kÄnaka apau loa, a ua kau like ka hanohano a me nÄ pono kÄ«vila ma luna o kÄkou pÄkahi. 
Ua ku’u mai ka no’ono’o pono a me ka ‘ike pono ma luna o kÄkou, no laila, e aloha kÄkou kekahi i kekahi.", "metadata": { "languages": [ "swa", @@ -3957,7 +3957,7 @@ { "type": "NarrativeText", "element_id": "9bce25b61dc4faf00ebf9ae5bedd19aa", - "text": "Hebrew \u05db\u05dc \u05d1\u05e0\u05d9 \u05d0\u05d3\u05dd \u05e0\u05d5\u05dc\u05d3\u05d5 \u05d1\u05e0\u05d9 \u05d7\u05d5\u05e8\u05d9\u05df \u05d5\u05e9\u05d5\u05d5\u05d9\u05dd \u05d1\u05e2\u05e8\u05db\u05dd \u05d5\u05d1\u05d6\u05db\u05d5\u05d9\u05d5\u05ea\u05d9\u05d4\u05dd. \u05db\u05d5\u05dc\u05dd \u05d7\u05d5\u05e0\u05e0\u05d5 \u05d1\u05ea\u05d1\u05d5\u05e0\u05d4 \u05d5\u05d1\u05de\u05e6\u05e4\u05d5\u05df, \u05dc\u05e4\u05d9\u05db\u05da \u05d7\u05d5\u05d1\u05d4 \u05e2\u05dc\u05d9\u05d4\u05dd \u05dc\u05e0\u05d4\u05d5\u05d2 \u05d0\u05d9\u05e9 \u05d1\u05e8\u05e2\u05d4\u05d5 \u05d1\u05e8\u05d5\u05d7 \u05e9\u05dc \u05d0\u05d7\u05d5\u05d4.", + "text": "Hebrew כל בני ××“× × ×•×œ×“×• בני חורין ×•×©×•×•×™× ×‘×¢×¨×›× ×•×‘×–×›×•×™×•×ª×™×”×. ×›×•×œ× ×—×•× × ×• בתבונה וב×צפון, ×œ×¤×™×›× ×—×•×‘×” ×¢×œ×™×”× ×œ× ×”×•×’ ×יש ברעהו ברוח של ×חוה.", "metadata": { "languages": [ "heb" @@ -3999,7 +3999,7 @@ { "type": "UncategorizedText", "element_id": "8af5d2f7586f72942fcfc21e4f9f0e7e", - "text": "Hindi \u0938\u092d\u0940 \u092e\u0928\u0941\u0937\u094d\u092f\u094b\u0902 \u0915\u094b \u0917\u094c\u0930\u0935 \u0914\u0930 \u0905\u0927\u093f\u0915\u093e\u0930\u094b\u0902 \u0915\u0947 \u092e\u093e\u092e\u0932\u0947 \u092e\u0947\u0902 \u091c\u0928\u094d\u092e\u091c\u093e\u0924 \u0938\u094d\u0935\u0924\u0928\u094d\u0924\u094d\u0930\u0924\u093e \u0914\u0930 \u0938\u092e\u093e\u0928\u0924\u093e \u092a\u094d\u0930\u093e\u092a\u094d\u0924 \u0939\u0948 \u0964 \u0909\u0928\u094d\u0939\u0947\u0902 \u092c\u0941\u0926\u094d\u0927\u093f \u0914\u0930 \u0905\u0928\u094d\u0924\u0930\u093e\u0924\u094d\u092e\u093e \u0915\u0940 \u0926\u0947\u0928 \u092a\u094d\u0930\u093e\u092a\u094d\u0924 \u0939\u0948 \u0914\u0930 \u092a\u0930\u0938\u094d\u092a\u0930 
\u0909\u0928\u094d\u0939\u0947\u0902 \u092d\u093e\u0908\u091a\u093e\u0930\u0947 \u0915\u0947 \u092d\u093e\u0935 \u0938\u0947 \u092c\u0930\u094d\u0924\u093e\u0935 \u0915\u0930\u0928\u093e \u091a\u093e\u0939\u093f\u090f \u0964", + "text": "Hindi सभी मनà¥à¤·à¥à¤¯à¥‹à¤‚ को गौरव और अधिकारों के मामले में जनà¥à¤®à¤œà¤¾à¤¤ सà¥à¤µà¤¤à¤¨à¥à¤¤à¥à¤°à¤¤à¤¾ और समानता पà¥à¤°à¤¾à¤ªà¥à¤¤ है । उनà¥à¤¹à¥‡à¤‚ बà¥à¤¦à¥à¤§à¤¿ और अनà¥à¤¤à¤°à¤¾à¤¤à¥à¤®à¤¾ की देन पà¥à¤°à¤¾à¤ªà¥à¤¤ है और परसà¥à¤ªà¤° उनà¥à¤¹à¥‡à¤‚ भाईà¤à¤¾à¤°à¥‡ के भाव से बरà¥à¤¤à¤¾à¤µ करना à¤à¤¾à¤¹à¤¿à¤ ।", "metadata": { "languages": [ "hin" @@ -4020,7 +4020,7 @@ { "type": "NarrativeText", "element_id": "b992780a7e7cfec805b61d50bd3cbb25", - "text": "Hindustani, Sarnami Sab djanne aadj\u00e1di aur barabar paidaa bhail\u00e8n, iddjat aur hak m\u00ea. Ohi djanne ke lage sab ke samadj-boedj aur hierdaai hai aur doesare se sab soemmat s\u00e8, djaane-maane ke chaahin.", + "text": "Hindustani, Sarnami Sab djanne aadjĂ¡di aur barabar paidaa bhailèn, iddjat aur hak mĂª. 
Ohi djanne ke lage sab ke samadj-boedj aur hierdaai hai aur doesare se sab soemmat sè, djaane-maane ke chaahin.", "metadata": { "languages": [ "est", @@ -4110,7 +4110,7 @@ { "type": "NarrativeText", "element_id": "4113619dd86b7bf65f70dd31f3155ce1", - "text": "Huastec (San Lu\u00eds Potos\u00ed) Patal an inik ani an uxum u wa'tsinal walkadh abal junun\u00fal kin bats'uw an alwa'tal\u00e1b ani ka pidhan in \u00e9y jant'ini' in tomn\u00e1l; in kwa'al in tsalp\u00e1dh ani in k'ay\u00e1' abal kin k'anidha' in juntal.", + "text": "Huastec (San LuĂ­s PotosĂ­) Patal an inik ani an uxum u wa'tsinal walkadh abal jununĂºl kin bats'uw an alwa'talĂ¡b ani ka pidhan in Ă©y jant'ini' in tomnĂ¡l; in kwa'al in tsalpĂ¡dh ani in k'ayĂ¡' abal kin k'anidha' in juntal.", "metadata": { "languages": [ "ind", @@ -4133,7 +4133,7 @@ { "type": "NarrativeText", "element_id": "cec56f0f701b47b7615015993ec87eaa", - "text": "Huastec (Sierra de Otontepec) Kuentsal nap wah-ch\u00ednal tee ti chabal jayechek-i antip wah-ch\u00ednal, b\u00e1 tam\u00e1 maxak a pulik maxak in exlal, jununul an\u00ed ni chap an\u00ed jaxtam ko-yal kip le-nax\u00edn an\u00ed ki k-ana ti ba.", + "text": "Huastec (Sierra de Otontepec) Kuentsal nap wah-chĂ­nal tee ti chabal jayechek-i antip wah-chĂ­nal, bĂ¡ tamĂ¡ maxak a pulik maxak in exlal, jununul anĂ­ ni chap anĂ­ jaxtam ko-yal kip le-naxĂ­n anĂ­ ki k-ana ti ba.", "metadata": { "languages": [ "ind", @@ -4180,7 +4180,7 @@ { "type": "NarrativeText", "element_id": "68c1e44b4d3af66e1c5cddb5a8861a91", - "text": "Huitoto, Murui Nana ca\u0268 comuillamona dama ca\u0268 abido it\u0268ca\u0268. Ca\u0268 comuillamona j\u0268a\u0268m\u0268e anamo i\u00f1ed\u0268ca\u0268. Nana daje facaiconi it\u0268ca\u0268. Ab\u0268 ui\u00f1uanona comuid\u0268ca\u0268. Dan\u0268 conin\u0268rie ca\u0268 nabairilla.", + "text": "Huitoto, Murui Nana caɨ comuillamona dama caɨ abido itɨcaɨ. Caɨ comuillamona jɨaɨmɨe anamo iñedɨcaɨ. Nana daje facaiconi itɨcaɨ. Abɨ uiñuanona comuidɨcaɨ. 
Danɨ coninɨrie caɨ nabairilla.", "metadata": { "languages": [ "ita", @@ -4202,7 +4202,7 @@ { "type": "NarrativeText", "element_id": "35c2ba2ee3067a7d3d5509a2f11f8123", - "text": "Hungarian Minden. emberi l\u00e9ny szabadon sz\u00fcletik \u00e9s egyenl\u0151 m\u00e9lt\u00f3s\u00e1ga \u00e9s joga van. Az emberek, \u00e9sszel \u00e9s lelkiismerettel b\u00edrv\u00e1n, egym\u00e1ssal szemben testv\u00e9ri szellemben kell hogy viseltessenek.", + "text": "Hungarian Minden. emberi lĂ©ny szabadon szĂ¼letik Ă©s egyenlÅ‘ mĂ©ltĂ³sĂ¡ga Ă©s joga van. Az emberek, Ă©sszel Ă©s lelkiismerettel bĂ­rvĂ¡n, egymĂ¡ssal szemben testvĂ©ri szellemben kell hogy viseltessenek.", "metadata": { "languages": [ "hun" @@ -4246,7 +4246,7 @@ { "type": "NarrativeText", "element_id": "d1120c74094e3c70d2191f6d40987753", - "text": "Icelandic Hver ma\u00f0ur er borinn frj\u00e1ls og jafn \u00f6\u00f0rum a\u00f0 vir\u00f0ingu og r\u00e9ttindum. Menn eru g\u00e6ddir vitsmunum og samvizku, og ber \u00feeim a\u00f0 breyta br\u00f3\u00f0urlega hverjum vi\u00f0 annan.", + "text": "Icelandic Hver maður er borinn frjĂ¡ls og jafn öðrum að virðingu og rĂ©ttindum. Menn eru gæddir vitsmunum og samvizku, og ber Ă¾eim að breyta brĂ³Ă°urlega hverjum við annan.", "metadata": { "languages": [ "nor" @@ -4289,7 +4289,7 @@ { "type": "NarrativeText", "element_id": "c061731c2409f1d04154bcb99040df32", - "text": "Idoma \u0118g\u0119 ni modudu ac\u0119 k\u0119c\u0119 nya b\u0119c\u0119 \u0119hehi aa ,hibi \u0119g\u037b ma ac\u0119 duu jonjil\u0119 ipu koc\u0119gba n\u037bc\u0119 c\u0119gba m\u0119ml\u2019ojonjil\u0119 ipu \u037bdah ni yab\u037b \u037bc\u0119 nya. 
Odudu ac\u0119 kwu \u0452wule ml\u2019ohili otu m\u0119ml\u2019ocai k\u0119la j\u037bc\u0119 \u037bha ni yipu \u037btu \u037bc\u0119 aa, higb\u037b ma \u037bc\u0119 higbo y\u037bda m\u0119ml\u2019 \u037bmpa gunu l\u0119 b\u037bin\u0119 nu ma.", + "text": "Idoma ĘgÄ™ ni modudu acÄ™ kÄ™cÄ™ nya bÄ™cÄ™ Ä™hehi aa ,hibi Ä™gÍ» ma acÄ™ duu jonjilÄ™ ipu kocÄ™gba nÍ»cÄ™ cÄ™gba mÄ™ml’ojonjilÄ™ ipu Í»dah ni yabÍ» Í»cÄ™ nya. Odudu acÄ™ kwu Ñ’wule ml’ohili otu mÄ™ml’ocai kÄ™la jÍ»cÄ™ Í»ha ni yipu Í»tu Í»cÄ™ aa, higbÍ» ma Í»cÄ™ higbo yÍ»da mÄ™ml’ Í»mpa gunu lÄ™ bÍ»inÄ™ nu ma.", "metadata": { "languages": [ "swa" @@ -4310,7 +4310,7 @@ { "type": "NarrativeText", "element_id": "c3dc3590b2338d3585c67664e25eb878", - "text": "Igbo A m\u1ee5r\u1ee5 mmad\u1ee5 nile n'ohere nakwa nha anya ugwu na ikike. E nyere ha uche na mm\u1ee5\u1ecd ime ihe ziri ezi nke na ha kwesiri \u1ecbkpaso ibe ha agwa n'obi nwanne na nwanne.", + "text": "Igbo A mụrụ mmadụ nile n'ohere nakwa nha anya ugwu na ikike. E nyere ha uche na mmụỠime ihe ziri ezi nke na ha kwesiri ịkpaso ibe ha agwa n'obi nwanne na nwanne.", "metadata": { "languages": [ "swa" @@ -4331,7 +4331,7 @@ { "type": "NarrativeText", "element_id": "050a0685e37c5cdf1484af7fb81846c0", - "text": "Ijo, Southeast Kim\u2019 owoumo se, keni bara ki na, pa zimi, ose keni bara kemi. Kim\u2019se ye iroro, mani ikiou nana, enini kim\u2019se dudu tari teme nana weri iyenri.", + "text": "Ijo, Southeast Kim’ owoumo se, keni bara ki na, pa zimi, ose keni bara kemi. 
Kim’se ye iroro, mani ikiou nana, enini kim’se dudu tari teme nana weri iyenri.", "metadata": { "languages": [ "swa", @@ -4419,7 +4419,7 @@ { "type": "NarrativeText", "element_id": "c08152bc9c1cbc1930714b7051e6100a", - "text": "Inuktitut, Eastern Canadian \u1403\u14c5\u152a\u14d5\u14ab\u1466 \u140a\u14c2\u1585\u144e\u1546\u152a\u14d5\u14ab\u1466 \u1403\u14c5\u14da\u1405\u1550\u14aa\u1455 \u1403\u14f1\u14aa\u1550\u14f1\u1550\u15a2\u144e\u1483 \u140a\u14bb\u14aa\u14d7 \u140a\u153e\u1528\u1405\u1583\u144e\u148c\u1483\u15a2\u144e\u1483 \u14c2\u1550\u14f1\u140a\u1591\u14c2\u1483\u146f\u1466 \u140a\u14bb\u14aa\u14d7 \u1431\u152a\u14d0\u14c7\u1403\u144e\u144e\u148d\u1466. \u1403\u14f1\u1583\u1585\u1450\u1581\u144e\u1583\u1550\u144e\u1455\u1405\u1559\u14d5\u1550\u1433\u1466 \u1431\u153e\u152a\u144e\u1583\u1550\u14c2\u1483\u146f\u1466 \u1583\u1455\u1673\u144e\u148c\u1466\u144e\u140a\u1546\u140a\u1583\u1550\u14c2\u1483\u146f\u14ea\u14d7.", + "text": "Inuktitut, Eastern Canadian áƒá“…ᔪᓕᒫᑦ áá“‚á–…á‘ᕆᔪᓕᒫᑦ áƒá“…á“á…á•ᒪᑕ áƒá“±á’ªá•ᓱá•á–¢á‘á’ƒ áᒻᒪᓗ áᔾᔨá…á–ƒá‘ᒌᒃᖢá‘á’ƒ á“‚á•ᓱáᖑᓂᒃᑯᑦ áᒻᒪᓗ á±á”ªá“ᓇáƒá‘á‘á’ᑦ. áƒá“±á–ƒá–…á‘á–á‘á–ƒá•á‘á‘•á…ᕙᓕá•á³á‘¦ á±á”¾á”ªá‘á–ƒá•ᓂᒃᑯᑦ ᖃᑕᙳá‘ᒌᑦá‘áᕆáá–ƒá•ᓂᒃᑯᓪᓗ.", "metadata": { "filetype": "text/plain", "data_source": { @@ -4458,7 +4458,7 @@ { "type": "NarrativeText", "element_id": "6e8030f949832ac1e4d5632bc1a06b48", - "text": "Italian Tutti gli esseri umani nascono liberi ed eguali in dignit\u00e0 e diritti. Essi sono dotati di ragione e di coscienza e devono agire gli uni verso gli altri in spirito di fratellanza.", + "text": "Italian Tutti gli esseri umani nascono liberi ed eguali in dignitĂ  e diritti. 
Essi sono dotati di ragione e di coscienza e devono agire gli uni verso gli altri in spirito di fratellanza.", "metadata": { "languages": [ "ita" @@ -4500,7 +4500,7 @@ { "type": "Title", "element_id": "57bbff46bb89b26b933206afe0fd8904", - "text": "\u3059\u3079\u3066\u306e\u4eba\u9593\u306f\u3001\u751f\u307e\u308c\u306a\u304c\u3089\u306b\u3057\u3066\u81ea\u7531\u3067\u3042\u308a\u3001\u304b\u3064\u3001\u5c0a\u53b3\u3068\u6a29\u5229\u3068\u306b\u3064\u3044\u3066\u5e73\u7b49\u3067\u3042\u308b\u3002\u4eba\u9593\u306f\u3001\u7406\u6027\u3068\u826f\u5fc3\u3068\u3092\u6388\u3051\u3089\u308c\u3066\u304a\u308a\u3001\u4e92\u3044\u306b\u540c\u80de\u306e\u7cbe\u795e\u3092\u3082\u3063\u3066\u884c\u52d5\u3057\u306a\u3051\u308c\u3070\u306a\u3089\u306a\u3044\u3002", + "text": "ă™ă¹ă¦ă®äººé–“ă¯ă€ç”Ÿă¾ă‚ŒăªăŒă‚‰ă«ă—ă¦è‡ªç”±ă§ă‚ă‚ă€ă‹ă¤ă€å°å³ă¨æ¨©åˆ©ă¨ă«ă¤ă„ă¦å¹³ç­‰ă§ă‚ă‚‹ă€‚äººé–“ă¯ă€ç†æ€§ă¨è‰¯å¿ƒă¨ă‚’æˆă‘ă‚‰ă‚Œă¦ăă‚ă€äº’ă„ă«åŒèƒă®ç²¾ç¥ă‚’ă‚‚ă£ă¦è¡Œå‹•ă—ăªă‘ă‚Œă°ăªă‚‰ăªă„。", "metadata": { "languages": [ "jpn" @@ -4542,7 +4542,7 @@ { "type": "Title", "element_id": "11becf872133958b85928710255eb2cc", - "text": "\u3059\u3079\u3066\u306e\u4eba\u9593\u306f\u3001\u751f\u307e\u308c\u306a\u304c\u3089\u306b\u3057\u3066\u81ea\u7531\u3084\u3057\u3001\u304b\u3064\u3001\u5c0a\u53b3\u3068\u6a29\u5229\u3068\u306b\u3064\u3044\u3066\u5e73\u7b49\u3084\u3002\u4eba\u9593\u306f\u3001\u7406\u6027\u3068\u826f\u5fc3\u3068\u3092\u6388\u3051\u3089\u308c\u3066\u304a\u308a\u3001\u4e92\u3044\u306b\u540c\u80de\u306e\u7cbe\u795e\u3092\u3082\u3063\u3066\u884c\u52d5\u3057\u306a\u3002", + "text": "ă™ă¹ă¦ă®äººé–“ă¯ă€ç”Ÿă¾ă‚ŒăªăŒă‚‰ă«ă—ă¦è‡ªç”±ă‚„ă—ă€ă‹ă¤ă€å°å³ă¨æ¨©åˆ©ă¨ă«ă¤ă„ă¦å¹³ç­‰ă‚„ă€‚äººé–“ă¯ă€ç†æ€§ă¨è‰¯å¿ƒă¨ă‚’æˆă‘ă‚‰ă‚Œă¦ăă‚ă€äº’ă„ă«åŒèƒă®ç²¾ç¥ă‚’ă‚‚ă£ă¦è¡Œå‹•ă—ăªă€‚", "metadata": { "languages": [ "jpn" @@ -4584,7 +4584,7 @@ { "type": "Title", "element_id": "491550640c5496ae9b9e41b4c6cc14f0", - "text": 
"\u5168\u90e8\u306e\u4eba\u9593\u306f\u3001\u751f\u307e\u308c\u306a\u304c\u3089\u306b\u3057\u3066\u81ea\u7531\u3067\u3042\u308a\u3001\u304b\u3064\u3001\u5c0a\u53b3\u3068\u6a29\u5229\u3068 \u306b\u3064\u3044\u3066\u5e73\u7b49\u3067\u3042\u308b\u3002\u4eba\u9593\u306f\u3001\u7406\u6027\u3068\u826f\u5fc3\u3068\u3092\u6388\u3051\u3089\u308c\u3066\u304a\u308a\u3001\u4e92\u3044\u306b\u540c \u80de\u306e\u7cbe\u795e\u3092\u3082\u3063\u3066\u884c\u52d5\u3057\u306a\u3051\u308c\u3070\u306a\u3089\u306a\u3044\u3002", + "text": "全部ă®äººé–“ă¯ă€ç”Ÿă¾ă‚ŒăªăŒă‚‰ă«ă—ă¦è‡ªç”±ă§ă‚ă‚ă€ă‹ă¤ă€å°å³ă¨æ¨©åˆ©ă¨ ă«ă¤ă„ă¦å¹³ç­‰ă§ă‚ă‚‹ă€‚äººé–“ă¯ă€ç†æ€§ă¨è‰¯å¿ƒă¨ă‚’æˆă‘ă‚‰ă‚Œă¦ăă‚ă€äº’ă„ă«åŒ èƒă®ç²¾ç¥ă‚’ă‚‚ă£ă¦è¡Œå‹•ă—ăªă‘ă‚Œă°ăªă‚‰ăªă„。", "metadata": { "languages": [ "jpn" @@ -4626,7 +4626,7 @@ { "type": "Title", "element_id": "36abfab21253834165ada6ce4b89b5e6", - "text": "\ua9cb\ua9b1\ua9a7\ua9bc\ua9a4\ua9c0\ua9b2\ua9b8\ua9ae\ua9ba\ua9b4\ua981\ua98f\ua9ad\ua9b2\ua9b6\ua982\ua9ab\ua98f\ua9ba\ua98f\ua9a4\ua9c0\ua99b\ua9b6\ua9a9\ua982\ua9a2\ua9b6\ua98f\ua9ad\ua9a4\ua9c0\ua9a2\ua982\ua9a7\ua9ba\ua9a9\ua982\ua9a0\ua9a7\ua9a0\ua9c0\ua9ad\ua9a4\ua9c0\ua9b2\ua98f\ua9c0\ua9b2\ua98f\ua9c0\ua98f\ua981\ua9a5\ua99d\ua9c9\u200b\ua98f\ua9a7\ua9ba\ua983\ua9a5\ua9b6\ua9a4\ua9ab\ua9b6\ua981\ua994\ua9a4\ua9c0\ua9b2\ua98f\ua9ad\ua9c0\ua9ad\ua9a4\ua9c0\ua98f\ua9ad\ua9c0\ua9a7\ua9b8\ua9b1\ua982\ua9a0\ua98f\ua9b2\ua997\ua9a7\ua9c0\ua9a5\ua9b1\ua9bf\ua9ae\ua9b8\ua981\ua994\ua9a4\ua9c0\ua9b2\ua981\ua992\ua9ba\ua9b4\ua9a4\ua9c0\ua9a4\ua9ba\ua9a9\ua9bc\ua9a9\ua9b6\ua9a0\ua9bf\ua9a4\ua9c0\ua9b1\ua9b6\ua997\ua9b6\ua9ad\ua9a4\ua9c0\ua9b1\ua9b6\ua997\ua9b6\ua9a4\ua9ba\ua98f\ua9a4\ua9c0\ua99b\ua9b6\ua997\ua9b6\ua9ae\ua9ba\ua9b4\ua9b1\ua9b8\ua9a9\ua9a2\ua9b8\ua9ad\ua9b8\ua982\ua9c9\u200b", + "text": 
"꧋ꦱꦧꦼꦤ꧀ꦲꦸꦮꦺꦴê¦ê¦ê¦­ê¦²ê¦¶ê¦‚ꦫê¦ê¦ºê¦ê¦¤ê§€ê¦›ê¦¶ê¦©ê¦‚ꦢꦶê¦ê¦­ê¦¤ê§€ê¦¢ê¦‚ꦧꦺꦩꦂꦠꦧꦠ꧀ꦭꦤ꧀ꦲê¦ê§€ê¦²ê¦ê§€ê¦ê¦ê¦¥ê¦ê§‰â€‹ê¦ê¦§ê¦ºê¦ƒê¦¥ê¦¶ê¦¤ê¦«ê¦¶ê¦ê¦”ꦤ꧀ꦲê¦ê¦­ê§€ê¦­ê¦¤ê§€ê¦ê¦­ê§€ê¦§ê¦¸ê¦±ê¦‚ꦠê¦ê¦²ê¦—ꦧ꧀ꦥꦱꦿꦮꦸê¦ê¦”ꦤ꧀ꦲê¦ê¦’ꦺꦴꦤ꧀ꦤꦺꦩꦼꦩꦶꦠꦿꦤ꧀ꦱꦶꦗꦶꦭꦤ꧀ꦱꦶꦗꦶꦤꦺê¦ê¦¤ê§€ê¦›ê¦¶ê¦—ꦶꦮꦺꦴꦱꦸꦩꦢꦸꦭꦸꦂ꧉​", "metadata": { "filetype": "text/plain", "data_source": { @@ -4665,7 +4665,7 @@ { "type": "NarrativeText", "element_id": "6b54f0a53f2c7bb4545835a761d4654b", - "text": "Jola-Fonyi Bukanak b\u00farom nan kuwolimi kurere kererer di waafaw b\u00farom. Kubabaj poop b\u00fayejet di karampenoor.", + "text": "Jola-Fonyi Bukanak bĂºrom nan kuwolimi kurere kererer di waafaw bĂºrom. Kubabaj poop bĂºyejet di karampenoor.", "metadata": { "languages": [ "ind", @@ -4687,7 +4687,7 @@ { "type": "NarrativeText", "element_id": "b2c33dfdb2855a8786e1145a6dbbedc2", - "text": "Jula W\u00f3lo\u2019 l\u00e1, h\u00e1damaden\u2019 b\u025b\u025b ye h\u0254r\u0254n ye, b\u025b\u025b k\u00e1 k\u00e1n l\u00e0nbe n\u00ed h\u00e1k\u025byaw l\u00e1. M\u0254g\u0254 b\u025b\u025b ye h\u00e1kilitigi ye, b\u025b\u025b ye h\u00e1kilima ye ; \u00f2 l\u00e0, \u00f9 k\u00e1 k\u00e1n k\u00e0 \u0272g\u0254n m\u00edna n\u00ed b\u00e1denya ye.", + "text": "Jula WĂ³lo’ lĂ¡, hĂ¡damaden’ bɛɛ ye hÉ”rÉ”n ye, bɛɛ kĂ¡ kĂ¡n lĂ nbe nĂ­ hĂ¡kÉ›yaw lĂ¡. MÉ”gÉ” bɛɛ ye hĂ¡kilitigi ye, bɛɛ ye hĂ¡kilima ye ; Ă² lĂ , Ă¹ kĂ¡ kĂ¡n kĂ  ɲgÉ”n mĂ­na nĂ­ bĂ¡denya ye.", "metadata": { "languages": [ "hun", @@ -4732,7 +4732,7 @@ { "type": "NarrativeText", "element_id": "03b0bbddb1137224b43b690dfcc5b506", - "text": "Kabardian \u0426\u04cf\u044b\u0445\u0443 \u043f\u0441\u043e\u0440\u0438 \u0449\u0445\u044c\u044d\u0445\u0443\u0438\u0442\u0443, \u044f \u0449\u04cf\u044b\u0445\u044c\u044b\u043c\u0440\u044d \u044f \u0445\u0443\u044d\u0444\u0430\u0449\u044d\u0445\u044d\u043c\u0440\u044d\u043a\u04cf\u044d \u0437\u044d\u0445\u0443\u044d\u0434\u044d\u0443 \u043a\u044a\u0430\u043b\u044a\u0445\u0443\u0440. 
\u0410\u043a\u044a\u044b\u043b\u0440\u044d \u0437\u044d\u0445\u044d\u0449\u04cf\u044b\u043a\u04cf \u0433\u044a\u0443\u0430\u0437\u044d\u0440\u044d \u044f\u04cf\u044d\u0449\u0438, \u0437\u044b\u0440 \u0437\u044b\u043c \u0437\u044d\u043a\u044a\u0443\u044d\u0448 \u0437\u044d\u0445\u0430\u0449\u0406\u044d \u044f\u043a\u0443 \u0434\u044d\u043b\u044a\u0443 \u0437\u044d\u0445\u0443\u0449\u044b\u0442\u044b\u043d \u0445\u0443\u0435\u0439\u0445\u044d\u0449.", + "text": "Kabardian ЦÓыху Đ¿ÑĐ¾Ñ€Đ¸ щхьÑÑ…ÑƒĐ¸Ñ‚Ñƒ, Ñ Ñ‰ÓÑ‹Ñ…ÑŒÑ‹Đ¼Ñ€Ñ Ñ Ñ…ÑƒÑÑ„Đ°Ñ‰ÑÑ…ÑĐ¼Ñ€ÑĐºÓÑ Đ·ÑхуÑĐ´Ñу ĐºÑалÑхур. ĐĐºÑÑ‹Đ»Ñ€Ñ Đ·ÑÑ…ÑщÓÑ‹ĐºÓ Đ³ÑÑƒĐ°Đ·ÑÑ€Ñ ÑÓÑÑ‰Đ¸, Đ·Ñ‹Ñ€ Đ·Ñ‹Đ¼ Đ·ÑĐºÑуÑш Đ·ÑÑ…Đ°Ñ‰Đ†Ñ ÑĐºÑƒ Đ´ÑĐ»Ñу Đ·ÑÑ…ÑƒÑ‰Ñ‹Ñ‚Ñ‹Đ½ Ñ…ÑƒĐµĐ¹Ñ…Ñщ.", "metadata": { "languages": [ "rus" @@ -4753,7 +4753,7 @@ { "type": "NarrativeText", "element_id": "5da5e2f597a0e6fce26a5359c72395b3", - "text": "Kabiy\u00e9 Pal\u028al\u028a\u028a \u025byaaa n\u025b pa-t\u0269 y\u0254\u0254 w\u025b\u028a kpaagbaa n\u025b p\u025bw\u025b\u025b k\u0269ma\u014b wala \u025bs\u0269ndaa. Pal\u028al\u028a\u028a-w\u025b n\u025b p\u0254-l\u0254\u014b n\u025b pa-ma\u0263z\u0269m; mb\u028a yekina n\u025b p\u0254s\u0254\u0254l\u0269 \u0256ama se p\u025bk\u025b \u025byaa pa-t\u0269\u014bg\u025b.", + "text": "KabiyĂ© PalÊlÊÊ É›yaaa nÉ› pa-tÉ© yɔɔ wÉ›Ê kpaagbaa nÉ› pÉ›wɛɛ kÉ©maÅ‹ wala É›sÉ©ndaa. PalÊlÊÊ-wÉ› nÉ› pÉ”-lɔŋ nÉ› pa-maÉ£zÉ©m; mbÊ yekina nÉ› pÉ”sɔɔlÉ© É–ama se pÉ›kÉ› É›yaa pa-tɩŋgÉ›.", "metadata": { "languages": [ "tgl" @@ -4774,7 +4774,7 @@ { "type": "NarrativeText", "element_id": "b1298a59ae52d3a285db4b52acce1f32", - "text": "Kabuverdianu Tudo ser humano na \u00eas mundo nac\u00ea libri e igual na s\u00ea dignidade e na s\u00eas dr\u00eato. Na s\u00eas razon e na s\u00eas conc\u00e9n\u00e7a, tudo arguem deb\u00ea porc\u00ead\u00ea pa co tudo guenti na sprito di fraternidadi.", + "text": "Kabuverdianu Tudo ser humano na Ăªs mundo nacĂª libri e igual na sĂª dignidade e na sĂªs drĂªto. 
Na sĂªs razon e na sĂªs concĂ©nça, tudo arguem debĂª porcĂªdĂª pa co tudo guenti na sprito di fraternidadi.", "metadata": { "languages": [ "por" @@ -4816,7 +4816,7 @@ { "type": "NarrativeText", "element_id": "f6f8a776d36f4db6ffdd50e83fee6488", - "text": "Kannada \u0c8e\u0cb2\u0ccd\u0cb2\u0cbe \u0cae\u0cbe\u0ca8\u0cb5\u0cb0\u0cc2 \u0cb8\u0ccd\u0cb5\u0ca4\u0c82\u0ca4\u0ccd\u0cb0\u0cb0\u0cbe\u0c97\u0cbf\u0caf\u0cc7 \u0c9c\u0ca8\u0cbf\u0cb8\u0cbf\u0ca6\u0ccd\u0ca6\u0cbe\u0cb0\u0cc6. \u0cb9\u0cbe\u0c97\u0cc2 \u0c98\u0ca8\u0ca4\u0cc6 \u0cae\u0ca4\u0ccd\u0ca4\u0cc1 \u0cb9\u0c95\u0ccd\u0c95\u0cc1\u0c97\u0cb3\u0cb2\u0ccd\u0cb2\u0cbf \u0cb8\u0cae\u0cbe\u0ca8\u0cb0\u0cbe\u0c97\u0cbf\u0ca6\u0ccd\u0ca6\u0cbe\u0cb0\u0cc6. \u0cb5\u0cbf\u0cb5\u0cc7\u0c95 \u0cae\u0ca4\u0ccd\u0ca4\u0cc1 \u0c85\u0c82\u0ca4\u0c83\u0c95\u0cb0\u0ca3\u0c97\u0cb3\u0ca8\u0ccd\u0ca8\u0cc1 \u0caa\u0ca1\u0cc6\u0ca6\u0cb5\u0cb0\u0cbe\u0ca6\u0ccd\u0ca6\u0cb0\u0cbf\u0c82\u0ca6 \u0c85\u0cb5\u0cb0\u0cc1 \u0caa\u0cb0\u0cb8\u0ccd\u0caa\u0cb0 \u0cb8\u0cb9\u0ccb\u0ca6\u0cb0 \u0cad\u0cbe\u0cb5\u0ca6\u0cbf\u0c82\u0ca6 \u0cb5\u0cb0\u0ccd\u0ca4\u0cbf\u0cb8\u0cac\u0cc7\u0c95\u0cc1.", + "text": "Kannada à²à²²à³à²²à²¾ ಮಾನವರೂ ಸà³à²µà²¤à²‚ತà³à²°à²°à²¾à²—ಿಯೇ ಜನಿಸಿದà³à²¦à²¾à²°à³†. ಹಾಗೂ ಘನತೆ ಮತà³à²¤à³ ಹಕà³à²•à³à²—ಳಲà³à²²à²¿ ಸಮಾನರಾಗಿದà³à²¦à²¾à²°à³†. ವಿವೇಕ ಮತà³à²¤à³ ಅಂತಃಕರಣಗಳನà³à²¨à³ ಪಡೆದವರಾದà³à²¦à²°à²¿à²‚ದ ಅವರೠಪರಸà³à²ªà²° ಸಹೋದರ ಭಾವದಿಂದ ವರà³à²¤à²¿à²¸à²¬à³‡à²•à³.", "metadata": { "languages": [ "kan" @@ -4837,7 +4837,7 @@ { "type": "NarrativeText", "element_id": "2600735e35ce8a6dc8243d2269bbeee5", - "text": "Kanuri, Central Adamgana woso kambe katambo ye daraja-a hakkiwa-ason kalkalye. Hankal-a nazaru-asoro k\u0259z\u0259pk\u0259 ye suro hal n\u0259mharamiben kamazasoga letaiyin ye.", + "text": "Kanuri, Central Adamgana woso kambe katambo ye daraja-a hakkiwa-ason kalkalye. 
Hankal-a nazaru-asoro kÉ™zÉ™pkÉ™ ye suro hal nÉ™mharamiben kamazasoga letaiyin ye.", "metadata": { "languages": [ "swa", @@ -4884,7 +4884,7 @@ { "type": "NarrativeText", "element_id": "2e5fe352907c2d71abf3a0283032775f", - "text": "Kaqchikel, Central Konojel ri winaqi' kan kalaxib'en pe ri kolotaj\u00efk, ri junan kiq'ij, ri junan kejqalen, junan kich'ojib'al pa kik'aslen, xa achi'el k'a ri kik'ojlen, ri kinojib'al kichajin xa tik'amun k'a chi nimal\u00e4j konojel xtikajo' ki'.", + "text": "Kaqchikel, Central Konojel ri winaqi' kan kalaxib'en pe ri kolotajĂ¯k, ri junan kiq'ij, ri junan kejqalen, junan kich'ojib'al pa kik'aslen, xa achi'el k'a ri kik'ojlen, ri kinojib'al kichajin xa tik'amun k'a chi nimaläj konojel xtikajo' ki'.", "metadata": { "languages": [ "slv", @@ -4907,7 +4907,7 @@ { "type": "NarrativeText", "element_id": "23d27d0652af0739dbaa674e88fc9ae4", - "text": "Karakalpak \u04b2\u04d9\u043c\u043c\u0435 \u0430\u0434\u0430\u043c\u043b\u0430\u0440 \u04e9\u0437 \u049b\u04d9\u0434\u0438\u0440-\u049b\u044b\u043c\u0431\u0430\u0442\u044b \u0436\u04d9\u043d\u0435 \u04b3\u0443\u049b\u044b\u049b\u043b\u0430\u0440\u044b\u043d\u0434\u0430 \u0435\u0440\u043a\u0438\u043d \u04b3\u04d9\u043c \u0442\u0435\u04a3 \u0431\u043e\u043b\u044b\u043f \u0442\u0443\u045e\u044b\u043b\u0430\u0434\u044b. \u041e\u043b\u0430\u0440\u0493\u0430 \u0430\u049b\u044b\u043b \u04b3\u04d9\u043c \u04b3\u04af\u0436\u0434\u0430\u043d \u0431\u0435\u0440\u0438\u043b\u0433\u0435\u043d \u0431\u043e\u043b\u044b\u043f, \u0431\u0438\u0440-\u0431\u0438\u0440\u0438\u043d\u0435 \u0442\u0443\u045e\u044b\u0441\u049b\u0430\u043d\u043b\u044b\u049b \u0440\u0443\u045e\u0445\u044b\u043d\u0434\u0430\u0493\u044b \u049b\u0430\u0442\u043d\u0430\u0441\u0442\u0430 \u0431\u043e\u043b\u044b\u045e\u044b \u0442\u0438\u0439\u0438\u0441.", + "text": "Karakalpak ̉²Ó™Đ¼Đ¼Đµ Đ°Đ´Đ°Đ¼Đ»Đ°Ñ€ Ó©Đ· ̉›Ó™Đ´Đ¸Ñ€-̉›Ñ‹Đ¼Đ±Đ°Ñ‚Ñ‹ Đ¶Ó™Đ½Đµ ̉³Ñƒ̉›Ñ‹̉›Đ»Đ°Ñ€Ñ‹Đ½Đ´Đ° ĐµÑ€ĐºĐ¸Đ½ ̉³Ó™Đ¼ Ñ‚Đµ̉£ Đ±Đ¾Đ»Ñ‹Đ¿ туÑÑ‹Đ»Đ°Đ´Ñ‹. 
ĐĐ»Đ°Ñ€̉“а а̉›Ñ‹Đ» ̉³Ó™Đ¼ ̉³̉¯Đ¶Đ´Đ°Đ½ Đ±ĐµÑ€Đ¸Đ»Đ³ĐµĐ½ Đ±Đ¾Đ»Ñ‹Đ¿, Đ±Đ¸Ñ€-Đ±Đ¸Ñ€Đ¸Đ½Đµ туÑÑ‹Ñ̉›Đ°Đ½Đ»Ñ‹̉› руÑÑ…Ñ‹Đ½Đ´Đ°̉“Ñ‹ ̉›Đ°Ñ‚Đ½Đ°ÑÑ‚Đ° Đ±Đ¾Đ»Ñ‹ÑÑ‹ Ñ‚Đ¸Đ¹Đ¸Ñ.", "metadata": { "languages": [ "rus" @@ -4928,7 +4928,7 @@ { "type": "NarrativeText", "element_id": "c6f580433e84639a19b178da5dc4b3a2", - "text": "Karelian Kai rahvas roittahes v\u00e4llinny da taza-arvozinnu omas arvos da oigevuksis. Jogahizele heis on annettu mieli da omatundo da heil v\u00e4lt\u00e4m\u00e4tt\u00e4h pid\u00e4y olla kesken\u00e4h, kui vellil.", + "text": "Karelian Kai rahvas roittahes vällinny da taza-arvozinnu omas arvos da oigevuksis. Jogahizele heis on annettu mieli da omatundo da heil vältämättäh pidäy olla keskenäh, kui vellil.", "metadata": { "languages": [ "est", @@ -4950,7 +4950,7 @@ { "type": "NarrativeText", "element_id": "87e368f61c4a1ba6e0a5743d4d2d41b2", - "text": "Kasem Ba loge n\u0254\u0254na maama se ba taa ye bedwe mo ba \u014bwea de ba chega seini, ye fefeo teira k\u0254taa. W\u025b p\u025b ba swa de bobo\u014ba mo se ba taa ye nubiu daane ye ba jege da \u014bwa\u014ba.", + "text": "Kasem Ba loge nɔɔna maama se ba taa ye bedwe mo ba Å‹wea de ba chega seini, ye fefeo teira kÉ”taa. WÉ› pÉ› ba swa de boboÅ‹a mo se ba taa ye nubiu daane ye ba jege da Å‹waÅ‹a.", "metadata": { "languages": [ "som", @@ -4972,7 +4972,7 @@ { "type": "NarrativeText", "element_id": "1908a740d8aedadb521f39432a6cbed8", - "text": "Kazakh \u0411\u0430\u0440\u043b\u044b\u049b \u0430\u0434\u0430\u043c\u0434\u0430\u0440 \u0442\u0443\u043c\u044b\u0441\u044b\u043d\u0430\u043d \u0430\u0437\u0430\u0442 \u0436\u04d9\u043d\u0435 \u049b\u0430\u0434\u0456\u0440\u2010\u049b\u0430\u0441\u0438\u0435\u0442\u0456 \u043c\u0435\u043d \u043a\u04b1\u049b\u044b\u049b\u0442\u0430\u0440\u044b \u0442\u0435\u04a3 \u0431\u043e\u043b\u044b\u043f \u0434\u04af\u043d\u0438\u0435\u0433\u0435 \u043a\u0435\u043b\u0435\u0434\u0456. 
\u0410\u0434\u0430\u043c\u0434\u0430\u0440\u0493\u0430 \u0430\u049b\u044b\u043b\u2010\u043f\u0430\u0440\u0430\u0441\u0430\u0442, \u0430\u0440\u2010\u043e\u0436\u0434\u0430\u043d \u0431\u0435\u0440\u0456\u043b\u0433\u0435\u043d, \u0441\u043e\u043d\u0434\u044b\u049b\u0442\u0430\u043d \u043e\u043b\u0430\u0440 \u0431\u0456\u0440\u2010\u0431\u0456\u0440\u0456\u043c\u0435\u043d \u0442\u0443\u044b\u0441\u0442\u044b\u049b, \u0431\u0430\u0443\u044b\u0440\u043c\u0430\u043b\u0434\u044b\u049b \u049b\u0430\u0440\u044b\u043c\u2010\u049b\u0430\u0442\u044b\u043d\u0430\u0441 \u0436\u0430\u0441\u0430\u0443\u043b\u0430\u0440\u044b \u0442\u0438\u0456\u0441.", + "text": "Kazakh Đ‘Đ°Ñ€Đ»Ñ‹̉› Đ°Đ´Đ°Đ¼Đ´Đ°Ñ€ Ñ‚ÑƒĐ¼Ñ‹ÑÑ‹Đ½Đ°Đ½ Đ°Đ·Đ°Ñ‚ Đ¶Ó™Đ½Đµ ̉›Đ°Đ´Ñ–Ñ€â€̉›Đ°ÑĐ¸ĐµÑ‚Ñ– Đ¼ĐµĐ½ Đº̉±̉›Ñ‹̉›Ñ‚Đ°Ñ€Ñ‹ Ñ‚Đµ̉£ Đ±Đ¾Đ»Ñ‹Đ¿ Đ´̉¯Đ½Đ¸ĐµĐ³Đµ ĐºĐµĐ»ĐµĐ´Ñ–. ĐĐ´Đ°Đ¼Đ´Đ°Ñ€̉“а а̉›Ñ‹Đ»â€Đ¿Đ°Ñ€Đ°ÑĐ°Ñ‚, Đ°Ñ€â€Đ¾Đ¶Đ´Đ°Đ½ Đ±ĐµÑ€Ñ–Đ»Đ³ĐµĐ½, ÑĐ¾Đ½Đ´Ñ‹̉›Ñ‚Đ°Đ½ Đ¾Đ»Đ°Ñ€ Đ±Ñ–Ñ€â€Đ±Ñ–Ñ€Ñ–Đ¼ĐµĐ½ туыÑты̉›, Đ±Đ°ÑƒÑ‹Ñ€Đ¼Đ°Đ»Đ´Ñ‹̉› ̉›Đ°Ñ€Ñ‹Đ¼â€̉›Đ°Ñ‚Ñ‹Đ½Đ°Ñ Đ¶Đ°ÑĐ°ÑƒĐ»Đ°Ñ€Ñ‹ Ñ‚Đ¸Ñ–Ñ.", "metadata": { "languages": [ "ukr", @@ -4994,7 +4994,7 @@ { "type": "NarrativeText", "element_id": "75b6a6751bcdf3ddfc1745d8e7118815", - "text": "Khakas \u041f\u043e\u043b\u0493\u0430\u043d \u043d\u0430 \u043a\u0456\u0437\u0456 \u043f\u043e\u0441 \u043f\u0430\u0437\u0430 \u0442\u0438\u04a3 \u0442\u04e7\u0440\u0456\u043f\u0447\u0435 \u043f\u0430\u0437\u0430 \u0442\u0438\u04a3 \u043f\u043e\u0441\u0442\u044b\u04a3 \u0441\u0438\u043d\u0456\u043d \u043f\u0456\u043b\u0456\u043d\u0433\u0435\u043d\u0456\u043d \u043f\u0430\u0437\u0430 \u0442\u04e7\u0440\u0435\u043b\u0435\u0440\u0456\u043d\u0456\u04a3\u0434\u0435 \u043f\u043e\u043b\u0447\u0430. 
\u041e\u043b\u0430\u0440\u0434\u044b\u04a3 \u0441\u0430\u0493\u044b\u043d\u0493\u0430\u043d\u044b \u043f\u0430\u0437\u0430 \u0430\u0440\u044b\u0493 \u0441\u0430\u0493\u044b\u0441 \u043f\u0430\u0440 \u043f\u0430\u0437\u0430 \u0445\u0430\u0440\u044b\u043d\u0434\u0430\u0441\u0442\u0430\u0440 \u0447\u0456\u043b\u0438 \u0442\u0443\u0434\u044b\u043d\u0430\u0440\u0493\u0430 \u043a\u0438\u0440\u0435\u043a\u0442\u0435\u0440.", + "text": "Khakas ĐŸĐ¾Đ»̉“Đ°Đ½ Đ½Đ° ĐºÑ–Đ·Ñ– Đ¿Đ¾Ñ Đ¿Đ°Đ·Đ° Ñ‚Đ¸̉£ Ñ‚Ó§Ñ€Ñ–Đ¿Ñ‡Đµ Đ¿Đ°Đ·Đ° Ñ‚Đ¸̉£ Đ¿Đ¾Ñты̉£ ÑĐ¸Đ½Ñ–Đ½ Đ¿Ñ–Đ»Ñ–Đ½Đ³ĐµĐ½Ñ–Đ½ Đ¿Đ°Đ·Đ° Ñ‚Ó§Ñ€ĐµĐ»ĐµÑ€Ñ–Đ½Ñ–̉£Đ´Đµ Đ¿Đ¾Đ»Ñ‡Đ°. ĐĐ»Đ°Ñ€Đ´Ñ‹̉£ Ñа̉“Ñ‹Đ½̉“Đ°Đ½Ñ‹ Đ¿Đ°Đ·Đ° Đ°Ñ€Ñ‹̉“ Ñа̉“Ñ‹Ñ Đ¿Đ°Ñ€ Đ¿Đ°Đ·Đ° Ñ…Đ°Ñ€Ñ‹Đ½Đ´Đ°ÑÑ‚Đ°Ñ€ Ñ‡Ñ–Đ»Đ¸ Ñ‚ÑƒĐ´Ñ‹Đ½Đ°Ñ€̉“а ĐºĐ¸Ñ€ĐµĐºÑ‚ĐµÑ€.", "metadata": { "languages": [ "ukr", @@ -5016,7 +5016,7 @@ { "type": "NarrativeText", "element_id": "74a93facd90bf0553bdf368698baa2a5", - "text": "Khasi \u00cfa ki bynriew baroh la kha laitluid bad ki \u00efaryngkat ha ka burom bad ki hok. Ha ki la bsiap da ka bor pyrkhat bad ka jing\u00efatiplem bad ha ka mynsiem jingsngew shipara ki dei ban \u00efatrei bynrap lang.", + "text": "Khasi Ăa ki bynriew baroh la kha laitluid bad ki Ă¯aryngkat ha ka burom bad ki hok. 
Ha ki la bsiap da ka bor pyrkhat bad ka jingĂ¯atiplem bad ha ka mynsiem jingsngew shipara ki dei ban Ă¯atrei bynrap lang.", "metadata": { "languages": [ "ind", @@ -5038,7 +5038,7 @@ { "type": "Title", "element_id": "b6ab4d5f0569e217cd985de6b9f5ca73", - "text": "Khmer, Central \u1798\u1793\u17bb\u179f\u17d2\u179f\u1791\u17b6\u17c6\u1784\u17a2\u179f\u17cb \u1780\u17be\u178f\u1798\u1780\u1798\u17b6\u1793\u179f\u17c1\u179a\u17b8\u1797\u17b6\u1796 \u1793\u17b7\u1784\u179f\u1798\u1797\u17b6\u1796 \u1780\u17d2\u1793\u17bb\u1784\u1795\u17d2\u1793\u17c2\u1780\u179f\u17c1\u1785\u1780\u17d2\u178a\u17b8\u1790\u17d2\u179b\u17c3\u1790\u17d2\u1793\u17bc\u179a\u1793\u17b7\u1784\u179f\u17b7\u1791\u17d2\u1792\u17b7\u17d4 \u1798\u1793\u17bb\u179f\u17d2\u179f \u1798\u17b6\u1793\u179c\u17b7\u1785\u17b6\u179a\u178e\u1789\u17d2\u1789\u17b6\u178e\u1793\u17b7\u1784\u179f\u178f\u17b7\u179f\u1798\u17d2\u1794\u1787\u1789\u17d2\u1789\u17c8\u1787\u17b6\u1794\u17cb\u1796\u17b8\u1780\u17c6\u178e\u17be\u178f \u17a0\u17be\u1799\u1782\u1794\u17d2\u1794\u17b8\u1794\u17d2\u179a\u1796\u17d2\u179a\u17b9\u178f\u17d2\u178a\u1785\u17c6\u1796\u17c4\u17c7\u1782\u17d2\u1793\u17b6\u1791\u17c5\u179c\u17b7\u1789\u1791\u17c5\u1798\u1780\u1780\u17d2\u1793\u17bb\u1784\u179f\u17d2\u1798\u17b6\u179a\u178f\u17b8\u1797\u17b6\u178f\u179a\u1797\u17b6\u1796\u1787\u17b6\u1794\u1784\u1794\u17d2\u17a2\u17bc\u1793\u17d4", + "text": "Khmer, Central á˜á“á»áŸáŸ’áŸá‘á¶áŸ†á„á¢áŸáŸ‹ á€á¾áá˜á€á˜á¶á“áŸáŸáá¸á—á¶á– á“á·á„áŸá˜á—á¶á– á€áŸ’á“á»á„á•្á“ែá€áŸáŸá…á€áŸ’áá¸á្á›áŸƒá្á“á¼áá“á·á„áŸá·á‘្á’á·áŸ” á˜á“á»áŸáŸ’០á˜á¶á“áœá·á…á¶ááá‰áŸ’á‰á¶áá“á·á„áŸáá·áŸá˜áŸ’á”á‡á‰áŸ’á‰áŸˆá‡á¶á”់á–á¸á€áŸ†áá¾á á á¾á™á‚á”្á”á¸á”្áá–្áá¹á្áá…ំá–ោះá‚្á“á¶á‘ៅáœá·á‰á‘ៅá˜á€á€áŸ’á“á»á„áŸáŸ’á˜á¶ááá¸á—á¶ááá—á¶á–á‡á¶á”á„á”្á¢á¼á“។", "metadata": { "filetype": "text/plain", "data_source": { @@ -5056,7 +5056,7 @@ { "type": "Title", "element_id": "841467ed91005c2b65ccce68e9bac719", - "text": "Kh\u00fcn 
\u1a3e\u1a36\u1a69\u1a54\u1a7c\u1a34\u1a60\u1a26\u1a62\u1a49\u1a56\u1a63\u1a60\u1a3f\u1a20\u1a6e\u1a60\u1a2f\u1a68\u1a3e\u1a63\u1a3e\u1a66\u1a3b\u1a60\u1a26\u1a48\u1a41\u1a53\u1a62\u1a39\u1a60\u1a3f\u1a75\u1a26\u1a3b\u1a60\u1a3f\u1a26\u1a20\u1a60\u1a36\u1a62 \u1a36\u1a71\u1a20\u1a65\u1a32\u1a60\u1a32\u1a65\u1a48\u1a60\u1a20\u1a62 \u1a53\u1a62\u1a48\u1a65\u1a34\u1a60\u1a35\u1a65 \u1a32\u1a75\u1a63\u1a60\u1a26\u1a23\u1a73\u1a76\u1a23\u1a62\u1a3e\u1a66\u1a3e\u1a36\u1a6e\u1a63\u1a35\u1a3e\u1a60\u1a3e\u1a7c\u1a53\u1a62 \u1a23\u1a60\u1a45\u1a41\u1a37\u1a2d\u1a65\u1a37\u1a60\u1a32\u1a62\u1a32\u1a73\u1a75\u1a20\u1a60\u1a36\u1a62\u1a2f\u1a62\u1a60\u1a45\u1a60\u1a3f\u1a23\u1a60\u1a45\u1a63\u1a60\u1a3e\u1a39\u1a60\u1a3f\u1a75\u1a26\u1a3b\u1a60\u1a3f\u1a26\u1a20\u1a60\u1a36\u1a62", + "text": "KhĂ¼n ᨾᨶᩩᩔ᩼ᨴ᩠ᨦᩢᩉᩖᩣ᩠ᨿᨠᩮ᩠ᨯᩨᨾᩣᨾᩦᨻ᩠ᨦᩈá©á©“ᩢᨹ᩠ᨿ᩵ᨦᨻ᩠ᨿᨦᨠ᩠ᨶᩢ ᨶᩱᨠᩥᨲ᩠ᨲᩥᩈ᩠ᨠᩢ ᩓᩢᩈᩥᨴ᩠ᨵᩥ ᨲ᩵ᩣ᩠ᨦᨣᩳ᩶ᨣᩢᨾᩦᨾᨶᩮᩣᨵᨾ᩠ᨾ᩼ᩓᩢ ᨣ᩠ᩅá©á¨·á¨­á©¥á¨·á© á¨²á©¢á¨²á©³á©µá¨ á© á¨¶á©¢á¨¯á©¢á© á©…᩠ᨿᨣ᩠ᩅᩣ᩠ᨾᨹ᩠ᨿ᩵ᨦᨻ᩠ᨿᨦᨠ᩠ᨶᩢ", "metadata": { "languages": [ "tur" @@ -5077,7 +5077,7 @@ { "type": "NarrativeText", "element_id": "7abc18c11be0eb0d9f9526fbe76af972", - "text": "Kirghiz \u0411\u0430\u0440\u0434\u044b\u043a \u0430\u0434\u0430\u043c\u0434\u0430\u0440 \u04e9\u0437 \u0431\u0435\u0434\u0435\u043b\u0438\u043d\u0434\u0435 \u0436\u0430\u043d\u0430 \u0443\u043a\u0443\u043a\u0442\u0430\u0440\u044b\u043d\u0434\u0430 \u044d\u0440\u043a\u0438\u043d \u0436\u0430\u043d\u0430 \u0442\u0435\u04a3 \u0443\u043a\u0443\u043a\u0442\u0443\u0443 \u0431\u043e\u043b\u0443\u043f \u0436\u0430\u0440\u0430\u043b\u0430\u0442. 
\u0410\u043b\u0430\u0440\u0434\u044b\u043d \u0430\u04a3\u2010\u0441\u0435\u0437\u0438\u043c\u0438 \u043c\u0435\u043d\u0435\u043d \u0430\u0431\u0438\u0439\u0438\u0440\u0438 \u0431\u0430\u0440 \u0436\u0430\u043d\u0430 \u0431\u0438\u0440\u0438\u2010\u0431\u0438\u0440\u0438\u043d\u0435 \u0431\u0438\u0440 \u0442\u0443\u0443\u0433\u0430\u043d\u0434\u044b\u043a \u043c\u0430\u043c\u0438\u043b\u0435\u043a\u044b\u043b\u0443\u0443\u0433\u0430 \u0442\u0438\u0439\u0438\u0448.", + "text": "Kirghiz Đ‘Đ°Ñ€Đ´Ñ‹Đº Đ°Đ´Đ°Đ¼Đ´Đ°Ñ€ Ó©Đ· Đ±ĐµĐ´ĐµĐ»Đ¸Đ½Đ´Đµ Đ¶Đ°Đ½Đ° ÑƒĐºÑƒĐºÑ‚Đ°Ñ€Ñ‹Đ½Đ´Đ° ÑÑ€ĐºĐ¸Đ½ Đ¶Đ°Đ½Đ° Ñ‚Đµ̉£ ÑƒĐºÑƒĐºÑ‚ÑƒÑƒ Đ±Đ¾Đ»ÑƒĐ¿ Đ¶Đ°Ñ€Đ°Đ»Đ°Ñ‚. ĐĐ»Đ°Ñ€Đ´Ñ‹Đ½ а̉£â€ÑĐµĐ·Đ¸Đ¼Đ¸ Đ¼ĐµĐ½ĐµĐ½ Đ°Đ±Đ¸Đ¹Đ¸Ñ€Đ¸ Đ±Đ°Ñ€ Đ¶Đ°Đ½Đ° Đ±Đ¸Ñ€Đ¸â€Đ±Đ¸Ñ€Đ¸Đ½Đµ Đ±Đ¸Ñ€ Ñ‚ÑƒÑƒĐ³Đ°Đ½Đ´Ñ‹Đº Đ¼Đ°Đ¼Đ¸Đ»ĐµĐºÑ‹Đ»ÑƒÑƒĐ³Đ° Ñ‚Đ¸Đ¹Đ¸Ñˆ.", "metadata": { "languages": [ "rus", @@ -5099,7 +5099,7 @@ { "type": "NarrativeText", "element_id": "2490211a751af08c831f437250d70884", - "text": "Kissi, Northern wanda tu cio M\u025b pil\u0254\u0254 o wol\u0254\u0254 ni, le waa o ba nd\u0254\u0254 cio, o b\u025b\u025blen kenando ni, o t\u0254ngdo ni, b\u025btu n\u0254n yiyando a kullo, o kon ni naan tu dua mim maalyan kalapil\u0254y\u025byi ni.", + "text": "Kissi, Northern wanda tu cio MÉ› pilɔɔ o wolɔɔ ni, le waa o ba ndɔɔ cio, o bɛɛlen kenando ni, o tÉ”ngdo ni, bÉ›tu nÉ”n yiyando a kullo, o kon ni naan tu dua mim maalyan kalapilÉ”yÉ›yi ni.", "metadata": { "languages": [ "tgl", @@ -5166,7 +5166,7 @@ { "type": "NarrativeText", "element_id": "3da488a598903b0fa6a89a4d9b704219", - "text": "Komi-Permyak \u0411\u044b\u0434\u04e7\u0441 \u043e\u0442\u0438\u0440\u044b\u0441 \u0447\u0443\u0436\u04e7\u043d\u044b \u0432\u043e\u043b\u044c\u043d\u04e7\u0439\u0435\u0437\u04e7\u043d \u0434\u0430 \u04e7\u0442\u043a\u043e\u0434\u0434\u0435\u0437\u04e7\u043d \u0434\u043e\u0441\u0442\u043e\u0438\u043d\u0441\u0442\u0432\u043e\u044b\u043d \u0434\u0430 \u043f\u0440\u0430\u0432\u043e\u044d\u0437\u044b\u043d. 
\u041d\u044b\u043b\u04e7 \u0441\u0435\u0442\u04e7\u043c \u043c\u044b\u0432\u043a\u044b\u0434 \u0434\u0430 \u0441\u043e\u0432\u0435\u0441\u0442\u044c \u043e\u0432\u043d\u044b \u04e7\u0442\u0430\u043c\u04e7\u0434\u043d\u044b\u0441\u043a\u04e7\u0442 \u043a\u044b\u0434\u0437 \u0432\u043e\u043d\u043d\u044d\u0437\u043b\u04e7.", + "text": "Komi-Permyak Đ‘Ñ‹Đ´Ó§Ñ Đ¾Ñ‚Đ¸Ñ€Ñ‹Ñ Ñ‡ÑƒĐ¶Ó§Đ½Ñ‹ Đ²Đ¾Đ»ÑŒĐ½Ó§Đ¹ĐµĐ·Ó§Đ½ да Ó§Ñ‚ĐºĐ¾Đ´Đ´ĐµĐ·Ó§Đ½ Đ´Đ¾ÑÑ‚Đ¾Đ¸Đ½ÑÑ‚Đ²Đ¾Ñ‹Đ½ да Đ¿Ñ€Đ°Đ²Đ¾ÑĐ·Ñ‹Đ½. ĐÑ‹Đ»Ó§ ÑĐµÑ‚Ó§Đ¼ Đ¼Ñ‹Đ²ĐºÑ‹Đ´ да ÑĐ¾Đ²ĐµÑть Đ¾Đ²Đ½Ñ‹ Ó§Ñ‚Đ°Đ¼Ó§Đ´Đ½Ñ‹ÑĐºÓ§Ñ‚ ĐºÑ‹Đ´Đ· Đ²Đ¾Đ½Đ½Ñзлӧ.", "metadata": { "languages": [ "rus" @@ -5251,7 +5251,7 @@ { "type": "NarrativeText", "element_id": "71cc3fa5f30f347d8e225e871139661f", - "text": "Korean \ubaa8\ub4e0 \uc778\uac04\uc740 \ud0dc\uc5b4\ub0a0 \ub54c\ubd80\ud130 \uc790\uc720\ub85c\uc6b0\uba70 \uadf8 \uc874\uc5c4\uacfc \uad8c\ub9ac\uc5d0 \uc788\uc5b4 \ub3d9\ub4f1\ud558\ub2e4. \uc778\uac04\uc740 \ucc9c\ubd80\uc801\uc73c\ub85c \uc774\uc131\uacfc \uc591\uc2ec\uc744 \ubd80\uc5ec\ubc1b\uc558\uc73c\uba70 \uc11c\ub85c \ud615\uc81c\uc560\uc758 \uc815\uc2e0\uc73c\ub85c \ud589\ub3d9\ud558\uc5ec\uc57c \ud55c\ub2e4.", + "text": "Korean 모든 ́¸ê°„́€ 태́–´ë‚  때부터 ́́œ ë¡œ́°ë©° ê·¸ ́¡´́—„ê³¼ 권리́— ́ˆ́–´ ë™ë“±í•˜ë‹¤. 
́¸ê°„́€ ́²œë¶€́ ́œ¼ë¡œ ́´́„±ê³¼ ́–‘́‹¬́„ ë¶€́—¬ë°›́•˜́œ¼ë©° ́„œë¡œ 형́ œ́• ́˜ ́ •́‹ ́œ¼ë¡œ í–‰ë™í•˜́—¬́•¼ 한다.", "metadata": { "languages": [ "kor" @@ -5272,7 +5272,7 @@ { "type": "NarrativeText", "element_id": "ec837c06df9c110c22e734be4704e763", - "text": "Kpelle, Guinea Nukan gele kaa p\u0259l\u0259 kaa tan\u0254n, yili\u0253a nu k\u0259le maawiy\u0259 p\u0259l\u0259 da t\u0254\u0254i gaa \u0272ei y\u025bn\u025byii hu k\u025bp\u0259l\u0259 kaal\u0254 tan\u0254n; di k\u025bm\u025bni a nukan \u014baa \u0253\u0259 g\u025b\u025b hw\u0259k\u025bli w\u025blik\u025bmaa \u0259 l\u0254 di luwai.", + "text": "Kpelle, Guinea Nukan gele kaa pÉ™lÉ™ kaa tanÉ”n, yiliÉ“a nu kÉ™le maawiyÉ™ pÉ™lÉ™ da tɔɔi gaa ɲei yÉ›nÉ›yii hu kÉ›pÉ™lÉ™ kaalÉ” tanÉ”n; di kÉ›mÉ›ni a nukan Å‹aa ɓə gɛɛ hwÉ™kÉ›li wÉ›likÉ›maa É™ lÉ” di luwai.", "metadata": { "languages": [ "som", @@ -5294,7 +5294,7 @@ { "type": "NarrativeText", "element_id": "6322dea6cfe74f4e5e0272752dccffb4", - "text": "Krio \u025bvrib\u0254di b\u0254n fri \u025bn g\u025bt in yon rayt, n\u0254n wan n\u0254 pas in k\u0254mpin. Wi \u0254l ebul f\u0254 tink \u025bn f\u025bn\u0254t wetin rayt \u025bn r\u0254\u014b pantap dat wi f\u0254 sabi aw f\u0254 liv l\u025bk wan big famili.", + "text": "Krio É›vribÉ”di bÉ”n fri É›n gÉ›t in yon rayt, nÉ”n wan nÉ” pas in kÉ”mpin. Wi É”l ebul fÉ” tink É›n fÉ›nÉ”t wetin rayt É›n rɔŋ pantap dat wi fÉ” sabi aw fÉ” liv lÉ›k wan big famili.", "metadata": { "languages": [ "ind", @@ -5317,7 +5317,7 @@ { "type": "NarrativeText", "element_id": "e4653071cb4a8a4f59ca7f62a50afbb4", - "text": "Kulango, Bouna Igooyoo p\u025b\u025b h\u028bn taa. B\u0254 p\u025b\u025b jabaga b\u0254r\u0254. H\u0254 ya g\u028b\u028bn\u2019n b\u0254\u0254 h\u025b p\u025b\u025b, h\u0254 h\u025b gus\u025bg\u025b\u2019n.", + "text": "Kulango, Bouna Igooyoo pɛɛ hÊ‹n taa. BÉ” pɛɛ jabaga bÉ”rÉ”. 
HÉ” ya gʋʋn’n bɔɔ hÉ› pɛɛ, hÉ” hÉ› gusÉ›gɛ’n.", "metadata": { "languages": [ "tgl", @@ -5339,7 +5339,7 @@ { "type": "NarrativeText", "element_id": "df4b88e2493c88f7b478eaece77dfdb7", - "text": "Kurdish, Central Hem\u00fb mirov azad \u00fb di weqar \u00fb mafan de wekhev t\u00ean dinyay\u00ea. Ew xwed\u00ee hi\u015f \u00fb \u015fu\u00fbr in \u00fb div\u00ea li hember hev bi zihniyeteke bratiy\u00ea bilivin.", + "text": "Kurdish, Central HemĂ» mirov azad Ă» di weqar Ă» mafan de wekhev tĂªn dinyayĂª. Ew xwedĂ® hiÅŸ Ă» ÅŸuĂ»r in Ă» divĂª li hember hev bi zihniyeteke bratiyĂª bilivin.", "metadata": { "languages": [ "tur", @@ -5362,7 +5362,7 @@ { "type": "NarrativeText", "element_id": "26a7611f793432bd8ce6f6cb35470ad5", - "text": "Kurdish, Northern Hem\u00fb mirov azad \u00fb di weqar \u00fb mafan de wekhev t\u00ean dinyay\u00ea. Ew xwed\u00ee hi\u015f \u00fb \u015fu\u00fbr in \u00fb div\u00ea li hember hev bi zihniyeteke bratiy\u00ea bilivin.", + "text": "Kurdish, Northern HemĂ» mirov azad Ă» di weqar Ă» mafan de wekhev tĂªn dinyayĂª. Ew xwedĂ® hiÅŸ Ă» ÅŸuĂ»r in Ă» divĂª li hember hev bi zihniyeteke bratiyĂª bilivin.", "metadata": { "languages": [ "nld", @@ -5386,7 +5386,7 @@ { "type": "NarrativeText", "element_id": "0eaf9123417f2794584c7cfd20e10aee", - "text": "Ladin D\u00f6tes les porsones nasc l\u00ebdies y cun la medema dignit\u00e9 y i medemi d\u00ebr\u0107. Ares \u00e0 na rajun y na cosci\u00ebnza y m\u00ebss s\u2019incunt\u00e8 \u00f6na cun l\u2019atra te n spirit de fraternit\u00e9.", + "text": "Ladin Dötes les porsones nasc lĂ«dies y cun la medema dignitĂ© y i medemi dĂ«rć. 
Ares Ă  na rajun y na cosciĂ«nza y mĂ«ss s’incuntè öna cun l’atra te n spirit de fraternitĂ©.", "metadata": { "languages": [ "spa", @@ -5429,7 +5429,7 @@ { "type": "NarrativeText", "element_id": "5590b8f08d34a13d98afa307c3a0db0a", - "text": "Lamnso' \u00c1 dz\u0259\u0300\u0259\u0301 wir dz\u0259\u0300m r\u00e9\u014br\u00e9\u014b f\u00f3 ghv\u0259m w\u00f9n \u00e0 f\u00f3 gh\u00e0y, \u00e1 yo\u2019 dz\u0259\u0300\u0259\u0301 wir ms\u00f2\u014b ji kw\u00e0n. W\u00ecr dz\u0259\u0300m k\u0300m k f\u00f3mo woo f\u00f3 kw\u00e0\u2019t\u00ec w\u00f9n \u00e0 f\u00f3 vifii, a w\u00f9 k\u00e9r f\u00f3 a yi\u00ec e w\u00f9m\u00f2\u2019 woo w\u00edr moo f\u0259\u0301r v\u0259.", + "text": "Lamnso' Ă dzÉ™̀€É™̀ wir dzÉ™̀€m rĂ©Å‹rĂ©Å‹ fĂ³ ghvÉ™m wĂ¹n Ă  fĂ³ ghĂ y, Ă¡ yo’ dzÉ™̀€É™̀ wir msĂ²Å‹ ji kwĂ n. Wìr dzÉ™̀€m k̀€m k fĂ³mo woo fĂ³ kwà’tì wĂ¹n Ă  fĂ³ vifii, a wĂ¹ kĂ©r fĂ³ a yiì e wĂ¹mĂ²â€™ woo wĂ­r moo fÉ™̀r vÉ™.", "metadata": { "languages": [ "vie", @@ -5452,7 +5452,7 @@ { "type": "Title", "element_id": "ae451bf94c5e07470540741833822372", - "text": "Lao \u0ea1\u0eb0\u0e99\u0eb8\u0e94\u0ec0\u0e81\u0eb5\u0e94\u0ea1\u0eb2\u0ea1\u0eb5\u0eaa\u0eb4\u0e94\u0ec0\u0eaa\u0ea5\u0eb5\u0e9e\u0eb2\u0e9a \u0ec1\u0ea5\u0eb0 \u0eaa\u0eb0\u0ec0\u0edd\u0eb5\u0edc\u0ec9\u0eb2\u0e81\u0eb1\u0e99\u0ec3\u0e99\u0e97\u0eb2\u0e87\u0e81\u0ebd\u0e94\u0e95\u0eb4\u0eaa\u0eb1\u0e81 \u0ec1\u0ea5\u0eb0 \u0e97\u0eb2\u0e87\u0eaa\u0eb4\u0e94\u0e94\u0ec9\u0ea7\u0e8d\u0ea1\u0eb0\u0e99\u0eb8\u0e94\u0ea1\u0eb5\u0eaa\u0eb0\u0e95\u0eb4\u0eaa\u0eb3\u0e9b\u0eb1\u0e94\u0e8a\u0eb1\u0e99\u0e8d\u0eb0(\u0eae\u0eb9\u0ec9\u0e94\u0eb5\u0eae\u0eb9\u0ec9\u0e8a\u0ebb\u0ec8\u0ea7)\u0ec1\u0ea5\u0eb0\u0ea1\u0eb5\u0ea1\u0eb0\u0ec2\u0e99\u0e97\u0eb3\u0e88\u0eb7\u0ec8\u0e87\u0e95\u0ec9\u0ead\u0e87\u0e9b\u0eb0\u0e9e\u0eb6\u0e94\u0e95\u0ebb\u0e99\u0e95\u0ecd\u0ec8\u0e81\u0eb1\u0e99\u0ec3\u0e99\u0e97\u0eb2\u0e87\u0e9e\u0eb5\u0ec8\u0e99\u0ec9\u0ead\u0e87.", + "text": "Lao ມະນຸດເàºàºµàº”ມາມີສິດເສລີàºàº²àº à»àº¥àº° 
ສະເà»àºµà»œà»‰àº²àºàº±àº™à»ƒàº™àº—າງàºàº½àº”ຕິສັຠà»àº¥àº° ທາງສິດດ້ວàºàº¡àº°àº™àº¸àº”ມີສະຕິສຳປັດàºàº±àº™àºàº°(ຮູ້ດີຮູ້àºàº»à»ˆàº§)à»àº¥àº°àº¡àºµàº¡àº°à»‚ນທຳຈື່ງຕ້ອງປະàºàº¶àº”ຕົນຕà»à»ˆàºàº±àº™à»ƒàº™àº—າງàºàºµà»ˆàº™à»‰àº­àº‡.", "metadata": { "filetype": "text/plain", "data_source": { @@ -5514,7 +5514,7 @@ { "type": "NarrativeText", "element_id": "6cddab55572e83cd679bab750a745b46", - "text": "Latvian Visi cilv\u0113ki piedzimst br\u012bvi un vienl\u012bdz\u012bgi sav\u0101 pa\u0161cie\u0146\u0101 un ties\u012bb\u0101s. Vi\u0146i ir apvelt\u012bti ar sapr\u0101tu un sirdsapzi\u0146u, un vi\u0146iem j\u0101izturas citam pret citu br\u0101l\u012bbas gar\u0101.", + "text": "Latvian Visi cilvÄ“ki piedzimst brÄ«vi un vienlÄ«dzÄ«gi savÄ paÅ¡cieÅ†Ä un tiesÄ«bÄs. Viņi ir apveltÄ«ti ar saprÄtu un sirdsapziņu, un viņiem jÄizturas citam pret citu brÄlÄ«bas garÄ.", "metadata": { "languages": [ "lav" @@ -5535,7 +5535,7 @@ { "type": "NarrativeText", "element_id": "84c7cce831ebebafd545d3767089bc8f", - "text": "Latvian (2) Visi cilv\u0113ki piedzimst br\u012bvi un vienl\u012bdz\u012bgi cie\u0146\u0101 un ties\u012bb\u0101s. Vi\u0146iem ir dots sapr\u0101ts un sirdsapzi\u0146a, un vi\u0146iem citam pret citu j\u0101izturas br\u0101l\u012bbas gar\u0101.", + "text": "Latvian (2) Visi cilvÄ“ki piedzimst brÄ«vi un vienlÄ«dzÄ«gi cieÅ†Ä un tiesÄ«bÄs. Viņiem ir dots saprÄts un sirdsapziņa, un viņiem citam pret citu jÄizturas brÄlÄ«bas garÄ.", "metadata": { "languages": [ "lav" @@ -5556,7 +5556,7 @@ { "type": "NarrativeText", "element_id": "c431b1dcba75dca04cdeaaa5388f19c0", - "text": "Ligurian Tutte e personn-e nascian libere e p\u00e6ge in dignit\u00e6 e driti. Son dot\u00e6 de raxon e coscensa e gh\u2019an da ag\u00ee l\u2019unn-a verso l\u2019atra inte \u2019n spirito de fradelansa.", + "text": "Ligurian Tutte e personn-e nascian libere e pæge in dignitæ e driti. 
Son dotæ de raxon e coscensa e gh’an da agĂ® l’unn-a verso l’atra inte ’n spirito de fradelansa.", "metadata": { "languages": [ "ita" @@ -5577,7 +5577,7 @@ { "type": "NarrativeText", "element_id": "693ef7caa32675b109893e37846d9f13", - "text": "Limba, West-Central Biya-m\u025bti fooma be kiyo ka kuyanka\u014b i\u014b kas\u025bmb\u025b m\u025bn\u025b in ka yiki. Bind\u025b ki\u014b ba niy\u0254 in masim\u0254k\u0254, maka yiina wo ka hu w\u025bndi yande.", + "text": "Limba, West-Central Biya-mÉ›ti fooma be kiyo ka kuyankaÅ‹ iÅ‹ kasÉ›mbÉ› mÉ›nÉ› in ka yiki. BindÉ› kiÅ‹ ba niyÉ” in masimÉ”kÉ”, maka yiina wo ka hu wÉ›ndi yande.", "metadata": { "languages": [ "swa" @@ -5620,7 +5620,7 @@ { "type": "NarrativeText", "element_id": "6fcb989c6e738221bc467859b15c2d51", - "text": "Lingala (tones) Bato ny\u0254\u0301ns\u0254 na mbo\u0301tama bazali\u0301 ns\u0254\u0301mi\u0301 mpe\u0301 bako\u0301ka\u0301ni\u0301 na lim\u025bmya mpe\u0301 makoki\u0301. Bazali\u0301 na may\u025b\u0301l\u025b mpe\u0301 basenge\u0301li\u0301 kova\u0301nda na bondeko o ka\u0301ti na bango\u0301.", + "text": "Lingala (tones) Bato nyÉ”̀nsÉ” na mbòtama bazalì nsÉ”̀mì mpè bakòkànì na limÉ›mya mpè makokì. Bazalì na mayÉ›̀lÉ› mpè basengèlì kovànda na bondeko o kàti na bangò.", "metadata": { "languages": [ "tgl", @@ -5643,7 +5643,7 @@ { "type": "NarrativeText", "element_id": "353adb6fb432616b715be3966a6d79bd", - "text": "Lithuanian Visi \u017emon\u0117s gimsta laisvi ir lyg\u016bs savo orumu ir teis\u0117mis. Jiems suteiktas protas ir s\u0105\u017ein\u0117 ir jie turi elgtis vienas kito at\u017evilgiu kaip broliai.", + "text": "Lithuanian Visi žmonÄ—s gimsta laisvi ir lygÅ«s savo orumu ir teisÄ—mis. 
Jiems suteiktas protas ir sąžinÄ— ir jie turi elgtis vienas kito atžvilgiu kaip broliai.", "metadata": { "languages": [ "lit" @@ -5664,7 +5664,7 @@ { "type": "NarrativeText", "element_id": "3e4f829a968d5f615b4245e85dc21d08", - "text": "Lobi Teehuu s\u028bn\u0254 n ther \u025b\u025b n\u0269\u0269 b\u028bn\u0254 wa n do deea\u0294 s\u0269 w\u028b n makha sam\u0269n\u0269 na n\u00e0 h\u028b t\u0269n\u025bpar r\u00e0. Thangba ti y\u025br \u00e0 p\u025b y\u025br j\u0269\u0269r n\u00e0 f\u0269lw\u025b s\u0269 a teena waan f\u028bkha omkhaa.", + "text": "Lobi Teehuu sÊ‹nÉ” n ther ɛɛ nɩɩ bÊ‹nÉ” wa n do deeaÊ” sÉ© wÊ‹ n makha samÉ©nÉ© na nĂ  hÊ‹ tÉ©nÉ›par rĂ . Thangba ti yÉ›r Ă  pÉ› yÉ›r jɩɩr nĂ  fÉ©lwÉ› sÉ© a teena waan fÊ‹kha omkhaa.", "metadata": { "languages": [ "som" @@ -5770,7 +5770,7 @@ { "type": "NarrativeText", "element_id": "3f8cca735e9bb8ee68adff123b7ebdda", - "text": "Luxembourgeois All M\u00ebnsch k\u00ebnnt fr\u00e4i a mat deer selwechter Dignit\u00e9it an dene selwechte Rechter op d'Welt. Jiddereen huet s\u00e4i Verstand a s\u00e4i Gew\u00ebsse krut an soll an engem Geescht vu Bridderlechkeet denen anere g\u00e9intiwwer handelen.", + "text": "Luxembourgeois All MĂ«nsch kĂ«nnt fräi a mat deer selwechter DignitĂ©it an dene selwechte Rechter op d'Welt. Jiddereen huet säi Verstand a säi GewĂ«sse krut an soll an engem Geescht vu Bridderlechkeet denen anere gĂ©intiwwer handelen.", "metadata": { "languages": [ "nld", @@ -5792,7 +5792,7 @@ { "type": "NarrativeText", "element_id": "1a2cc3d892dc79a4b68cc59db7a69ea1", - "text": "Macedonian \u0421\u0438\u0442\u0435 \u0447\u043e\u0432\u0435\u0447\u043a\u0438 \u0441\u0443\u0448\u0442\u0435\u0441\u0442\u0432\u0430 \u0441\u0435 \u0440\u0430\u0453\u0430\u0430\u0442 \u0441\u043b\u043e\u0431\u043e\u0434\u043d\u0438 \u0438 \u0435\u0434\u043d\u0430\u043a\u0432\u0438 \u043f\u043e \u0434\u043e\u0441\u0442\u043e\u0438\u043d\u0441\u0442\u0432\u043e \u0438 \u043f\u0440\u0430\u0432\u0430. 
\u0422\u0438\u0435 \u0441\u0435 \u043e\u0431\u0434\u0430\u0440\u0435\u043d\u0438 \u0441\u043e \u0440\u0430\u0437\u0443\u043c \u0438 \u0441\u043e\u0432\u0435\u0441\u0442 \u0438 \u0442\u0440\u0435\u0431\u0430 \u0434\u0430 \u0441\u0435 \u043e\u0434\u043d\u0435\u0441\u0443\u0432\u0430\u0430\u0442 \u0435\u0434\u0435\u043d \u043a\u043e\u043d \u0434\u0440\u0443\u0433 \u0432\u043e \u0434\u0443\u0445\u043e\u0442 \u043d\u0430 \u043e\u043f\u0448\u0442\u043e \u0447\u043e\u0432\u0435\u0447\u043a\u0430\u0442\u0430 \u043f\u0440\u0438\u043f\u0430\u0434\u043d\u043e\u0441\u0442.", + "text": "Macedonian Đ¡Đ¸Ñ‚Đµ Ñ‡Đ¾Đ²ĐµÑ‡ĐºĐ¸ ÑÑƒÑˆÑ‚ĐµÑÑ‚Đ²Đ° Ñе Ñ€Đ°Ñ“Đ°Đ°Ñ‚ ÑĐ»Đ¾Đ±Đ¾Đ´Đ½Đ¸ и ĐµĐ´Đ½Đ°ĐºĐ²Đ¸ Đ¿Đ¾ Đ´Đ¾ÑÑ‚Đ¾Đ¸Đ½ÑÑ‚Đ²Đ¾ и Đ¿Ñ€Đ°Đ²Đ°. Đ¢Đ¸Đµ Ñе Đ¾Đ±Đ´Đ°Ñ€ĐµĐ½Đ¸ ÑĐ¾ Ñ€Đ°Đ·ÑƒĐ¼ и ÑĐ¾Đ²ĐµÑÑ‚ и Ñ‚Ñ€ĐµĐ±Đ° да Ñе Đ¾Đ´Đ½ĐµÑÑƒĐ²Đ°Đ°Ñ‚ ĐµĐ´ĐµĐ½ ĐºĐ¾Đ½ Đ´Ñ€ÑƒĐ³ Đ²Đ¾ Đ´ÑƒÑ…Đ¾Ñ‚ Đ½Đ° Đ¾Đ¿ÑˆÑ‚Đ¾ Ñ‡Đ¾Đ²ĐµÑ‡ĐºĐ°Ñ‚Đ° Đ¿Ñ€Đ¸Đ¿Đ°Đ´Đ½Đ¾ÑÑ‚.", "metadata": { "languages": [ "mkd" @@ -5834,7 +5834,7 @@ { "type": "UncategorizedText", "element_id": "2e4fdb7fcd2748cce07840226331c829", - "text": "Magahi \u0938\u092c \u0932\u094b\u0917 \u0906\u091c\u093e\u0926\u0947 \u091c\u0928\u094d\u092e \u0932\u0947\u092c \u0939\u0908 \u0924\u0925\u093e \u0938\u092c \u0915\u0947 \u092c\u0930\u093e\u092c\u0930\u0947 \u0938\u092e\u094d\u092e\u093e\u0928 \u0914\u0930 \u0905\u0927\u093f\u0915\u093e\u0930 \u0939\u0907\u0964 \u0939\u0941\u0928\u0916\u094b \u0915\u0947 \u092a\u093e\u0938 \u0938\u092e\u091d-\u092c\u0942\u091d \u0914\u0930 \u0905\u0902\u0924:\u0915\u0930\u0923 \u0915\u0947 \u0906\u0935\u093e\u091c \u0939\u094b\u092c \u0939\u0908\u0964 \u0914\u0930 \u0939\u0941\u0928\u0915\u093e \u0926\u094b\u0938\u0930\u094b \u0915\u0947 \u0938\u093e\u0925 \u092d\u093e\u0908\u091a\u093e\u0930\u093e \u0915\u0947 \u0935\u094d\u092f\u0935\u0939\u093e\u0930 \u0915\u0930\u0947 \u092a\u0921\u093c \u0939\u0908\u0964", + "text": "Magahi सब लोग आजादे जनà¥à¤® लेब हई तथा सब के बराबरे समà¥à¤®à¤¾à¤¨ और अधिकार हइ। हà¥à¤¨à¤–ो के पास समà¤-बूठऔर अंत:करण 
के आवाज होब हई। और हà¥à¤¨à¤•ा दोसरो के साथ भाईà¤à¤¾à¤°à¤¾ के वà¥à¤¯à¤µà¤¹à¤¾à¤° करे पड़ हई।", "metadata": { "languages": [ "hin" @@ -5855,7 +5855,7 @@ { "type": "UncategorizedText", "element_id": "d691df62a8af33ae0b9c152a092e32a9", - "text": "Maithili \u0938\u092d \u092e\u093e\u0928\u0935 \u091c\u0928\u094d\u092e\u0924\u0903 \u0938\u094d\u0935\u0924\u0928\u094d\u0924\u094d\u0930 \u0905\u091b\u093f \u0924\u0925\u093e \u0917\u0930\u093f\u092e\u093e \u0906\u02bc \u0905\u0927\u093f\u0915\u093e\u0930\u092e\u0947 \u0938\u092e\u093e\u0928 \u0905\u091b\u093f\u0964 \u0938\u092d\u0915\u0947\u0901 \u0905\u092a\u0928\u2013\u0905\u092a\u0928 \u092c\u0941\u0926\u094d\u0927\u093f \u0906\u02bc \u0935\u093f\u0935\u0947\u0915 \u091b\u0948\u0915 \u0906\u0913\u0930 \u0938\u092d\u0915\u0947\u0901 \u090f\u0915 \u0926\u094b\u0938\u0930\u093e\u0915 \u092a\u094d\u0930\u0924\u093f \u0938\u094c\u0939\u093e\u0930\u094d\u0926\u092a\u0942\u0930\u094d\u0923 \u0935\u094d\u092f\u0935\u0939\u093e\u0930 \u0915\u0930\u092c\u093e\u0915 \u091a\u093e\u0939\u0940\u0964", + "text": "Maithili सभ मानव जनà¥à¤®à¤¤à¤ƒ सà¥à¤µà¤¤à¤¨à¥à¤¤à¥à¤° अछि तथा गरिमा आʼ अधिकारमे समान अछि। सभकेठअपन–अपन बà¥à¤¦à¥à¤§à¤¿ आʼ विवेक छैक आओर सभकेठà¤à¤• दोसराक पà¥à¤°à¤¤à¤¿ सौहारà¥à¤¦à¤ªà¥‚रà¥à¤£ वà¥à¤¯à¤µà¤¹à¤¾à¤° करबाक à¤à¤¾à¤¹à¥€à¥¤", "metadata": { "languages": [ "hin", @@ -5877,7 +5877,7 @@ { "type": "NarrativeText", "element_id": "d73cc566475e568433ff76c1fb6af485", - "text": "Makhuwa Atthu othene aniyaria oolikana ni owilamula moota ontthunaya okhala, variyari v\u2019edignidade ni edireito. Akhalanne esaria ni otthokelela, ahaana akhalasaka othene saya vamurettele.", + "text": "Makhuwa Atthu othene aniyaria oolikana ni owilamula moota ontthunaya okhala, variyari v’edignidade ni edireito. 
Akhalanne esaria ni otthokelela, ahaana akhalasaka othene saya vamurettele.", "metadata": { "languages": [ "swa", @@ -5900,7 +5900,7 @@ { "type": "NarrativeText", "element_id": "166af43c7950017574b550ca090a6ff8", - "text": "Makonde Vanu vohevohe vaidile n\u2019chilambo valendene. Vanijaliwa ulimala vene. Pavele vanu pave na ulongo.", + "text": "Makonde Vanu vohevohe vaidile n’chilambo valendene. Vanijaliwa ulimala vene. Pavele vanu pave na ulongo.", "metadata": { "languages": [ "est", @@ -5945,7 +5945,7 @@ { "type": "NarrativeText", "element_id": "e74053233c7584ace3ddb4357ac894b7", - "text": "Malay (Arabic) \u0633\u0645\u0648\u0627 \u0645\u0623\u0646\u0633\u064a \u062f\u0644\u0627\u0647\u064a\u0631\u0643\u0646 \u0628\u064a\u0628\u0633 \u062f\u0627\u0646 \u0633\u0627\u0645\u0631\u0627\u062a \u062f\u0631\u064a \u0633\u06ac\u064a \u0643\u0645\u0648\u0644\u064a\u0623\u0646 \u062f\u0627\u0646 \u062d\u0642\u0662. \u0645\u0631\u064a\u0643 \u0645\u0645\u06a4\u0648\u06bd\u0627\u064a \u06a4\u0645\u064a\u0643\u064a\u0631\u0646 \u062f\u0627\u0646 \u06a4\u0631\u0627\u0633\u0623\u0646 \u0647\u0627\u062a\u064a \u062f\u0627\u0646 \u0647\u0646\u062f\u0642\u0644\u0647 \u0628\u0631\u062a\u064a\u0646\u062f\u0642 \u062f \u0627\u0646\u062a\u0627\u0631\u0627 \u0633\u0627\u062a\u0648 \u0633\u0627\u0645 \u0644\u0627\u0626\u0646 \u062f\u06a0\u0646 \u0633\u0645\u0627\u06a0\u062a \u06a4\u0631\u0633\u0627\u0648\u062f\u0627\u0631\u0623\u0646.", + "text": "Malay (Arabic) سموا Ù…Ø£Ù†Ø³Ù Ø¯Ù„Ø§Ù‡ÙØ±ÙƒÙ† Ø¨ÙØ¨Ø³ دان سامرات Ø¯Ø±Ù Ø³Ú¬Ù ÙƒÙ…ÙˆÙ„ÙØ£Ù† دان حق٢. 
مرÙÙƒ ممڤوڽا٠ڤمÙÙƒÙØ±Ù† دان ڤراسأن هات٠دان هندقله برتÙندق د انتارا ساتو سام لائن دڠن سماڠت ڤرساودارأن.", "metadata": { "languages": [ "ara", @@ -5988,7 +5988,7 @@ { "type": "NarrativeText", "element_id": "563cefb3266bb81ad240fb3d631fb5b0", - "text": "Malayalam \u0d2e\u0d28\u0d41\u0d37\u0d4d\u0d2f\u0d30\u0d46\u0d32\u0d4d\u0d32\u0d3e\u0d35\u0d30\u0d41\u0d02 \u0d24\u0d41\u0d32\u0d4d\u0d2f\u0d3e\u0d35\u0d15\u0d3e\u0d36\u0d19\u0d4d\u0d19\u0d33\u0d4b\u0d1f\u0d41\u0d02 \u0d05\u0d28\u0d4d\u0d24\u0d38\u0d4d\u0d38\u0d4b\u0d1f\u0d41\u0d02 \u0d38\u0d4d\u0d35\u0d3e\u0d24\u0d28\u0d4d\u0d24\u0d4d\u0d30\u0d4d\u0d2f\u0d24\u0d4d\u0d24\u0d4b\u0d1f\u0d41\u0d02\u0d15\u0d42\u0d1f\u0d3f \u0d1c\u0d28\u0d3f\u0d1a\u0d4d\u0d1a\u0d3f\u0d1f\u0d4d\u0d1f\u0d41\u0d33\u0d4d\u0d33\u0d35\u0d30\u0d3e\u0d23\u0d4d\u200c. \u0d05\u0d28\u0d4d\u0d2f\u0d4b\u0d28\u0d4d\u0d2f\u0d02 \u0d2d\u0d4d\u0d30\u0d3e\u0d24\u0d43\u0d2d\u0d3e\u0d35\u0d24\u0d4d\u0d24\u0d4b\u0d1f\u0d46 \u0d2a\u0d46\u0d30\u0d41\u0d2e\u0d3e\u0d31\u0d41\u0d35\u0d3e\u0d28\u0d3e\u0d23\u0d4d\u200c \u0d2e\u0d28\u0d41\u0d37\u0d4d\u0d2f\u0d28\u0d4d\u0d28\u0d41 \u0d35\u0d3f\u0d35\u0d47\u0d15\u0d2c\u0d41\u0d26\u0d4d\u0d27\u0d3f\u0d2f\u0d41\u0d02 \u0d2e\u0d28\u0d38\u0d4d\u0d38\u0d3e\u0d15\u0d4d\u0d37\u0d3f\u0d2f\u0d41\u0d02 \u0d38\u0d3f\u0d26\u0d4d\u0d27\u0d2e\u0d3e\u0d2f\u0d3f\u0d30\u0d3f\u0d15\u0d4d\u0d15\u0d41\u0d28\u0d4d\u0d28\u0d24\u0d4d\u200c.", + "text": "Malayalam മനàµà´·àµà´¯à´°àµ†à´²àµà´²à´¾à´µà´°àµà´‚ à´¤àµà´²àµà´¯à´¾à´µà´•ാശങàµà´™à´³àµ‹à´Ÿàµà´‚ à´…à´¨àµà´¤à´¸àµà´¸àµ‹à´Ÿàµà´‚ à´¸àµà´µà´¾à´¤à´¨àµà´¤àµà´°àµà´¯à´¤àµà´¤àµ‹à´Ÿàµà´‚കൂടി ജനിà´àµà´à´¿à´Ÿàµà´Ÿàµà´³àµà´³à´µà´°à´¾à´£àµâ€Œ. 
à´…à´¨àµà´¯àµ‹à´¨àµà´¯à´‚ à´­àµà´°à´¾à´¤àµƒà´­à´¾à´µà´¤àµà´¤àµ‹à´Ÿàµ† പെരàµà´®à´¾à´±àµà´µà´¾à´¨à´¾à´£àµâ€Œ മനàµà´·àµà´¯à´¨àµà´¨àµ വിവേകബàµà´¦àµà´§à´¿à´¯àµà´‚ മനസàµà´¸à´¾à´•àµà´·à´¿à´¯àµà´‚ സിദàµà´§à´®à´¾à´¯à´¿à´°à´¿à´•àµà´•àµà´¨àµà´¨à´¤àµâ€Œ.", "metadata": { "languages": [ "mal" @@ -6009,7 +6009,7 @@ { "type": "NarrativeText", "element_id": "a1c5471ea369ac3ba44f2829262f62aa", - "text": "Malayalam \u0d2e\u0d28\u0d41\u0d37\u0d4d\u0d2f\u0d30\u0d46\u0d32\u0d4d\u0d32\u0d3e\u0d35\u0d30\u0d41\u0d02 \u0d24\u0d41\u0d32\u0d4d\u0d2f\u0d3e\u0d35\u0d15\u0d3e\u0d36\u0d19\u0d4d\u0d19\u0d33\u0d4b\u0d1f\u0d41\u0d02 \u0d05\u0d28\u0d4d\u0d24\u0d38\u0d4d\u0d38\u0d4b\u0d1f\u0d41\u0d02 \u0d38\u0d4d\u0d35\u0d3e\u0d24\u0d28\u0d4d\u0d24\u0d4d\u0d30\u0d4d\u0d2f\u0d24\u0d4d\u0d24\u0d4b\u0d1f\u0d41\u0d02\u0d15\u0d42\u0d1f\u0d3f \u0d1c\u0d28\u0d3f\u0d1a\u0d4d\u0d1a\u0d3f\u0d1f\u0d4d\u0d1f\u0d41\u0d33\u0d4d\u0d33\u0d35\u0d30\u0d3e\u0d23\u0d4d\u200c. \u0d05\u0d28\u0d4d\u0d2f\u0d4b\u0d28\u0d4d\u0d2f\u0d02 \u0d2d\u0d4d\u0d30\u0d3e\u0d24\u0d43\u0d2d\u0d3e\u0d35\u0d24\u0d4d\u0d24\u0d4b\u0d1f\u0d46 \u0d2a\u0d46\u0d30\u0d41\u0d2e\u0d3e\u0d31\u0d41\u0d35\u0d3e\u0d28\u0d3e\u0d23\u0d4d\u200c \u0d2e\u0d28\u0d41\u0d37\u0d4d\u0d2f\u0d28\u0d4d\u0d28\u0d41 \u0d35\u0d3f\u0d35\u0d47\u0d15\u0d2c\u0d41\u0d26\u0d4d\u0d27\u0d3f\u0d2f\u0d41\u0d02 \u0d2e\u0d28\u0d38\u0d4d\u0d38\u0d3e\u0d15\u0d4d\u0d37\u0d3f\u0d2f\u0d41\u0d02 \u0d38\u0d3f\u0d26\u0d4d\u0d27\u0d2e\u0d3e\u0d2f\u0d3f\u0d30\u0d3f\u0d15\u0d4d\u0d15\u0d41\u0d28\u0d4d\u0d28\u0d24\u0d4d\u200c.", + "text": "Malayalam മനàµà´·àµà´¯à´°àµ†à´²àµà´²à´¾à´µà´°àµà´‚ à´¤àµà´²àµà´¯à´¾à´µà´•ാശങàµà´™à´³àµ‹à´Ÿàµà´‚ à´…à´¨àµà´¤à´¸àµà´¸àµ‹à´Ÿàµà´‚ à´¸àµà´µà´¾à´¤à´¨àµà´¤àµà´°àµà´¯à´¤àµà´¤àµ‹à´Ÿàµà´‚കൂടി ജനിà´àµà´à´¿à´Ÿàµà´Ÿàµà´³àµà´³à´µà´°à´¾à´£àµâ€Œ. 
à´…à´¨àµà´¯àµ‹à´¨àµà´¯à´‚ à´­àµà´°à´¾à´¤àµƒà´­à´¾à´µà´¤àµà´¤àµ‹à´Ÿàµ† പെരàµà´®à´¾à´±àµà´µà´¾à´¨à´¾à´£àµâ€Œ മനàµà´·àµà´¯à´¨àµà´¨àµ വിവേകബàµà´¦àµà´§à´¿à´¯àµà´‚ മനസàµà´¸à´¾à´•àµà´·à´¿à´¯àµà´‚ സിദàµà´§à´®à´¾à´¯à´¿à´°à´¿à´•àµà´•àµà´¨àµà´¨à´¤àµâ€Œ.", "metadata": { "languages": [ "mal" @@ -6030,7 +6030,7 @@ { "type": "NarrativeText", "element_id": "abe9340337f1806d7c7bb1e55e23819f", - "text": "Maldivian \u0780\u07aa\u0783\u07a8\u0780\u07a7 \u0787\u07a8\u0782\u07b0\u0790\u07a7\u0782\u07aa\u0782\u07b0\u0788\u07ac\u0790\u07b0 \u078b\u07aa\u0782\u07a8\u0794\u07ac\u0787\u07a6\u0781\u07b0 \u0787\u07aa\u078a\u07a6\u0782\u07b0\u0788\u07a6\u0782\u07a9\u060c \u0789\u07a8\u0782\u07a8\u0788\u07a6\u0782\u07b0\u0786\u07a6\u0789\u07aa\u078e\u07a6\u0787\u07a8\u060c \u0780\u07a6\u0789\u07a6\u0780\u07a6\u0789\u07a6 \u0799\u07a6\u0787\u07b0\u07a4\u07aa\u078c\u07a6\u0786\u07a6\u0786\u07a7\u0787\u07ac\u0786\u07aa\u060c \u0780\u07a6\u0789\u07a6\u0780\u07a6\u0789\u07a6 \u078b\u07a6\u0783\u07a6\u0796\u07a6\u0787\u07ac\u0787\u07b0\u078e\u07a6\u0787\u07a8 \u0786\u07a6\u0789\u07ad\u0780\u07a8\u078c\u07ac\u0788\u07a8\u078e\u07ac\u0782\u07b0\u0788\u07a7 \u0784\u07a6\u0787\u07ac\u0787\u07b0\u078e\u07ac \u078e\u07ae\u078c\u07aa\u078e\u07a6\u0787\u07ac\u0788\u07ac. \u0780\u07ac\u0794\u07ae \u0788\u07a8\u0790\u07b0\u0782\u07aa\u0789\u07a7\u0787\u07a8\u060c \u0780\u07ac\u0794\u07ae\u0784\u07aa\u0787\u07b0\u078b\u07a9\u078e\u07ac \u0784\u07a7\u0783\u07aa \u0787\u07ac\u0789\u07a9\u0780\u07aa\u0782\u07b0\u0782\u07a6\u0781\u07b0 \u078d\u07a8\u0784\u07a8\u078e\u07ac\u0782\u07b0\u0788\u07ac\u0787\u07ac\u0788\u07ac. 
\u0787\u07a6\u078b\u07a8 \u0787\u07ac\u0786\u07a6\u0786\u07aa \u0787\u07a6\u0782\u07ac\u0786\u07a6\u0786\u07a7\u0789\u07ac\u078b\u07aa \u0787\u07ac\u0789\u07a9\u0780\u07aa\u0782\u07b0 \u0789\u07aa\u07a2\u07a7\u0789\u07a6\u078d\u07a7\u078c\u07b0 \u0786\u07aa\u0783\u07a6\u0782\u07b0\u0788\u07a7\u0782\u07a9\u060c \u0787\u07aa\u079a\u07aa\u0787\u07b0\u0788\u07a6\u078c\u07b0\u078c\u07ac\u0783\u07a8\u0786\u07a6\u0789\u07aa\u078e\u07ac \u0783\u07ab\u0799\u07ac\u0787\u07b0\u078e\u07a6\u0787\u07ac\u0788\u07ac.", + "text": "Maldivian ̃€̃ª̃ƒ̃¨̃€̃§ ̃‡̃¨̃‚̃°̃̃§̃‚̃ª̃‚̃°̃ˆ̃¬̃̃° ̃‹̃ª̃‚̃¨̃”̃¬̃‡̃¦̃̃° ̃‡̃ª̃̃¦̃‚̃°̃ˆ̃¦̃‚̃©ØŒ ̃‰̃¨̃‚̃¨̃ˆ̃¦̃‚̃°̃†̃¦̃‰̃ª̃̃¦̃‡̃¨ØŒ ̃€̃¦̃‰̃¦̃€̃¦̃‰̃¦ ̃™̃¦̃‡̃°̃¤̃ª̃Œ̃¦̃†̃¦̃†̃§̃‡̃¬̃†̃ªØŒ ̃€̃¦̃‰̃¦̃€̃¦̃‰̃¦ ̃‹̃¦̃ƒ̃¦̃–̃¦̃‡̃¬̃‡̃°̃̃¦̃‡̃¨ ̃†̃¦̃‰̃­̃€̃¨̃Œ̃¬̃ˆ̃¨̃̃¬̃‚̃°̃ˆ̃§ ̃„̃¦̃‡̃¬̃‡̃°̃̃¬ ̃̃®̃Œ̃ª̃̃¦̃‡̃¬̃ˆ̃¬. ̃€̃¬̃”̃® ̃ˆ̃¨̃̃°̃‚̃ª̃‰̃§̃‡̃¨ØŒ ̃€̃¬̃”̃®̃„̃ª̃‡̃°̃‹̃©̃̃¬ ̃„̃§̃ƒ̃ª ̃‡̃¬̃‰̃©̃€̃ª̃‚̃°̃‚̃¦̃̃° ̃̃¨̃„̃¨̃̃¬̃‚̃°̃ˆ̃¬̃‡̃¬̃ˆ̃¬. ̃‡̃¦̃‹̃¨ ̃‡̃¬̃†̃¦̃†̃ª ̃‡̃¦̃‚̃¬̃†̃¦̃†̃§̃‰̃¬̃‹̃ª ̃‡̃¬̃‰̃©̃€̃ª̃‚̃° ̃‰̃ª̃¢̃§̃‰̃¦̃̃§̃Œ̃° ̃†̃ª̃ƒ̃¦̃‚̃°̃ˆ̃§̃‚̃©ØŒ ̃‡̃ª̃̃ª̃‡̃°̃ˆ̃¦̃Œ̃°̃Œ̃¬̃ƒ̃¨̃†̃¦̃‰̃ª̃̃¬ ̃ƒ̃«̃™̃¬̃‡̃°̃̃¦̃‡̃¬̃ˆ̃¬.", "metadata": { "languages": [ "ara" @@ -6051,7 +6051,7 @@ { "type": "NarrativeText", "element_id": "c3f212c4f2a219b94139b577bd336587", - "text": "Maltese Il-bnedmin kollha jitwieldu \u0127ielsa u ugwali fid-dinjit\u00e0 u d-drittijiet. Huma mog\u0127nija bir-ra\u0121uni u bil-kuxjenza u g\u0127andhom i\u0121ibu ru\u0127hom ma\u2019 xulxin bi spirtu ta\u2019 a\u0127wa.", + "text": "Maltese Il-bnedmin kollha jitwieldu ħielsa u ugwali fid-dinjitĂ  u d-drittijiet. Huma mogħnija bir-raÄ¡uni u bil-kuxjenza u għandhom iÄ¡ibu ruħhom ma’ xulxin bi spirtu ta’ aħwa.", "metadata": { "languages": [ "hrv", @@ -6094,7 +6094,7 @@ { "type": "NarrativeText", "element_id": "53014d120e3ef288a2152a64e8cc5fae", - "text": "Maninkakan, Eastern Adamadennu b\u025b\u025b s\u0254d\u0254n\u0272a kakan, h\u0254r\u0254ya d\u0254, fabaden\u0272a d\u0254 ani sariya ta fan d\u0254. 
Hankili ni s\u0254n\u0254m\u025b ye alu b\u025b\u025b ma, a kakan wo d\u0254 alu ye bakelen\u0272a sila lataaman alu \u0272\u0254\u0254n t\u025b.", + "text": "Maninkakan, Eastern Adamadennu bɛɛ sÉ”dÉ”nɲa kakan, hÉ”rÉ”ya dÉ”, fabadenɲa dÉ” ani sariya ta fan dÉ”. Hankili ni sÉ”nÉ”mÉ› ye alu bɛɛ ma, a kakan wo dÉ” alu ye bakelenɲa sila lataaman alu ɲɔɔn tÉ›.", "metadata": { "languages": [ "ind", @@ -6160,7 +6160,7 @@ { "type": "UncategorizedText", "element_id": "dae3f973f6bbdd3401ce4aa3e297b361", - "text": "Mapudungun Kom pu mogence kisuzuam mvlekey, kom cegeygvn, logkogeygvn ka piwkegeygvn, nieygvn kimvn fey mew mvley ta\u00f1i yamniewael ka epu\u00f1pvle kejuwael egvn.", + "text": "Mapudungun Kom pu mogence kisuzuam mvlekey, kom cegeygvn, logkogeygvn ka piwkegeygvn, nieygvn kimvn fey mew mvley tañi yamniewael ka epuñpvle kejuwael egvn.", "metadata": { "languages": [ "ind", @@ -6183,7 +6183,7 @@ { "type": "NarrativeText", "element_id": "ecca335c6a309f063e4df0ad38eecd27", - "text": "Marathi \u0938\u0930\u094d\u0935 \u092e\u093e\u0928\u0935\u0940 \u0935\u094d\u092f\u0915\u094d\u0924\u093f \u091c\u0928\u094d\u092e\u0924\u0903\u091a \u0938\u094d\u0935\u0924\u0902\u0924\u094d\u0930 \u0906\u0939\u0947\u0924 \u0935 \u0924\u094d\u092f\u093e\u0902\u0928\u093e \u0938\u092e\u093e\u0928 \u092a\u094d\u0930\u0924\u093f\u0937\u094d\u0920\u093e \u0935 \u0938\u092e\u093e\u0928 \u0905\u0927\u093f\u0915\u093e\u0930 \u0906\u0939\u0947\u0924. \u0924\u094d\u092f\u093e\u0902\u0928\u093e \u0935\u093f\u091a\u093e\u0930\u0936\u0915\u094d\u0924\u093f \u0935 \u0938\u0926\u0938\u0935\u093f\u0926\u094d\u0935\u0947\u0915\u092c\u0941\u0926\u094d\u0927\u093f \u0932\u093e\u092d\u0932\u0947\u0932\u0940 \u0906\u0939\u0947. 
\u0935 \u0924\u094d\u092f\u093e\u0902\u0928\u0940 \u090f\u0915\u092e\u0947\u0915\u093e\u0902\u0936\u0940 \u092c\u0902\u0927\u0941\u0924\u094d\u092f\u093e\u091a\u094d\u092f\u093e \u092d\u093e\u0935\u0928\u0947\u0928\u0947 \u0906\u091a\u0930\u0923 \u0915\u0930\u093e\u0935\u0947.", + "text": "Marathi सरà¥à¤µ मानवी वà¥à¤¯à¤•à¥à¤¤à¤¿ जनà¥à¤®à¤¤à¤ƒà¤ सà¥à¤µà¤¤à¤‚तà¥à¤° आहेत व तà¥à¤¯à¤¾à¤‚ना समान पà¥à¤°à¤¤à¤¿à¤·à¥à¤ à¤¾ व समान अधिकार आहेत. तà¥à¤¯à¤¾à¤‚ना विà¤à¤¾à¤°à¤¶à¤•à¥à¤¤à¤¿ व सदसविदà¥à¤µà¥‡à¤•बà¥à¤¦à¥à¤§à¤¿ लाभलेली आहे. व तà¥à¤¯à¤¾à¤‚नी à¤à¤•मेकांशी बंधà¥à¤¤à¥à¤¯à¤¾à¤à¥à¤¯à¤¾ भावनेने आà¤à¤°à¤£ करावे.", "metadata": { "languages": [ "mar" @@ -6225,7 +6225,7 @@ { "type": "NarrativeText", "element_id": "3a69fb7fe5d36459edf30ffa8f0fb0bc", - "text": "Mats\u00e9s Chidon tishaido yec matses abitedimbo b\u00ebdamboec isnanac b\u00ebdambo ictsiash. Chieshnanac icsambo ictsiash. Abitedimbo b\u00ebdamboec tabadac b\u00ebdambo ictsiash. Shubu abents\u00ebcquid\u00ebn tabadac birnboec abitedi tabadac b\u00ebdambo ictsiash - quequin chuipan\u00ebdash nidaid abitedino\u00ebsh cho-choquidon.", + "text": "MatsĂ©s Chidon tishaido yec matses abitedimbo bĂ«damboec isnanac bĂ«dambo ictsiash. Chieshnanac icsambo ictsiash. Abitedimbo bĂ«damboec tabadac bĂ«dambo ictsiash. 
Shubu abentsĂ«cquidĂ«n tabadac birnboec abitedi tabadac bĂ«dambo ictsiash - quequin chuipanĂ«dash nidaid abitedinoĂ«sh cho-choquidon.", "metadata": { "languages": [ "eng", @@ -6247,7 +6247,7 @@ { "type": "NarrativeText", "element_id": "9c3467ac29002d9da69f15b063e13924", - "text": "Maya, Yucat\u00e1n Tul\u00e1akal w\u00edinik ku s\u00edijil j\u00e1alk\u02bcab yetel keet u tsiikul yetel Najmal Sijnalil, beytun xan na\u02bcata\u02bcan sijnalil yetel no\u02bcoja\u02bcanil u tuukulo\u02bc, k\u02bca\u02bcabet u bisikuba bey l\u00e1aktzilil yetel tul\u00e1akal u baatzile\u02bc.", + "text": "Maya, YucatĂ¡n TulĂ¡akal wĂ­inik ku sĂ­ijil jĂ¡alkʼab yetel keet u tsiikul yetel Najmal Sijnalil, beytun xan naʼataʼan sijnalil yetel noʼojaʼanil u tuukuloʼ, kʼaʼabet u bisikuba bey lĂ¡aktzilil yetel tulĂ¡akal u baatzileʼ.", "metadata": { "languages": [ "hun", @@ -6271,7 +6271,7 @@ { "type": "UncategorizedText", "element_id": "7947c1a7d2c92cd1fea5311d4d9241ba", - "text": "Mazahua Central Texe yo nte\u0331'e\u0331 chjetrjoji, angezeji ximi xo'oji \u00f1eje k'inchiji, nesta ra ngara na jo'o k'o dyaja e nte\u0331'e\u0331.", + "text": "Mazahua Central Texe yo ntè±'è± chjetrjoji, angezeji ximi xo'oji ñeje k'inchiji, nesta ra ngara na jo'o k'o dyaja e ntè±'è±.", "metadata": { "languages": [ "hrv", @@ -6294,7 +6294,7 @@ { "type": "NarrativeText", "element_id": "ded8e8298bf9edcaae477d35c01be283", - "text": "Mazatec, Ixcatl\u00e1n Nga ndindie xuta ngatsen de\u2019e ko ngondsejen ngatjin-kjua nga xchandinkon nt\u2019a ngondsejen ngatjin kokjin-tokon,kotjinkjua nga takie engajan skuendinkon xkjin.", + "text": "Mazatec, IxcatlĂ¡n Nga ndindie xuta ngatsen de’e ko ngondsejen ngatjin-kjua nga xchandinkon nt’a ngondsejen ngatjin kokjin-tokon,kotjinkjua nga takie engajan skuendinkon xkjin.", "metadata": { "languages": [ "sqi", @@ -6337,7 +6337,7 @@ { "type": "NarrativeText", "element_id": "407b0080d05f944ba83f5c3e722bde13", - "text": "Mbundu (009) Mutu uoso uoso a mu vuala ni ufolo ni kutena kumoxi mu 
kijingu ni mu ubinganu. Mu kilembu kia kubanga ni mu ubanzelu, Atena u\u00ea kubanga ioso kua akua mu muxima ua tululuka mba upange.", + "text": "Mbundu (009) Mutu uoso uoso a mu vuala ni ufolo ni kutena kumoxi mu kijingu ni mu ubinganu. Mu kilembu kia kubanga ni mu ubanzelu, Atena uĂª kubanga ioso kua akua mu muxima ua tululuka mba upange.", "metadata": { "languages": [ "swa" @@ -6358,7 +6358,7 @@ { "type": "NarrativeText", "element_id": "d76da3518499aeb0e43b4c133556d135", - "text": "Mende Numuvuisia Kp\u025bl\u025b\u025b ta ti le t\u025b y\u025b nduw\u0254 ya hu, tao ti nuvuu yei k\u025b\u025b ti l\u0254nyi maa h\u025bwung\u0254. Kiiya k\u025b\u025b hindaluahu g\u0254\u0254la a y\u025bl\u0254 ti hun. Fale mahoung\u0254 ti ti ny\u0254ny\u0254hu hoi kia ndeegaa.", + "text": "Mende Numuvuisia KpÉ›lɛɛ ta ti le tÉ› yÉ› nduwÉ” ya hu, tao ti nuvuu yei kɛɛ ti lÉ”nyi maa hÉ›wungÉ”. Kiiya kɛɛ hindaluahu gɔɔla a yÉ›lÉ” ti hun. Fale mahoungÉ” ti ti nyÉ”nyÉ”hu hoi kia ndeegaa.", "metadata": { "languages": [ "swa", @@ -6380,7 +6380,7 @@ { "type": "NarrativeText", "element_id": "ac3c7d9dea662f8ba1dfb383045ce903", - "text": "Micmac Msit mimajulnu\u2019k weskwijinu\u2019ltijik alsumsultijik aqq newte\u2019 tett wkpimte\u2019tmut aqq koqwajo\u2019taqnn wejkul\u2019aqmititl.", + "text": "Micmac Msit mimajulnu’k weskwijinu’ltijik alsumsultijik aqq newte’ tett wkpimte’tmut aqq koqwajo’taqnn wejkul’aqmititl.", "metadata": { "languages": [ "est", @@ -6425,7 +6425,7 @@ { "type": "NarrativeText", "element_id": "208949d3fb140dd9413f78a99feda832", - "text": "M\u00edskito Upla sut ba kulkanka lakara, airaitka nanira bara pri, sin, aikuki, baku takisa. Bamna sins laka bri baku, lukanka bain pri baku aimuihni lakara, pana pana tabaikan kaiasa.", + "text": "MĂ­skito Upla sut ba kulkanka lakara, airaitka nanira bara pri, sin, aikuki, baku takisa. 
Bamna sins laka bri baku, lukanka bain pri baku aimuihni lakara, pana pana tabaikan kaiasa.", "metadata": { "languages": [ "ind" @@ -6446,7 +6446,7 @@ { "type": "NarrativeText", "element_id": "db840a4da82f82310ee839cd22112f22", - "text": "Mixe, Totontepec Tum akijpxa xa ve\u2019e jayu kye\u2019ex, ve\u2019em ax j\u00f6\u2019n tyukidaakj\u00fcva tijaty m\u00ebkin; ve\u2019empa axj\u00f6\u2019n j\u00e4 jy\u00f6\u00f6jtykin di yaknaxy, jats oy myujaty\u00f6\u00f6\u2019t\u00ebjk di m\u00eb\u00ebt nayjavaj\u00fct.", + "text": "Mixe, Totontepec Tum akijpxa xa ve’e jayu kye’ex, ve’em ax jö’n tyukidaakjĂ¼va tijaty mĂ«kin; ve’empa axjö’n jä jyööjtykin di yaknaxy, jats oy myujatyöö’tĂ«jk di mëët nayjavajĂ¼t.", "metadata": { "languages": [ "fin" @@ -6467,7 +6467,7 @@ { "type": "NarrativeText", "element_id": "bbe9fa33187b976f4032c34c6ca2fabf", - "text": "Mixtec, Metlat\u00f3noc Taka ma \u00f1ayi nguiakoi \u00f1ayivi \u00f1atu na ja'a tnu'u ja kusa'a ndeva'\u00f1a-i, su'uva kajito va'a\u00f1a-i, yuka ku ja jini\u00f1u'u ja kukototna-i.", + "text": "Mixtec, MetlatĂ³noc Taka ma ñayi nguiakoi ñayivi ñatu na ja'a tnu'u ja kusa'a ndeva'ña-i, su'uva kajito va'aña-i, yuka ku ja jiniñu'u ja kukototna-i.", "metadata": { "languages": [ "hrv", @@ -6489,7 +6489,7 @@ { "type": "NarrativeText", "element_id": "03b6cefe8d16c5c896f974b268a52302", - "text": "Mizo Mi zawng zawng hi zal\u00eana piang kan ni a, zahawmna leh dikna chanvoah intluk tl\u00e2ng vek kan ni. Chhia leh tha hriatna f\u00eem neia siam kan nih avangin kan mihring puite chungah inunauna thinlung kan pu tlat tur a ni.", + "text": "Mizo Mi zawng zawng hi zalĂªna piang kan ni a, zahawmna leh dikna chanvoah intluk tlĂ¢ng vek kan ni. 
Chhia leh tha hriatna fĂ®m neia siam kan nih avangin kan mihring puite chungah inunauna thinlung kan pu tlat tur a ni.", "metadata": { "languages": [ "ind", @@ -6512,7 +6512,7 @@ { "type": "NarrativeText", "element_id": "ec6cdd4d644ddfaafbb05d9216ebbd7c", - "text": "Moba Nifoi kul maal yendu buam po i, k b yudand yen b yiko-nba bi\u025b ja. B m\u0254g maalm g ban yal g \u014ban, g bi\u025b baa bu yen lieb naataann n nin\u014b i.", + "text": "Moba Nifoi kul maal yendu buam po i, k b yudand yen b yiko-nba biÉ› ja. B mÉ”g maalm g ban yal g Å‹an, g biÉ› baa bu yen lieb naataann n ninÅ‹ i.", "metadata": { "languages": [ "ind", @@ -6535,7 +6535,7 @@ { "type": "UncategorizedText", "element_id": "0d21e19f00c8cb7264e83c01c0f02161", - "text": "Mon \u1019\u105e\u102d\u101f\u103a\u1002\u1019\u1060\u102d\u102f\u105a\u103a \u1021\u102d\u102f\u103f\u102e\u102f\u1010\u1021\u103a\u101d\u103d\u1036 \u1005\u1014\u1030\u101e\u1060\u1038\u1010\u102d\u1010\u103a \u1014\u1030\u1002\u101d\u103a\u1002\u105e\u1034 \u1012\u103e\u103a\u1019\u105e\u102d\u101f\u103a\u101e\u1060\u1038\u1015\u103d\u1038\u1021\u102d\u102f\u1010\u103a\u1010\u102f\u1032 \u1021\u1001\u1031\u102b\u105a\u103a\u1021\u101b\u102c \u1000\u1031\u102f\u102c\u1036 \u101e\u102d\u1000\u1039\u1001\u102c\u1019\u105e\u102d\u101f\u103a\u1010\u1021\u103a \u1010\u102f\u1015\u103a \u101e\u105f\u101f\u103a\u101b\u104b \u1019\u105e\u102d\u101f\u103a\u1010\u1021\u103a\u1002\u103e\u103a \u1014\u103d\u1036\u1000\u1035\u102f\u1013\u101b\u103a\u1005\u105a\u103a\u1001\u103c\u105a\u103a\u1000\u1031\u102f\u102c\u1036 \u101e\u1019\u1039\u1010\u102e\u100a\u102c\u100f\u103a \u1013\u101d\u103a\u1015\u102b\u103a\u1015\u1032\u102b \u1001\u102d\u102f\u101f\u103a\u1015\u101b\u1031\u1036\u1014\u103d\u1036\u1010\u102f\u1032 \u100a\u1038\u1019\u103d\u1032 \u1000\u1031\u102f\u102c\u1036 \u100a\u1038\u1019\u103d\u1032 \u1011\u1031\u1000\u103a\u1000\u1035\u102f \u101e\u1039\u1012\u1038\u1012\u1039\u1002\u1031\u1010\u103a\u1017\u1000\u103a 
\u1006\u1000\u103a\u1006\u1031\u102c\u1036\u100a\u1038\u101e\u1039\u1000\u1021\u103a \u1014\u1005\u102d\u102f\u1010\u103a\u1013\u102c\u1010\u103a\u1000\u1031\u102c\u1036\u1012\u1031\u1036\u1021\u101b\u1031\u104b", + "text": "Mon မáိဟ်ဂမá á€­á€¯á် အိုဿီုá€á€¡á€ºá€á€½á€¶ စနူá€á á€¸á€á€­á€á€º နူဂá€á€ºá€‚áဴ ဒှ်မáိဟ်á€á á€¸á€•ွးအိုá€á€ºá€á€¯á€² အá€á€±á€«á်အရာ ကေုာံ á€á€­á€€á€¹á€á€¬á€™áိဟ်á€á€¡á€º á€á€¯á€•် á€áŸá€Ÿá€ºá€›á‹ မáိဟ်á€á€¡á€ºá€‚ှ် နွံကဵုဓရ်စá်á€á€¼á်ကေုာံ á€á€™á€¹á€á€®á€á€¬á€á€º ဓá€á€ºá€•ါ်ပဲါ á€á€­á€¯á€Ÿá€ºá€•ရေံနွံá€á€¯á€² á€á€¸á€™á€½á€² ကေုာံ á€á€¸á€™á€½á€² ထေက်ကဵု á€á€¹á€’းဒ္ဂေá€á€ºá€—က် ဆက်ဆောံá€á€¸á€á€¹á€€á€¡á€º နစိုá€á€ºá€“ာá€á€ºá€€á€±á€¬á€¶á€’ေံအရေá‹", "metadata": { "filetype": "text/plain", "data_source": { @@ -6553,7 +6553,7 @@ { "type": "NarrativeText", "element_id": "a36553665277971db5d4c68908f99088", - "text": "Mongolian, Halh (Cyrillic) \u0425\u04af\u043d \u0431\u04af\u0440 \u0442\u04e9\u0440\u0436 \u043c\u044d\u043d\u0434\u043b\u044d\u0445\u044d\u0434 \u044d\u0440\u0445 \u0447\u04e9\u043b\u04e9\u04e9\u0442\u044d\u0439, \u0430\u0434\u0438\u043b\u0445\u0430\u043d \u043d\u044d\u0440 \u0442\u04e9\u0440\u0442\u044d\u0439, \u0438\u0436\u0438\u043b \u044d\u0440\u0445\u0442\u044d\u0439 \u0431\u0430\u0439\u0434\u0430\u0433. \u041e\u044e\u0443\u043d \u0443\u0445\u0430\u0430\u043d, \u043d\u0430\u043d\u0434\u0438\u043d \u0447\u0430\u043d\u0430\u0440 \u0437\u0430\u044f\u0430\u0441\u0430\u043d \u0445\u04af\u043d \u0433\u044d\u0433\u0447 \u04e9\u04e9\u0440 \u0445\u043e\u043e\u0440\u043e\u043d\u0434\u043e\u043e \u0430\u0445\u0430\u043d \u0434\u04af\u04af\u0433\u0438\u0439\u043d \u04af\u0437\u044d\u043b \u0441\u0430\u043d\u0430\u0430\u0433\u0430\u0430\u0440 \u0445\u0430\u0440\u044c\u0446\u0430\u0445 \u0443\u0447\u0438\u0440\u0442\u0430\u0439.", + "text": "Mongolian, Halh (Cyrillic) Đ¥̉¯Đ½ б̉¯Ñ€ Ñ‚Ó©Ñ€Đ¶ Đ¼ÑĐ½Đ´Đ»ÑÑ…ÑĐ´ Ñрх Ñ‡Ó©Đ»Ó©Ó©Ñ‚ÑĐ¹, Đ°Đ´Đ¸Đ»Ñ…Đ°Đ½ Đ½ÑÑ€ төртÑĐ¹, ижил ÑрхтÑĐ¹ Đ±Đ°Đ¹Đ´Đ°Đ³. 
ĐÑÑƒĐ½ ÑƒÑ…Đ°Đ°Đ½, Đ½Đ°Đ½Đ´Đ¸Đ½ Ñ‡Đ°Đ½Đ°Ñ€ заÑаÑĐ°Đ½ Ñ…̉¯Đ½ Đ³ÑĐ³Ñ‡ Ó©Ó©Ñ€ Ñ…Đ¾Đ¾Ñ€Đ¾Đ½Đ´Đ¾Đ¾ Đ°Ñ…Đ°Đ½ Đ´̉¯̉¯Đ³Đ¸Đ¹Đ½ ̉¯Đ·ÑĐ» ÑĐ°Đ½Đ°Đ°Đ³Đ°Đ°Ñ€ Ñ…Đ°Ñ€ÑŒÑ†Đ°Ñ… ÑƒÑ‡Đ¸Ñ€Ñ‚Đ°Đ¹.", "metadata": { "languages": [ "rus" @@ -6595,7 +6595,7 @@ { "type": "UncategorizedText", "element_id": "ffd087e56c47b9405e77d2f08dca7d1e", - "text": "\u182c\u1826\u182e\u1826\u1828 \u182a\u1826\u1837 \u1832\u1825\u1837\u1825\u1835\u1826 \u182e\u1821\u1828\u1833\u1821\u182f\u1821\u182c\u1826 \u1821\u1837\u182c\u1821 \u1834\u1822\u182f\u1825\u182d\u1821\u202f\u1832\u1821\u1822\u1802 \u1820\u1833\u1820\u182f\u1822\u182c\u1820\u1828 \u1828\u1821\u1837\u180e\u1821 \u1832\u1825\u1837\u1825\u202f\u1832\u1821\u1822\u1802 \u1822\u1835\u1822\u182f \u1821\u1837\u182c\u1821\u202f\u1832\u1821\u1822 \u182a\u1820\u1822\u1820\u182d\u1803 \u1823\u1836\u1824\u1828 \u1824\u182c\u1820\u182d\u1820\u1828\u1802 \u1828\u1820\u1828\u1833\u1822\u1828 \u1834\u1822\u1828\u1820\u1837 \u1835\u1820\u1836\u1820\u182d\u1820\u1830\u1820\u1828 \u182c\u1826\u182e\u1826\u1828 \u182c\u1821\u182d\u1834\u1822 \u1825\u182d\u1821\u1837\u180e\u1821 \u182c\u1823\u182d\u1823\u1837\u1823\u1828\u1833\u1823\u180e\u1828 \u1820\u182c\u1820\u1828 \u1833\u1821\u182d\u1826\u1826\u202f\u1822\u1828 \u1826\u1835\u1822\u182f \u1830\u1820\u1828\u1820\u182d\u1820\u202f\u1825\u1820\u1837 \u182c\u1820\u1837\u1822\u1834\u1820\u182c\u1825 \u1824\u1834\u1822\u1837\u202f\u1832\u1820\u1822\u1803", + "text": "ᠬᠦᠮᠦᠨ ᠪᠦᠷ ᠲᠥᠷᠥᠵᠦ ᠮᠡᠨᠳᠡᠯᠡᠬᠦ ᠡᠷᠬᠡ ᠴᠢᠯᠥᠭᠡ ᠲᠡᠢ᠂ ᠠᠳᠠᠯᠢᠬᠠᠨ ᠨᠡᠷá á ¡ ᠲᠥᠷᠥ ᠲᠡᠢ᠂ ᠢᠵᠢᠯ ᠡᠷᠬᠡ ᠲᠡᠢ ᠪᠠᠢᠠᠭ᠃ ᠣᠶᠤᠨ ᠤᠬᠠᠭᠠᠨ᠂ ᠨᠠᠨᠳᠢᠨ ᠴᠢᠨᠠᠷ ᠵᠠᠶᠠᠭᠠᠰᠠᠨ ᠬᠦᠮᠦᠨ ᠬᠡᠭᠴᠢ ᠥᠭᠡᠷá á ¡ ᠬᠣᠭᠣᠷᠣᠨᠳᠣá á ¨ ᠠᠬᠠᠨ ᠳᠡᠭᠦᠦ ᠢᠨ ᠦᠵᠢᠯ ᠰᠠᠨᠠᠭᠠ ᠥᠠᠷ ᠬᠠᠷᠢᠴᠠᠬᠥ ᠤᠴᠢᠷ ᠲᠠᠢ᠃", "metadata": { "filetype": "text/plain", "data_source": { @@ -6613,7 +6613,7 @@ { "type": "NarrativeText", "element_id": "3d0a59b543e077c2f0c391add9b38a89", - "text": "Montenegrin Sva ljudska bi\u0107a ra\u0111aju se slobodna i jednaka u dostojanstvu i pravima. 
Ona su obdarena razumom i savje\u0161\u0107u i jedni prema drugima treba da postupaju u duhu bratstva.", + "text": "Montenegrin Sva ljudska bića raÄ‘aju se slobodna i jednaka u dostojanstvu i pravima. Ona su obdarena razumom i savješću i jedni prema drugima treba da postupaju u duhu bratstva.", "metadata": { "languages": [ "hrv" @@ -6634,7 +6634,7 @@ { "type": "NarrativeText", "element_id": "86eff2400c116e5d00b9f1b3e17e0d7f", - "text": "M\u00f2or\u00e9 Ninsaalb\u00e3 f\u00e3a s\u00e3 n doge, ned f\u00e3a so a menga, ned pa rogd n yaa yamb ye, neb\u00e3 f\u00e3a zema taab b yel-segd\u0269 la b burk\u0129ndlem w\u025b\u025bnge\u0303. Neb\u00e3 f\u00e3a tara yam la tagsgo, ned f\u00e3a togame n v\u0269\u0269nd ne a to saam-biir p\u028age\u0303.", + "text": "MĂ²orĂ© NinsaalbĂ£ fĂ£a sĂ£ n doge, ned fĂ£a so a menga, ned pa rogd n yaa yamb ye, nebĂ£ fĂ£a zema taab b yel-segdÉ© la b burkÄ©ndlem wɛɛngèƒ. NebĂ£ fĂ£a tara yam la tagsgo, ned fĂ£a togame n vɩɩnd ne a to saam-biir pÊgèƒ.", "metadata": { "languages": [ "som", @@ -6657,7 +6657,7 @@ { "type": "NarrativeText", "element_id": "91eb2842523b8e930ee6199a0098fa14", - "text": "Moro Le\u0111a pre\u0111 lal\u01dd\u014b\u01ddnia l\u00ebb\u01ddr\u00ebinialo na l\u01dd\u027d\u01ddwa\u1e6fo e\u014ben \u014b\u01dd\u0111amia na e\u014ben pre\u0111 i\u014bi \u014b\u01ddrca\u0111a\u1e6fo \u1e6fa le\u0111a al\u01ddfi\u0111i. L\u00ebn\u014bulu pre\u0111 lanan\u00ebinu \u0111\u01ddnaca \u0111ame \u027det\u01dd\u027deto na ara g\u01dd\u014b\u01ddra \u014ben\u014ban\u1e6fa al\u01dd\u027d\u01ddwa\u0111a\u1e6fe alam\u01dd\u0111ai\u0111e b\u01dd\u027dan usilaga g\u01dd\u014b\u01ddl\u01dd\u014b\u01ddnia na g\u01dd\u014borba.", + "text": "Moro LeÄ‘a preÄ‘ lalÇÅ‹Çnia lĂ«bÇrĂ«inialo na lÇɽÇwaṯo eÅ‹en Å‹ÇÄ‘amia na eÅ‹en preÄ‘ iÅ‹i Å‹ÇrcaÄ‘aṯo ṯa leÄ‘a alÇfiÄ‘i. 
LĂ«nÅ‹ulu preÄ‘ lananĂ«inu Ä‘Çnaca Ä‘ame ɽetÇɽeto na ara gÇÅ‹Çra Å‹enÅ‹anṯa alÇɽÇwaÄ‘aṯe alamÇÄ‘aiÄ‘e bÇɽan usilaga gÇÅ‹ÇlÇÅ‹Çnia na gÇÅ‹orba.", "metadata": { "languages": [ "hrv" @@ -6700,7 +6700,7 @@ { "type": "NarrativeText", "element_id": "25ab4cdce4c3199b55a4bd49864e981b", - "text": "Naga, Ao Meimchir ajak temeten aser tashi kasa n\u00fcji nung asor. Parnok dak bilemtetts\u00fc shisats\u00fc aser tangatetba kasa ag\u00fcja aliba jagi k\u00fclem adianu rongnung tanela ka nung lungjema alits\u00fcla.", + "text": "Naga, Ao Meimchir ajak temeten aser tashi kasa nĂ¼ji nung asor. Parnok dak bilemtettsĂ¼ shisatsĂ¼ aser tangatetba kasa agĂ¼ja aliba jagi kĂ¼lem adianu rongnung tanela ka nung lungjema alitsĂ¼la.", "metadata": { "languages": [ "ind", @@ -6744,7 +6744,7 @@ { "type": "NarrativeText", "element_id": "9376ea8b7100165bb8bd466c00f5bdcc", - "text": "Nanai \u0425\u044d\u043c\u0442\u0443 \u043d\u0430\u0438\u0306\u0441\u0430\u043b \u0433\u0438\u043f\u0430\u043b\u0438\u043d, \u043c\u044d\u043d\u044d \u0433\u044d\u0431\u0443\u0434\u0438\u044d\u0440\u0438, \u043f\u0440\u0430\u0432\u043e\u0441\u0430\u043b\u0434\u0438\u0430\u0440\u0438 \u044d\u043c\u0443\u0442\u0443 \u0431\u0430\u043b\u0434\u0438\u0447\u0438. \u041d\u0435\u0308\u0430\u043d\u0447\u0438 \u043c\u0443\u0440\u0443\u04c8\u043a\u0443, \u0434\u044d\u0440\u044d\u043b\u043a\u0443, \u0434\u0438\u0430 \u0434\u0438\u0430\u0432\u0430\u0440\u0438 \u0430-\u043d\u044d\u0443-\u043c\u044d\u0442 \u0431\u043e\u0434\u043e\u043c\u0430\u0440\u0438 \u0442\u0430\u0433\u0438\u043b\u0430\u0438\u0306\u0447\u0438.", + "text": "Nanai Đ¥ÑĐ¼Ñ‚Ñƒ Đ½Đ°Đ¸̀†Ñал Đ³Đ¸Đ¿Đ°Đ»Đ¸Đ½, Đ¼ÑĐ½Ñ Đ³ÑĐ±ÑƒĐ´Đ¸ÑÑ€Đ¸, Đ¿Ñ€Đ°Đ²Đ¾ÑĐ°Đ»Đ´Đ¸Đ°Ñ€Đ¸ ÑĐ¼ÑƒÑ‚Ñƒ Đ±Đ°Đ»Đ´Đ¸Ñ‡Đ¸. 
ĐѐˆĐ°Đ½Ñ‡Đ¸ Đ¼ÑƒÑ€ÑƒÓˆĐºÑƒ, Đ´ÑÑ€ÑĐ»ĐºÑƒ, диа Đ´Đ¸Đ°Đ²Đ°Ñ€Đ¸ а-Đ½Ñу-Đ¼ÑÑ‚ Đ±Đ¾Đ´Đ¾Đ¼Đ°Ñ€Đ¸ Ñ‚Đ°Đ³Đ¸Đ»Đ°Đ¸̀†Ñ‡Đ¸.", "metadata": { "languages": [ "rus" @@ -6765,7 +6765,7 @@ { "type": "NarrativeText", "element_id": "201308d749f47555d03c5087f304457b", - "text": "Navajo Bila\u02bcashda\u02bcii t\u02bc\u00e1\u00e1 a\u0142tsoh yin\u00edk\u02bcehgo bidizhch\u012fh d\u00f3\u00f3 ahee\u0142t\u02bceego \u00edl\u012f\u0301\u012f\u0301go bee baah\u00f3ch\u012f\u02bc. E\u00ed\u00ed h\u00e1n\u00ed\u02bc d\u00f3\u00f3 h\u00e1n\u00edtshakees hwiihdaasya\u02bc e\u00ed\u00ed binahj\u012f\u0301\u02bc ahidin\u00ed\u0142n\u00e1hgo \u00e1l\u00edleek\u02bcehgo k\u02bc\u00e9 bee ahi\u0142 niidl\u012f\u0301.", + "text": "Navajo Bilaʼashdaʼii tÊ¼Ă¡Ă¡ aÅ‚tsoh yinĂ­kʼehgo bidizhchįh dĂ³Ă³ aheeÅ‚tʼeego Ă­lį̀į̀go bee baahĂ³chįʼ. EĂ­Ă­ hĂ¡nĂ­Ê¼ dĂ³Ă³ hĂ¡nĂ­tshakees hwiihdaasyaʼ eĂ­Ă­ binahjį̀ʼ ahidinĂ­Å‚nĂ¡hgo Ă¡lĂ­leekʼehgo kÊ¼Ă© bee ahiÅ‚ niidlį̀.", "metadata": { "languages": [ "som", @@ -6831,7 +6831,7 @@ { "type": "NarrativeText", "element_id": "a0cad811bb49185b6fdb66fb2060c59a", - "text": "Nenets \u0415\u0442 \u0445\u0438\u0431\u044f\u0440\u0438 \u043d\u0435\u043d\u044d\u0446\u044c \u0441\u043e\u044f\u043c\u0430\u0440\u0438\u0430\u043d\u0442\u0430 \u0445\u0443\u0440\u043a\u0430\u0440\u0438 \u043f\u0440\u0430\u0432\u0430\u0434\u0430 \u0442\u043d\u044f\u0432\u0430, \u04c8\u043e\u0431\u043e\u0439 \u043d\u0435\u043d\u044d\u0446\u044f \u043d\u0438\u0434\u0443 \u043d\u0438\u0441\u044c \u0442\u043e\u043a\u0430\u043b\u0431\u0430, \u04c8\u044b\u0431\u0442\u0430\u043c\u0431\u0430 \u0438\u043b\u0435\u0432\u0430\u0442\u0443 \u0442\u0430\u0440\u0430.", + "text": "Nenets Đ•Ñ‚ Ñ…Đ¸Đ±ÑÑ€Đ¸ Đ½ĐµĐ½Ñць ÑĐ¾ÑĐ¼Đ°Ñ€Đ¸Đ°Đ½Ñ‚Đ° Ñ…ÑƒÑ€ĐºĐ°Ñ€Đ¸ Đ¿Ñ€Đ°Đ²Đ°Đ´Đ° Ñ‚Đ½ÑĐ²Đ°, ÓˆĐ¾Đ±Đ¾Đ¹ Đ½ĐµĐ½ÑÑ†Ñ Đ½Đ¸Đ´Ñƒ Đ½Đ¸ÑÑŒ Ñ‚Đ¾ĐºĐ°Đ»Đ±Đ°, ÓˆÑ‹Đ±Ñ‚Đ°Đ¼Đ±Đ° Đ¸Đ»ĐµĐ²Đ°Ñ‚Ñƒ Ñ‚Đ°Ñ€Đ°.", "metadata": { "languages": [ "rus", @@ -6853,7 +6853,7 @@ { "type": "UncategorizedText", "element_id": "80851f8727cbd5baeb6611ada10ff1f9", - "text": 
"Nepali \u0938\u092c\u0948 \u0935\u094d\u092f\u0915\u094d\u0924\u093f \u0939\u0930\u0942 \u091c\u0928\u094d\u092e\u091c\u093e\u0924 \u0938\u094d\u0935\u0924\u0928\u094d\u0924\u094d\u0930 \u0939\u0941\u0928 \u0924\u0940 \u0938\u092c\u0948\u0915\u094b \u0938\u092e\u093e\u0928 \u0905\u0927\u093f\u0915\u093e\u0930 \u0930 \u092e\u0939\u0924\u094d\u0935 \u091b\u0964 \u0928\u093f\u091c\u0939\u0930\u0942\u092e\u093e \u0935\u093f\u091a\u093e\u0930 \u0936\u0915\u094d\u0924\u093f \u0930 \u0938\u0926\u094d\u0927\u093f\u091a\u093e\u0930 \u092d\u090f\u0915\u094b\u0932\u0947 \u0928\u093f\u091c\u0939\u0930\u0942\u0932\u0947 \u0906\u092a\u0938\u092e\u093e \u092d\u093e\u0924\u0943\u0924\u094d\u0935\u0915\u094b \u092d\u093e\u0935\u0928\u093e \u092c\u093e\u091f \u0935\u094d\u092f\u0935\u0939\u093e\u0930 \u0917\u0930\u094d\u0928\u0941 \u092a\u0930\u094d\u091b\u0964", + "text": "Nepali सबै वà¥à¤¯à¤•à¥à¤¤à¤¿ हरू जनà¥à¤®à¤œà¤¾à¤¤ सà¥à¤µà¤¤à¤¨à¥à¤¤à¥à¤° हà¥à¤¨ ती सबैको समान अधिकार र महतà¥à¤µ छ। निजहरूमा विà¤à¤¾à¤° शकà¥à¤¤à¤¿ र सदà¥à¤§à¤¿à¤à¤¾à¤° भà¤à¤•ोले निजहरूले आपसमा भातृतà¥à¤µà¤•ो भावना बाट वà¥à¤¯à¤µà¤¹à¤¾à¤° गरà¥à¤¨à¥ परà¥à¤›à¥¤", "metadata": { "languages": [ "nep" @@ -6874,7 +6874,7 @@ { "type": "NarrativeText", "element_id": "23ce504c8239c6964f02399ff1fcb1bf", - "text": "Nganasan \u0411\u04d9\u043d\u0434\u0435\u201d \u04c8\u0430\u043d\u0430\u0441\u0430\u043d\u04d9\u201d \u04c8\u04d9\u0442\u0443\u043a\u04d9\u043d\u0434\u044b\u201d \u043d\u0435\u043d\u0434\u044f\u201d\u0442\u0443\u043e\u201d \u04c8\u043e\u043d\u04d9 \u0445\u043e\u043d\u0441\u044b \u0445\u0435\u043b\u0438\u0434\u0435\u201d \u04c8\u0438\u043b\u0435 \u043c\u04d9\u043d\u04d9\u0439 (\u043f\u0440\u0430\u0432\u0430\u0439). 
\u0421\u044b\u0442\u044b\u04c8 \u0445\u043e\u043d\u0434\u044b\u201d \u04c8\u0438\u043b\u0435 \u04c8\u043e\u043d\u0434\u0430 \u04c8\u043e\u043d\u04d9 \u0441\u044f\u0440\u0443, \u0434\u04af\u0437\u044b\u0442\u04d9\u043d\u0434\u044b\u04c8 \u0438\u0445\u04af\u0442\u04af\u04c8 \u043d\u044f\u0433\u04d9\u04d9\u201d \u0441\u04af\u04e9\u0430\u0440\u0443\u0441\u04d9\u201d.", + "text": "Nganasan Đ‘Ó™Đ½Đ´Đµâ€ ÓˆĐ°Đ½Đ°ÑĐ°Đ½Ó™â€ ÓˆÓ™Ñ‚ÑƒĐºÓ™Đ½Đ´Ñ‹â€ Đ½ĐµĐ½Đ´Ñâ€Ñ‚ÑƒĐ¾â€ ÓˆĐ¾Đ½Ó™ Ñ…Đ¾Đ½ÑÑ‹ Ñ…ĐµĐ»Đ¸Đ´Đµâ€ ÓˆĐ¸Đ»Đµ Đ¼Ó™Đ½Ó™Đ¹ (Đ¿Ñ€Đ°Đ²Đ°Đ¹). Đ¡Ñ‹Ñ‚Ñ‹Óˆ Ñ…Đ¾Đ½Đ´Ñ‹â€ ÓˆĐ¸Đ»Đµ ÓˆĐ¾Đ½Đ´Đ° ÓˆĐ¾Đ½Ó™ ÑÑру, Đ´̉¯Đ·Ñ‹Ñ‚Ó™Đ½Đ´Ñ‹Óˆ Đ¸Ñ…̉¯Ñ‚̉¯Óˆ Đ½ÑĐ³Ó™Ó™â€ Ñ̉¯Ó©Đ°Ñ€ÑƒÑÓ™â€.", "metadata": { "languages": [ "rus" @@ -6917,7 +6917,7 @@ { "type": "NarrativeText", "element_id": "9164d07351a9366edfae5357e2ab807c", - "text": "Nomatsiguenga Antagaisati matsiguenga ibogaigu\u00eb matsiguengasonorl. Aisati icantaigaca. Teni iromerataiguengani. Antagaisati iquengaigui aisati ig\u00f3iguiro ora caninaro aisati ig\u00f3iguiro ora te onganinate. Iroro caninataque omagaro matsiguenga iraniacaninataigueri ira basiniati matsiguenga aisati ingantaiguer\u00ed ora caninaro.", + "text": "Nomatsiguenga Antagaisati matsiguenga ibogaiguĂ« matsiguengasonorl. Aisati icantaigaca. Teni iromerataiguengani. Antagaisati iquengaigui aisati igĂ³iguiro ora caninaro aisati igĂ³iguiro ora te onganinate. Iroro caninataque omagaro matsiguenga iraniacaninataigueri ira basiniati matsiguenga aisati ingantaiguerĂ­ ora caninaro.", "metadata": { "languages": [ "tgl", @@ -6939,7 +6939,7 @@ { "type": "NarrativeText", "element_id": "a2d52f93737464a25abcd5d12c771b98", - "text": "Norwegian, Bokm\u00e5l Alle mennesker er f\u00f8dt frie og med samme menneskeverd og menneskerettigheter. De er utstyrt med fornuft og samvittighet og b\u00f8r handle mot hverandre i brorskapets \u00e5nd.", + "text": "Norwegian, BokmĂ¥l Alle mennesker er født frie og med samme menneskeverd og menneskerettigheter. 
De er utstyrt med fornuft og samvittighet og bør handle mot hverandre i brorskapets Ă¥nd.", "metadata": { "languages": [ "nor" @@ -6960,7 +6960,7 @@ { "type": "NarrativeText", "element_id": "0de9dab37169c4ded9b7f75bedf80c7f", - "text": "Norwegian, Nynorsk Alle menneske er f\u00f8dde til fridom og med same menneskeverd og menneskerettar. Dei har f\u00e5tt fornuft og samvit og skal leve med kvarandre som br\u00f8r.", + "text": "Norwegian, Nynorsk Alle menneske er fødde til fridom og med same menneskeverd og menneskerettar. Dei har fĂ¥tt fornuft og samvit og skal leve med kvarandre som brør.", "metadata": { "languages": [ "nor" @@ -7002,7 +7002,7 @@ { "type": "Title", "element_id": "dcfcf466590e9daa75e86df759c90a23", - "text": "\ua2bf\ua0b7\ua0c5\ua13f\ua428\ua425\uff0c\ua305\ua14d\ua002\ua3fd\ua42f\ua488\ua0c5\ua425\ua310\u3002\ua2bf\ua287\ua26a\ua346\ua30b\ua180\ua068\ua24c\ua44c\ua425\uff0c\ua137\ua00b\ua068\ua09b\ua2a8\ua16b\ua0c0\ua0c5\ua425\ua121\ua45f\u3002", + "text": "ê¿ê‚·êƒ…ê„¿ê¨ê¥ï¼ŒêŒ…ê…ꀂê½ê¯ê’ˆêƒ…ê¥êŒă€‚ê¿ê‡ê‰ªê†êŒ‹ê†€ê¨ê‰Œê‘Œê¥ï¼Œê„·ê€‹ê¨ê‚›ê¨ê…«êƒ€êƒ…ê¥ê„¡ê‘Ÿă€‚", "metadata": { "languages": [ "zho" @@ -7023,7 +7023,7 @@ { "type": "NarrativeText", "element_id": "68861af146d56db218a932271da013ea", - "text": "Nyamwezi Banhu bose bubyalagwa biyagalulile, n\u2019ikujo haki zilenganelile.", + "text": "Nyamwezi Banhu bose bubyalagwa biyagalulile, n’ikujo haki zilenganelile.", "metadata": { "languages": [ "swa" @@ -7129,7 +7129,7 @@ { "type": "NarrativeText", "element_id": "8bb5a449ca76c9652411df83a16d36a5", - "text": "Nzema Menli muala di b\u025b ti anwo na eza noko b\u025bs\u025b w\u0254 dibil\u025b nee adenlenyianl\u025b nu. B\u025bl\u025b ndwenlenwo nee adwenle, yem\u0254ti \u0254w\u0254 k\u025b b\u025bkile adiemay\u025bl\u025b b\u025bmaa b\u025b nwo ngoko.", + "text": "Nzema Menli muala di bÉ› ti anwo na eza noko bÉ›sÉ› wÉ” dibilÉ› nee adenlenyianlÉ› nu. 
BÉ›lÉ› ndwenlenwo nee adwenle, yemÉ”ti É”wÉ” kÉ› bÉ›kile adiemayÉ›lÉ› bÉ›maa bÉ› nwo ngoko.", "metadata": { "languages": [ "tur", @@ -7153,7 +7153,7 @@ { "type": "NarrativeText", "element_id": "945f5e12a8c939707776f2152604ea76", - "text": "Occitan T\u00f3uti lis uman naisson libre. Soun egau p\u00e8rla digneta e li dre. An t\u00f3uti uno resoun e uno counsci\u00e8nci. Se d\u00e8von tenifreirenau lis un 'm\u00e9 lis autre.", + "text": "Occitan TĂ³uti lis uman naisson libre. Soun egau pèrla digneta e li dre. An tĂ³uti uno resoun e uno counsciènci. Se dèvon tenifreirenau lis un 'mĂ© lis autre.", "metadata": { "languages": [ "fra", @@ -7175,7 +7175,7 @@ { "type": "NarrativeText", "element_id": "de85ed5a407a19c2c1c89211693d8861", - "text": "Occitan (Auvergnat) Ta la proussouna neisson lieura mo\u00e9 parira p\u00e0 d\u00efness\u00e0 mai dret. Son charjada de razou mo\u00e9 de cousiens\u00e0 mai lhu fau arj\u00ee entreme\u00ee lha bei n'eime de freiress\u00e0.", + "text": "Occitan (Auvergnat) Ta la proussouna neisson lieura moĂ© parira pĂ  dĂ¯nessĂ  mai dret. Son charjada de razou moĂ© de cousiensĂ  mai lhu fau arjĂ® entremeĂ® lha bei n'eime de freiressĂ .", "metadata": { "languages": [ "fra" @@ -7196,7 +7196,7 @@ { "type": "NarrativeText", "element_id": "6260219bc4a42037e7d6f0418b7284c5", - "text": "Occitan (Francoproven\u00e7al, Fribourg) Tot\u00e8 l\u00e8 dzin vinyon ou mondo libro \u00e8 par\u00ea in dinyit\u00e2 \u00e8 in dr\u00ea. Chon dot\u00e2 d\u00e8 r\u00e9jon \u00e8 d\u00e8 konhyinthe \u00e8 d\u00eavon ch\u00e8 konport\u00e2 l\u00e8 j\u2019on-l\u00e8 j\u2019\u00f4tro din on \u00e8chpri d\u00e8 frat\u00e8rnit\u00e2.", + "text": "Occitan (Francoprovençal, Fribourg) Totè lè dzin vinyon ou mondo libro è parĂª in dinyitĂ¢ è in drĂª. 
Chon dotĂ¢ dè rĂ©jon è dè konhyinthe è dĂªvon chè konportĂ¢ lè j’on-lè j’ôtro din on èchpri dè fratèrnitĂ¢.", "metadata": { "languages": [ "ita" @@ -7217,7 +7217,7 @@ { "type": "NarrativeText", "element_id": "b47382b7a0e0afd209aa7e1993565391", - "text": "Occitan (Francoproven\u00e7al, Savoie) Tu luz \u00f2m\u00f2 vinyon u mondo, libr\u00f2, tu t\u00f2ton p\u00e8 le\u00fb dinyit\u00f2 \u00e8 le\u00fb dr\u00e8ye. Y\u2019on tu d\u2019\u00e9m\u00f2 \u00e8 d\u00e8 konhyinhi \u00e8 i d\u00e8von f\u00e8- mouh\u00f2 d\u00e8 frat\u00e8rnit\u00f2 aou\u00e8y luz \u00f2tri.", + "text": "Occitan (Francoprovençal, Savoie) Tu luz Ă²mĂ² vinyon u mondo, librĂ², tu tĂ²ton pè leĂ» dinyitĂ² è leĂ» drèye. Y’on tu d’émĂ² è dè konhyinhi è i dèvon fè- mouhĂ² dè fratèrnitĂ² aouèy luz Ă²tri.", "metadata": { "languages": [ "ita", @@ -7239,7 +7239,7 @@ { "type": "NarrativeText", "element_id": "da6df9434bcea33fdb84c07309f23605", - "text": "Occitan (Francoproven\u00e7al, Valais) Tui l\u00e8 j\u00eatre humain n\u00e9chon libro \u00e8 pary in degnet\u00e2 \u00e9 in drou\u00ea. Chon reijon\u00e2bl\u00f3 \u00e8 d\u00e8 counchieince \u00e8 deivouon \u00e2zic l\u00e8 j\u2019oun vi j\u2019avi di j\u2019\u00e2tr\u00f3 in p\u00e8r oun espri d\u00e8 frat\u00e8rnit\u00e2", + "text": "Occitan (Francoprovençal, Valais) Tui lè jĂªtre humain nĂ©chon libro è pary in degnetĂ¢ Ă© in drouĂª. Chon reijonĂ¢blĂ³ è dè counchieince è deivouon Ă¢zic lè j’oun vi j’avi di jâ€™Ă¢trĂ³ in pèr oun espri dè fratèrnitĂ¢", "metadata": { "languages": [ "fra", @@ -7261,7 +7261,7 @@ { "type": "NarrativeText", "element_id": "4be88083cf737cac6ec1b39afb2513c5", - "text": "Occitan (Francoproven\u00e7al, Vaud) T\u00ee l\u00e8 z\u2019\u00eetre humain v\u00eegnant \u00e2o mondo libro et par\u00e2i dein la dignit\u00e2 et l\u00e8 dr\u00e2i. 
L\u2019ant re\u00e7u r\u00e9son et concheince et d\u00e2ivant vivre l\u00e8 z\u2019on avou\u00e9 l\u00e8 z\u2019autro quemet se sant fr\u00e2re et ch\u00e8ra.", + "text": "Occitan (Francoprovençal, Vaud) TĂ® lè z’ître humain vĂ®gnant Ă¢o mondo libro et parĂ¢i dein la dignitĂ¢ et lè drĂ¢i. L’ant reçu rĂ©son et concheince et dĂ¢ivant vivre lè z’on avouĂ© lè z’autro quemet se sant frĂ¢re et chèra.", "metadata": { "languages": [ "fra" @@ -7282,7 +7282,7 @@ { "type": "NarrativeText", "element_id": "ca97829bba2e332be352861c0d0e0c70", - "text": "Occitan (Languedocien) Totes los \u00e8ssers umans naisson liures e egals en dignitat e en dreches. Son dotats de rason e de consci\u00e9ncia e se devon comportar los unes amb los autres dins un esperit de fraternitat.", + "text": "Occitan (Languedocien) Totes los èssers umans naisson liures e egals en dignitat e en dreches. Son dotats de rason e de consciĂ©ncia e se devon comportar los unes amb los autres dins un esperit de fraternitat.", "metadata": { "languages": [ "cat", @@ -7305,7 +7305,7 @@ { "type": "NarrativeText", "element_id": "2c541386adb644071a67fa19c80d221f", - "text": "Ojibwa, Northwestern \u146d\u1472\u14c7\u140c\u14c0\u14d0 \u1472\u1431\u14aa\u144e\u14ef\u1417\u1466 \u14c2\u1455\u140e\u146d\u1417\u1483 \u144e\u142f\u14c2\u14a5\u144e\u14f1\u140e\u14c2\u1483 \u14a5\u14c7 \u1455\u1431\u1455 \u146d\u148b\u1403\u14c0\u1455\u146f\u14ef\u140e\u14d0 \u1472\u1526 \u144c\u1438\u146b\u1455\u146f\u14ef\u140e\u14d0. \u1405\u1455\u1526\u14c7\u1417 \u14a5\u1472\u140e\u140e\u14d0 \u1472\u1526 \u14c2\u1444\u1472\u140e\u14d0 \u14a5\u14c7\u1417 \u1455\u1525 \u148b\u1403\u1511\u1472\u14c7\u1417\u1438\u144e\u1417\u1438\u14d0 \u140a\u1490\u146f \u14a5\u14c4\u140e\u148b\u140e\u144e\u140e\u14c2\u1483.", + "text": "Ojibwa, Northwestern ᑭᑲᓇáŒá“€á“ ᑲá±á’ªá‘ᓯá—ᑦ á“‚á‘•áá‘­á—á’ƒ á‘á¯á“‚á’¥á‘ᓱáá“‚á’ƒ ᒥᓇ á‘•á±á‘• á‘­á’‹áƒá“€á‘•ᑯᓯáᓠᑲᔦ ᑌá¸á‘«á‘•ᑯᓯáá“. 
á…ᑕᔦᓇᗠᒥᑲááᓠᑲᔦ ᓂᑄᑲáᓠᒥᓇᗠᑕᔥ á’‹áƒá”‘ᑲᓇá—á¸á‘á—á¸á“ áá’ᑯ ᒥᓄáá’‹áá‘áá“‚á’ƒ.", "metadata": { "filetype": "text/plain", "data_source": { @@ -7347,7 +7347,7 @@ { "type": "NarrativeText", "element_id": "838854e8c37bc2424bd4b8b4324da0a4", - "text": "Orok \u0427\u0438\u043f\u0430\u0304\u043b\u0438 \u0433\u0443\u0440\u0443\u043d\u043d\u0435\u0304 \u0431\u0430\u043b\u04e1\u0438\u0447\u0438 \u0433\u044d\u0432\u0443\u043c\u044d, \u043e\u043c\u043e\u0442\u0442\u043e \u043c\u044d\u0304\u043d\u044d \u043c\u04e9\u0440\u04e9\u043d\u04e1\u0438, \u043c\u044d\u0304\u043d\u044d \u0434\u043e\u0440\u043e\u043d\u04e1\u0438. \u041d\u043e\u0304\u0447\u0438 \u0438\u0434\u044d\u043b\u0443, \u0438\u0440\u043a\u0430\u043b\u0443, \u043c\u044d\u0304\u043d\u044d \u043c\u044d\u0304\u043d\u04e1\u0438 \u043d\u0430\u0304\u0434\u0430\u043a\u0442\u0430\u04c8\u0430\u0447\u0438 \u0431\u0458\u04e3\u0447\u0438.", + "text": "Orok Đ§Đ¸Đ¿Đ°̀„ли Đ³ÑƒÑ€ÑƒĐ½Đ½Đµ̀„ Đ±Đ°Đ»Ó¡Đ¸Ñ‡Đ¸ Đ³ÑĐ²ÑƒĐ¼Ñ, Đ¾Đ¼Đ¾Ñ‚Ñ‚Đ¾ Đ¼Ñ̀„Đ½Ñ Đ¼Ó©Ñ€Ó©Đ½Ó¡Đ¸, Đ¼Ñ̀„Đ½Ñ Đ´Đ¾Ñ€Đ¾Đ½Ó¡Đ¸. ĐĐ¾̀„Ñ‡Đ¸ идÑĐ»Ñƒ, Đ¸Ñ€ĐºĐ°Đ»Ñƒ, Đ¼Ñ̀„Đ½Ñ Đ¼Ñ̀„Đ½Ó¡Đ¸ Đ½Đ°̀„Đ´Đ°ĐºÑ‚Đ°ÓˆĐ°Ñ‡Đ¸ Đ±Ñ˜Ó£Ñ‡Đ¸.", "metadata": { "languages": [ "rus" @@ -7410,7 +7410,7 @@ { "type": "NarrativeText", "element_id": "61b9c386f4d7f982e217e8a0973deae9", - "text": "Osetin \u0410\u0434\u04d5\u0439\u043c\u04d5\u0433\u0442\u04d5 \u0441\u0435 '\u043f\u043f\u04d5\u0442 \u0434\u04d5\u0440 \u0440\u0430\u0439\u0433\u0443\u044b\u0440\u044b\u043d\u0446 \u0441\u04d5\u0440\u0438\u0431\u0430\u0440\u04d5\u0439 \u04d5\u043c\u04d5 \u04d5\u043c\u0445\u0443\u044b\u0437\u043e\u043d\u04d5\u0439 \u0441\u04d5 \u0431\u0430\u0440\u0442\u044b. 
\u0423\u044b\u0434\u043e\u043d \u04d5\u0445\u0445\u04d5\u0441\u0442 \u0441\u0442\u044b \u0437\u043e\u043d\u0434 \u04d5\u043c\u04d5 \u043d\u0430\u043c\u044b\u0441\u04d5\u0439, \u04d5\u043c\u04d5 \u043a\u04d5\u0440\u04d5\u0434\u0437\u0438\u0439\u04d5\u043d \u0445\u044a\u0443\u0430\u043c\u04d5 \u0443\u043e\u0439 \u04d5\u0444\u0441\u044b\u043c\u04d5\u0440\u0442\u044b \u0445\u0443\u044b\u0437\u04d5\u043d.", + "text": "Osetin ĐĐ´Ó•Đ¹Đ¼Ó•Đ³Ñ‚Ó• Ñе 'Đ¿Đ¿Ó•Ñ‚ Đ´Ó•Ñ€ Ñ€Đ°Đ¹Đ³ÑƒÑ‹Ñ€Ñ‹Đ½Ñ† ÑÓ•Ñ€Đ¸Đ±Đ°Ñ€Ó•Đ¹ Ó•Đ¼Ó• Ó•Đ¼Ñ…ÑƒÑ‹Đ·Đ¾Đ½Ó•Đ¹ ÑÓ• Đ±Đ°Ñ€Ñ‚Ñ‹. Đ£Ñ‹Đ´Đ¾Đ½ Ó•Ñ…Ñ…Ó•ÑÑ‚ Ñты Đ·Đ¾Đ½Đ´ Ó•Đ¼Ó• Đ½Đ°Đ¼Ñ‹ÑÓ•Đ¹, Ó•Đ¼Ó• ĐºÓ•Ñ€Ó•Đ´Đ·Đ¸Đ¹Ó•Đ½ Ñ…ÑÑƒĐ°Đ¼Ó• ÑƒĐ¾Đ¹ Ó•Ñ„ÑÑ‹Đ¼Ó•Ñ€Ñ‚Ñ‹ Ñ…ÑƒÑ‹Đ·Ó•Đ½.", "metadata": { "languages": [ "rus" @@ -7431,7 +7431,7 @@ { "type": "NarrativeText", "element_id": "f829c47775b5845587447d35b6b41e40", - "text": "Otomi, Mezquital Gotho nu kja'ni i mu\u0331i ra zoo i gotho ro kuchti, i tu'ni nu ro \u00f1a pad\u00e4 bini i da budi, da mu\u0331i ra zoo koyu gotho yu kja'ni i yo kuadi.", + "text": "Otomi, Mezquital Gotho nu kja'ni i mù±i ra zoo i gotho ro kuchti, i tu'ni nu ro ña padä bini i da budi, da mù±i ra zoo koyu gotho yu kja'ni i yo kuadi.", "metadata": { "languages": [ "hrv", @@ -7478,7 +7478,7 @@ { "type": "NarrativeText", "element_id": "dd2ab495e062b9a11fe24355a3c1319e", - "text": "P\u00e1ez Ya'nwe'wewa'te' maa nasapa ha'dacehk hi'pku up'hi', w\u00ebtte u'huwa'hi'pta', eena' eena' f'i'zewa' hi'pta', \u00fcus hi'pta' d'ik'the hi'pta' naapa'kate. Sa' h'ukaysa \u00fcus hi'pcehktha'w sa' pyakhna'we f'i'ze hi'ptha'w.", + "text": "PĂ¡ez Ya'nwe'wewa'te' maa nasapa ha'dacehk hi'pku up'hi', wĂ«tte u'huwa'hi'pta', eena' eena' f'i'zewa' hi'pta', Ă¼us hi'pta' d'ik'the hi'pta' naapa'kate. 
Sa' h'ukaysa Ă¼us hi'pcehktha'w sa' pyakhna'we f'i'ze hi'ptha'w.", "metadata": { "languages": [ "swa" @@ -7544,7 +7544,7 @@ { "type": "UncategorizedText", "element_id": "068d755c0e132506c2d31786a7ed4b32", - "text": "Panjabi, Eastern \u0a38\u0a3e\u0a30\u0a3e \u0a2e\u0a28\u0a41\u0a71\u0a16\u0a40 \u0a2a\u0a30\u0a3f\u0a35\u0a3e\u0a30 \u0a06\u0a2a\u0a23\u0a40 \u0a2e\u0a39\u0a3f\u0a2e\u0a3e, \u0a36\u0a3e\u0a28 \u0a05\u0a24\u0a47 \u0a39\u0a71\u0a15\u0a3e\u0a02 \u0a26\u0a47 \u0a2a\u0a71\u0a16\u0a4b\u0a02 \u0a1c\u0a28\u0a2e \u0a24\u0a4b\u0a02 \u0a39\u0a40 \u0a06\u0a5b\u0a3e\u0a26 \u0a39\u0a48 \u0a05\u0a24\u0a47 \u0a38\u0a41\u0a24\u0a47 \u0a38\u0a3f\u0a71\u0a27 \u0a38\u0a3e\u0a30\u0a47 \u0a32\u0a4b\u0a15 \u0a2c\u0a30\u0a3e\u0a2c\u0a30 \u0a39\u0a28 \u0964 \u0a09\u0a28\u0a4d\u0a39\u0a3e\u0a02 \u0a38\u0a2d\u0a28\u0a3e \u0a28\u0a42\u0a70 \u0a24\u0a30\u0a15 \u0a05\u0a24\u0a47 \u0a5b\u0a2e\u0a40\u0a30 \u0a26\u0a40 \u0a38\u0a4c\u0a17\u0a3e\u0a24 \u0a2e\u0a3f\u0a32\u0a40 \u0a39\u0a4b\u0a08 \u0a39\u0a48 \u0a05\u0a24\u0a47 \u0a09\u0a28\u0a4d\u0a39\u0a3e\u0a02 \u0a28\u0a42\u0a70 \u0a2d\u0a30\u0a3e\u0a24\u0a30\u0a40\u0a2d\u0a3e\u0a35 \u0a26\u0a40 \u0a2d\u0a3e\u0a35\u0a28\u0a3e \u0a30\u0a16\u0a26\u0a3f\u0a06\u0a02 \u0a06\u0a2a\u0a38 \u0a35\u0a3f\u0a1a \u0a35\u0a3f\u0a1a\u0a30\u0a23\u0a3e \u0a1a\u0a3e\u0a39\u0a40\u0a26\u0a3e \u0a39\u0a48 \u0964", + "text": "Panjabi, Eastern ਸਾਰਾ ਮਨà©à©±à¨–à©€ ਪਰਿਵਾਰ ਆਪਣੀ ਮਹਿਮਾ, ਸ਼ਾਨ ਅਤੇ ਹੱਕਾਂ ਦੇ ਪੱਖੋਂ ਜਨਮ ਤੋਂ ਹੀ ਆਜ਼ਾਦ ਹੈ ਅਤੇ ਸà©à¨¤à©‡ ਸਿੱਧ ਸਾਰੇ ਲੋਕ ਬਰਾਬਰ ਹਨ । ਉਨà©à¨¹à¨¾à¨‚ ਸਭਨਾ ਨੂੰ ਤਰਕ ਅਤੇ ਜ਼ਮੀਰ ਦੀ ਸੌਗਾਤ ਮਿਲੀ ਹੋਈ ਹੈ ਅਤੇ ਉਨà©à¨¹à¨¾à¨‚ ਨੂੰ ਭਰਾਤਰੀਭਾਵ ਦੀ ਭਾਵਨਾ ਰਖਦਿਆਂ ਆਪਸ ਵਿਠਵਿà¨à¨°à¨£à¨¾ à¨à¨¾à¨¹à©€à¨¦à¨¾ ਹੈ ।", "metadata": { "languages": [ "pan" @@ -7565,7 +7565,7 @@ { "type": "UncategorizedText", "element_id": "e81229801afdd767a6ca59c9877783bc", - "text": "Panjabi, Western \u0633\u0627\u0631\u06d2 \u0627\u0646\u0633\u0627\u0646 \u0622\u0632\u0627\u062f \u062a\u06d2 \u062d\u0642\u0648\u0642 \u062a\u06d2 \u0639\u0632\u062a \u062f\u06d2 
\u0644\u062d\u0627\u0638 \u0646\u0627\u0644 \u0628\u0631\u0627\u0628\u0631 \u067e\u06cc\u062f\u0627 \u06c1\u0648\u0646\u062f\u06d2 \u0646\u06cc\u06ba \u06d4 \u06d4 \u0627\u0648\u06c1 \u0639\u0642\u0644 \u0633\u0645\u062c\u06be \u062a\u06d2 \u0686\u0646\u06af\u06d2 \u0645\u0646\u062f\u06d2 \u062f\u06cc \u067e\u0686\u06be\u0627\u0646 \u062a\u06d2 \u0627\u062d\u0633\u0627\u0633 \u0631\u06a9\u06be\u062f\u06d2 \u0646\u06d2 \u0627\u06cc\u0633 \u0648\u0627\u0633\u0637\u06d2 \u0627\u0648\u06c1\u0646\u0627\u06ba \u0646\u0648\u06ba \u0627\u06a9 \u062f\u0648\u062c\u06d2 \u0646\u0627\u0644 \u0628\u06be\u0627\u0626\u06cc \u0686\u0627\u0631\u06d2 \u0648\u0627\u0644\u0627 \u0633\u0644\u0648\u06a9 \u06a9\u0631\u0646\u0627 \u0686\u0627\u06c1\u06cc \u062f\u0627 \u0627\u06d2 \u06d4 \u06d4", + "text": "Panjabi, Western سارے انسان آزاد تے حقوق تے عزت دے لحاظ نال برابر پیدا Ûوندے نیں Û” Û” Ø§ÙˆÛ Ø¹Ù‚Ù„ سمجھ تے Ú†Ù†Ú¯Û’ مندے دی پچھان تے احساس رکھدے Ù†Û’ ایس واسطے اوÛناں نوں اک دوجے نال بھائی چارے والا سلوک کرنا چاÛÛŒ دا اے Û” Û”", "metadata": { "languages": [ "urd" @@ -7607,7 +7607,7 @@ { "type": "UncategorizedText", "element_id": "a2c1dda9330915ecdfba4af7c21da5c0", - "text": "Pashto, Northern \u062f \u0628\u0634\u0631 \u067c\u0648\u0644 \u0627\u0641\u0631\u0627\u062f \u0627\u0632\u0627\u062f \u0646\u0693\u06cd \u062a\u0647 \u0631\u0627\u0681\u064a \u0627\u0648 \u062f \u062d\u064a\u062b\u064a\u062a \u0627\u0648 \u062f \u062d\u0642\u0648\u0642\u0648 \u0644\u0647 \u067e\u0644\u0648\u0647 \u0633\u0631\u0647 \u0628\u0631\u0627\u0628\u0631 \u062f\u064a\u06d4 \u067c\u0648\u0644 \u062f \u0639\u0642\u0644 \u0627\u0648 \u0648\u062c\u062f\u0627\u0646 \u062e\u0627\u0648\u0646\u062f\u0627\u0646 \u062f\u064a \u0627\u0648 \u0628\u0627\u064a\u062f \u064a\u0648 \u0644\u0647 \u0628\u0644 \u0633\u0631\u0647 \u062f \u0648\u0631\u0648\u0631\u06cd \u067e\u0647 \u0631\u0648\u062d\u064a\u0647 \u0633\u0631\u0647 \u0686\u0644\u0646\u0646\u062f \u06a9\u0693\u064a\u06d4", + "text": "Pashto, Northern د بشر ټول 
Ø§ÙØ±Ø§Ø¯ ازاد Ù†Ú“Û ØªÙ‡ راÚ٠او د Ø­ÙØ«Ùت او د حقوقو له پلوه سره برابر دÙÛ” ټول د عقل او وجدان خاوندان د٠او Ø¨Ø§ÙØ¯ ÙÙˆ له بل سره د ÙˆØ±ÙˆØ±Û Ù¾Ù‡ روحÙÙ‡ سره چلنند Ú©Ú“ÙÛ”", "metadata": { "languages": [ "fas" @@ -7628,7 +7628,7 @@ { "type": "NarrativeText", "element_id": "7e9ad6a402b6252e85be01ffafa1eb5e", - "text": "Picard Tos l\u00e8s-omes vin\u00e8t \u00e5 monde l\u00eebes \u00e8t \u00e9g\u00e5ls po \u00e7ou qu'\u00e8st d' le\u00fb dignit\u00e9 \u00e8t d' le\u00fbs dre\u00fbts. Le\u00fb re\u030azon \u00e8t le\u00fb consyince elz\u00ee fe\u030at on d'vw\u00e9r di s'kid\u00fbre inte di z\u00e8le come d\u00e8s fr\u00e8s", + "text": "Picard Tos lès-omes vinèt Ă¥ monde lĂ®bes èt Ă©gĂ¥ls po çou qu'èst d' leĂ» dignitĂ© èt d' leĂ»s dreĂ»ts. LeĂ» rèzon èt leĂ» consyince elzĂ® fèt on d'vwĂ©r di s'kidĂ»re inte di zèle come dès frès", "metadata": { "languages": [ "fra" @@ -7757,7 +7757,7 @@ { "type": "NarrativeText", "element_id": "cad1fbc2c59a2ab610912476278d0204", - "text": "Polish Wszyscy ludzie rodz\u0105 si\u0119 wolni i r\u00f3wni pod wzgl\u0119dem swej godno\u015bci i swych praw. S\u0105 oni obdarzeni rozumem i sumieniem i powinni post\u0119powa\u0107 wobec innych w duchu braterstwa.", + "text": "Polish Wszyscy ludzie rodzÄ… siÄ™ wolni i rĂ³wni pod wzglÄ™dem swej godnoÅ›ci i swych praw. SÄ… oni obdarzeni rozumem i sumieniem i powinni postÄ™pować wobec innych w duchu braterstwa.", "metadata": { "languages": [ "pol" @@ -7778,7 +7778,7 @@ { "type": "NarrativeText", "element_id": "07022bc1c3bb5010208399375dc1b813", - "text": "Portuguese (Brazil) Todos os seres humanos nascem livres e iguais em dignidade e direitos. S\u00e3o dotados de raz\u00e3o e consci\u00eancia e devem agir em rela\u00e7\u00e3o uns aos outros com esp\u00edrito de fraternidade.", + "text": "Portuguese (Brazil) Todos os seres humanos nascem livres e iguais em dignidade e direitos. 
SĂ£o dotados de razĂ£o e consciĂªncia e devem agir em relaĂ§Ă£o uns aos outros com espĂ­rito de fraternidade.", "metadata": { "languages": [ "por" @@ -7799,7 +7799,7 @@ { "type": "NarrativeText", "element_id": "7925a3ec12f3766bebb236e3ec5bdc60", - "text": "Portuguese (Portugal) Todos os seres humanos nascem livres e iguais em dignidade e em direitos. Dotados de raz\u00e3o e de consci\u00eancia, devem agir uns para com os outros em esp\u00edrito de fraternidade.", + "text": "Portuguese (Portugal) Todos os seres humanos nascem livres e iguais em dignidade e em direitos. Dotados de razĂ£o e de consciĂªncia, devem agir uns para com os outros em espĂ­rito de fraternidade.", "metadata": { "languages": [ "por" @@ -7841,7 +7841,7 @@ { "type": "NarrativeText", "element_id": "dc4348bae7eccbd8e30af1763958fee9", - "text": "Pular (Adlam) \ud83a\udd0b\ud83a\udd32\ud83a\udd46\ud83a\udd22\ud83a\udd25\ud83a\udd22 \ud83a\udd22\ud83a\udd44\ud83a\udd23\ud83a\udd2b\ud83a\udd45\ud83a\udd36\ud83a\udd2d \ud83a\udd2c\ud83a\udd2e\ud83a\udd2c \ud83a\udd28\ud83a\udd2e\ud83a\udd3c\ud83a\udd2d\u060c \ud83a\udd32'\ud83a\udd23\ud83a\udd2d\ud83a\udd25\ud83a\udd2f\ud83a\udd2d\ud83a\udd23\ud83a\udd2d \ud83a\udd2b \ud83a\udd36\ud83a\udd2d\ud83a\udd26\ud83a\udd2d\ud83a\udd32\ud83a\udd22\ud83a\udd32\ud83a\udd46\ud83a\udd23\ud83a\udd2b \ud83a\udd3c\ud83a\udd2e \ud83a\udd26\ud83a\udd22\ud83a\udd32\ud83a\udd46\ud83a\udd3a\ud83a\udd2b \ud83a\udd38\ud83a\udd22\ud83a\udd33\ud83a\udd46\ud83a\udd2b\ud83a\udd45\ud83a\udd36\ud83a\udd2d. 
\ud83a\udd09\ud83a\udd29\ud83a\udd2b \ud83a\udd32'\ud83a\udd3a\ud83a\udd2e\ud83a\udd45\ud83a\udd23\ud83a\udd2d \ud83a\udd25\ud83a\udd2d\ud83a\udd45\ud83a\udd36\ud83a\udd2e \ud83a\udd2b \ud83a\udd38\ud83a\udd22\ud83a\udd33\ud83a\udd46\ud83a\udd2d\ud83a\udd24\ud83a\udd22\ud83a\udd32\ud83a\udd3c\ud83a\udd22\ud83a\udd44\ud83a\udd3a\ud83a\udd22\ud83a\udd24 \ud83a\udd2b\ud83a\udd3c\ud83a\udd2b \ud83a\udd2b\ud83a\udd29\ud83a\udd2b \ud83a\udd28\ud83a\udd2e\ud83a\udd3c\ud83a\udd2d \ud83a\udd38\ud83a\udd35\ud83a\udd45\ud83a\udd2c\ud83a\udd2e \ud83a\udd32'\ud83a\udd23\ud83a\udd2d\ud83a\udd2a\ud83a\udd23\ud83a\udd2b \ud83a\udd2b \ud83a\udd32'\ud83a\udd23\ud83a\udd2b\ud83a\udd2a \ud83a\udd29 \ud83a\udd2d\ud83a\udd34\ud83a\udd32\ud83a\udd3a\ud83a\udd35\ud83a\udd34\ud83a\udd35\ud83a\udd25\ud83a\udd46\ud83a\udd22\ud83a\udd44\ud83a\udd3a\ud83a\udd35.", + "text": "Pular (Adlam) đ¤‹đ¤²đ¥†đ¤¢đ¤¥đ¤¢ đ¤¢đ¥„đ¤£đ¤«đ¥…đ¤¶đ¤­ đ¤¬đ¤®đ¤¬ đ¤¨đ¤®đ¤¼đ¤­ØŒ đ¤²'đ¤£đ¤­đ¤¥đ¤¯đ¤­đ¤£đ¤­ 𤫠đ¤¶đ¤­đ¤¦đ¤­đ¤²đ¤¢đ¤²đ¥†đ¤£đ¤« đ¤¼đ¤® đ¤¦đ¤¢đ¤²đ¥†đ¤ºđ¤« đ¤¸đ¤¢đ¤³đ¥†đ¤«đ¥…đ¤¶đ¤­. 
đ¤‰đ¤©đ¤« đ¤²'đ¤ºđ¤®đ¥…đ¤£đ¤­ đ¤¥đ¤­đ¥…đ¤¶đ¤® 𤫠đ¤¸đ¤¢đ¤³đ¥†đ¤­đ¤¤đ¤¢đ¤²đ¤¼đ¤¢đ¥„đ¤ºđ¤¢đ¤¤ đ¤«đ¤¼đ¤« đ¤«đ¤©đ¤« đ¤¨đ¤®đ¤¼đ¤­ đ¤¸đ¤µđ¥…đ¤¬đ¤® đ¤²'đ¤£đ¤­đ¤ªđ¤£đ¤« 𤫠đ¤²'đ¤£đ¤«đ¤ª 𤩠đ¤­đ¤´đ¤²đ¤ºđ¤µđ¤´đ¤µđ¤¥đ¥†đ¤¢đ¥„đ¤ºđ¤µ.", "metadata": { "languages": [ "ara" @@ -7862,7 +7862,7 @@ { "type": "NarrativeText", "element_id": "9c7d0e713be2017eba040780765856df", - "text": "Purepecha Iamendu k'uiripuecha janguarhiparini ka majku jarhati ka jurhimbekuecha jingoni kueraa\u014basondikso ka, juajtakuarhis\u00efndiks\u00ef ambakiti eratsekua ka kaxumbikua, jatsistiks\u00ef eskaks\u00ef sesi arhijperaaka.", + "text": "Purepecha Iamendu k'uiripuecha janguarhiparini ka majku jarhati ka jurhimbekuecha jingoni kueraaÅ‹asondikso ka, juajtakuarhisĂ¯ndiksĂ¯ ambakiti eratsekua ka kaxumbikua, jatsistiksĂ¯ eskaksĂ¯ sesi arhijperaaka.", "metadata": { "languages": [ "est", @@ -7909,7 +7909,7 @@ { "type": "NarrativeText", "element_id": "e7cb3a61bb828a46ce008b4251df5ef3", - "text": "Quechua, Ambo-Pasco Lapan runa kay pachach'u yurin libri kawananpaq, lapanchinuy iwal respetasha kananpaqmi, mana pipis jarup\u00e4nanpaq, lapanpis iwal yarpach'akuy yach'aqmi, alita mana alita tantiyar kawananpaq. Chaynuy runa masinwan juknin jukninwan kuyanakur kap\u00e4kuchun", + "text": "Quechua, Ambo-Pasco Lapan runa kay pachach'u yurin libri kawananpaq, lapanchinuy iwal respetasha kananpaqmi, mana pipis jarupänanpaq, lapanpis iwal yarpach'akuy yach'aqmi, alita mana alita tantiyar kawananpaq. Chaynuy runa masinwan juknin jukninwan kuyanakur kapäkuchun", "metadata": { "languages": [ "tgl", @@ -7931,7 +7931,7 @@ { "type": "NarrativeText", "element_id": "7af8d8dd7e7418eed6057bb221448506", - "text": "Quechua, Arequipa-La Uni\u00f3n Kanmi derechonchiskuna llapanchispa, nacesqanchismanta. Kantaqmi llapanchispa runa kayninchis. Manan runa kanchu manay derechoyoq. Huk runaq derecho hukpawan kaqllan kan. Kanmi derechonchis llapanchispa allin kawsay libre tiyananchispaq. Llapan runaqpan kan yuyayninchis yachanapaq. 
Llapanchis kasun llapa runa masinchiskunawan munanakunapaq, huk ayllu hina.", + "text": "Quechua, Arequipa-La UniĂ³n Kanmi derechonchiskuna llapanchispa, nacesqanchismanta. Kantaqmi llapanchispa runa kayninchis. Manan runa kanchu manay derechoyoq. Huk runaq derecho hukpawan kaqllan kan. Kanmi derechonchis llapanchispa allin kawsay libre tiyananchispaq. Llapan runaqpan kan yuyayninchis yachanapaq. Llapanchis kasun llapa runa masinchiskunawan munanakunapaq, huk ayllu hina.", "metadata": { "languages": [ "tgl", @@ -8020,7 +8020,7 @@ { "type": "NarrativeText", "element_id": "7838a28da590ff7bb2ea5c7a48ba93fc", - "text": "Quechua, Huamal\u00edes-Dos de Mayo Hu\u00e1nuco Lapan runakunapis yurikuyan librimi y wakinkaqkunanaw rispitashqa, mana jarukushqa kay\u00e4nanpaq. Saynawmi runakunaqa yuriyan shumaq yarpayyuq, alitapis mana alitapis reqiykar y seqay kuyap\u00e4kuyyuq. Saymi runakuna ali kawakuy\u00e4nan jukninwan jukninwanpis.", + "text": "Quechua, HuamalĂ­es-Dos de Mayo HuĂ¡nuco Lapan runakunapis yurikuyan librimi y wakinkaqkunanaw rispitashqa, mana jarukushqa kayänanpaq. Saynawmi runakunaqa yuriyan shumaq yarpayyuq, alitapis mana alitapis reqiykar y seqay kuyapäkuyyuq. Saymi runakuna ali kawakuyänan jukninwan jukninwanpis.", "metadata": { "languages": [ "swa", @@ -8043,7 +8043,7 @@ { "type": "NarrativeText", "element_id": "08720fc9c770f44e38435bc27b49867d", - "text": "Quechua, Huaylas Ancash Meyqan nunapis manam pipa sirweqnin nuna kananpaqtsu yurikushqa. I nuna karninmi meyqan nunapis juk l\u00e1yatsu kayanman der\u00ebchunkunachowpis. I yarpachakiyta yacharninmi i allita mana allita shonqonkunachow m\u00e1kurninmi nunakuna jukninta wiyanakur kayanman.", + "text": "Quechua, Huaylas Ancash Meyqan nunapis manam pipa sirweqnin nuna kananpaqtsu yurikushqa. I nuna karninmi meyqan nunapis juk lĂ¡yatsu kayanman derĂ«chunkunachowpis. 
I yarpachakiyta yacharninmi i allita mana allita shonqonkunachow mĂ¡kurninmi nunakuna jukninta wiyanakur kayanman.", "metadata": { "languages": [ "ind", @@ -8065,7 +8065,7 @@ { "type": "NarrativeText", "element_id": "34a8df5528e399552e033b89176957b0", - "text": "Quechua, Margos-Yarowilca-Lauricocha Lapantsikunapis Iibrimi yurishqantsi. B\u00e4lintsimi y der\u00ebchuntsikunapis wakinkaqkunanoqlapami. Yarpaynintsikunapis kaykanmi runa mayintsikunawan juk wawqinoq kuyanakur kawap\u00e4kunantsipaq.", + "text": "Quechua, Margos-Yarowilca-Lauricocha Lapantsikunapis Iibrimi yurishqantsi. Bälintsimi y derĂ«chuntsikunapis wakinkaqkunanoqlapami. Yarpaynintsikunapis kaykanmi runa mayintsikunawan juk wawqinoq kuyanakur kawapäkunantsipaq.", "metadata": { "languages": [ "ind", @@ -8112,7 +8112,7 @@ { "type": "NarrativeText", "element_id": "ecc5d074ce9be67e187d19b4aabf87c5", - "text": "Quechua, North Jun\u00edn Lapan runas kay pachachru nasimun juk rantisha runanuy mana pitas sirbinanpaqmi, alipa rikasha kananpaqmi, washasha kananpaqmi. Lapan runakunas nasipaakamun yarpayniyoqmi naatan tantiyayniyoqmi ima lutanta rurapaakurursi tantiyakunanpaq. Lapan runakunas kawapaakunaman juk wawqenuylam.", + "text": "Quechua, North JunĂ­n Lapan runas kay pachachru nasimun juk rantisha runanuy mana pitas sirbinanpaqmi, alipa rikasha kananpaqmi, washasha kananpaqmi. Lapan runakunas nasipaakamun yarpayniyoqmi naatan tantiyayniyoqmi ima lutanta rurapaakurursi tantiyakunanpaq. 
Lapan runakunas kawapaakunaman juk wawqenuylam.", "metadata": { "languages": [ "tgl", @@ -8159,7 +8159,7 @@ { "type": "UncategorizedText", "element_id": "654791ed821f84e420d3742634a53e7c", - "text": "Quechua (Unified Quichua, old Hispanic orthography) Tucuy runacuna quishpirihu\u00e1n hui\u00f1\u00e1n, pactacunahuampes, pay pura, umahu\u00e1n, ayahu\u00e1n chay shucuna shina, chaymantami shuclla shina causangacuna.", + "text": "Quechua (Unified Quichua, old Hispanic orthography) Tucuy runacuna quishpirihuĂ¡n huiĂ±Ă¡n, pactacunahuampes, pay pura, umahuĂ¡n, ayahuĂ¡n chay shucuna shina, chaymantami shuclla shina causangacuna.", "metadata": { "languages": [ "spa", @@ -8204,7 +8204,7 @@ { "type": "NarrativeText", "element_id": "0f3dc8a63ddcf8d858d8e543a4eb8428", - "text": "Rarotongan Kua anau rangatira ia te tangata katoatoa ma te aiteite i te au tikaanga e te tu ngateitei tiratiratu.\u00a0 Kua ki ia ratou e te mero kimi ravenga e te akavangakau e kia akono tetai i tetai, i roto i te vaerua piri anga taeake.", + "text": "Rarotongan Kua anau rangatira ia te tangata katoatoa ma te aiteite i te au tikaanga e te tu ngateitei tiratiratu.  Kua ki ia ratou e te mero kimi ravenga e te akavangakau e kia akono tetai i tetai, i roto i te vaerua piri anga taeake.", "metadata": { "languages": [ "ind", @@ -8226,7 +8226,7 @@ { "type": "NarrativeText", "element_id": "f0f216272ee0f7e11e21eb4ca1752777", - "text": "Romagnolo Tot j ess\u00e8ri um\u00e8n i n\u00e0s l\u00e9bri e cumpagn in dignit\u00e0 e dir\u00e9t. Lou i \u00e8 dutid ad rasoun e ad cuscinza e i \u00e0 da oper\u00e8, ognun ti cunfrunt at ch'j ilt, sa sentimint ad fratel\u00e8nza.", + "text": "Romagnolo Tot j essèri umèn i nĂ s lĂ©bri e cumpagn in dignitĂ  e dirĂ©t. 
Lou i è dutid ad rasoun e ad cuscinza e i Ă  da operè, ognun ti cunfrunt at ch'j ilt, sa sentimint ad fratelènza.", "metadata": { "languages": [ "ita", @@ -8248,7 +8248,7 @@ { "type": "NarrativeText", "element_id": "a84b6ff398b4f815054e7b47107ce163", - "text": "Romani, Balkan Savorre manu\u015ba biand\u00f5n meste thaj barabar k-o demnipen aj k-e hakaja. Si len godi aj somzanipen thaj si len te tr\u0105den pen jekh karing o aver and-o vogi e phralimnasqoro.", + "text": "Romani, Balkan Savorre manuÅ›a biandõn meste thaj barabar k-o demnipen aj k-e hakaja. Si len godi aj somzanipen thaj si len te trÄ…den pen jekh karing o aver and-o vogi e phralimnasqoro.", "metadata": { "languages": [ "slv", @@ -8270,7 +8270,7 @@ { "type": "NarrativeText", "element_id": "dd72113ef6db4b69482adf28078a6090", - "text": "Romani, Balkan (1) Sa e manu\u0161ikane strukture bijand\u017eona tromane thaj jekhutne ko digniteti thaj \u010dapipa. Von si baxtarde em barvale gndaja thaj god\u017eaja thaj trubun jekh avereja te kherjakeren ko vod\u017ei pralipaja.", + "text": "Romani, Balkan (1) Sa e manuÅ¡ikane strukture bijandžona tromane thaj jekhutne ko digniteti thaj Äapipa. Von si baxtarde em barvale gndaja thaj godžaja thaj trubun jekh avereja te kherjakeren ko vodži pralipaja.", "metadata": { "languages": [ "slv" @@ -8291,7 +8291,7 @@ { "type": "NarrativeText", "element_id": "d1d78e5ce9c3fe2071093b3f74f8f9b8", - "text": "Romanian (1953) Toate fiin\u021bele umane se nasc libere \u0219i egale \u00een demnitate \u0219i \u00een drepturi. Ele s\u00eent \u00eenzestrate cu ra\u021biune \u0219i con\u0219tiin\u021b\u0103 \u0219i trebuie s\u0103 se comporte unele fa\u021b\u0103 de altele \u00een spiritul fraternit\u0103\u021bii.", + "text": "Romanian (1953) Toate fiinÈ›ele umane se nasc libere È™i egale Ă®n demnitate È™i Ă®n drepturi. 
Ele sĂ®nt Ă®nzestrate cu raÈ›iune È™i conÈ™tiință È™i trebuie să se comporte unele față de altele Ă®n spiritul fraternității.", "metadata": { "languages": [ "ron" @@ -8312,7 +8312,7 @@ { "type": "NarrativeText", "element_id": "ffd7f486f85cc12fffdee64c8dc1c47c", - "text": "Romanian (1993) Toate fiin\u021bele umane se nasc libere \u0219i egale \u00een demnitate \u0219i \u00een drepturi. Ele sunt \u00eenzestrate cu ra\u021biune \u0219i con\u0219tiin\u021b\u0103 \u0219i trebuie s\u0103 se comporte unele fa\u021b\u0103 de altele \u00een spiritul fraternit\u0103\u021bii.", + "text": "Romanian (1993) Toate fiinÈ›ele umane se nasc libere È™i egale Ă®n demnitate È™i Ă®n drepturi. Ele sunt Ă®nzestrate cu raÈ›iune È™i conÈ™tiință È™i trebuie să se comporte unele față de altele Ă®n spiritul fraternității.", "metadata": { "languages": [ "ron" @@ -8333,7 +8333,7 @@ { "type": "NarrativeText", "element_id": "81db31b50da57a040bad82d9af2297df", - "text": "Romanian (2006) Toate fiin\u021bele umane se nasc libere \u0219i egale \u00een demnitate \u0219i \u00een drepturi. Ele sunt \u00eenzestrate cu ra\u021biune \u0219i con\u0219tiin\u021b\u0103 \u0219i trebuie s\u0103 se comporte unele fa\u021b\u0103 de altele \u00een spiritul fraternit\u0103\u021bii.", + "text": "Romanian (2006) Toate fiinÈ›ele umane se nasc libere È™i egale Ă®n demnitate È™i Ă®n drepturi. Ele sunt Ă®nzestrate cu raÈ›iune È™i conÈ™tiință È™i trebuie să se comporte unele față de altele Ă®n spiritul fraternității.", "metadata": { "languages": [ "ron" @@ -8354,7 +8354,7 @@ { "type": "NarrativeText", "element_id": "cadc80db78bd586f5f18217272cfdb17", - "text": "Romansch Tuots umans naschan libers ed eguals in dignit\u00e0 e drets. Els sun dotats cun intellet e conscienza e dessan agir tanter per in uin spiert da fraternit\u00e0.", + "text": "Romansch Tuots umans naschan libers ed eguals in dignitĂ  e drets. 
Els sun dotats cun intellet e conscienza e dessan agir tanter per in uin spiert da fraternitĂ .", "metadata": { "languages": [ "cat", @@ -8376,7 +8376,7 @@ { "type": "NarrativeText", "element_id": "4295c14118d555a1bd3be37701a4578e", - "text": "Romansch (Grischun) Tut ils umans naschan libers ed eguals en dignitad ed en dretgs. Els \u00e8n dotads cun raschun e conscienza e duain agir in vers l\u2019auter en spiert da fraternitad.", + "text": "Romansch (Grischun) Tut ils umans naschan libers ed eguals en dignitad ed en dretgs. Els èn dotads cun raschun e conscienza e duain agir in vers l’auter en spiert da fraternitad.", "metadata": { "languages": [ "deu", @@ -8398,7 +8398,7 @@ { "type": "NarrativeText", "element_id": "d7c3646cc8bf5af91fa007bcdc86ad53", - "text": "Romansch (Puter) Tuot ils umauns naschan libers ed eguels in dignited ed in drets. Els sun dotos cun radschun e conscienza e dessan agir \u00fcn invers l\u2019oter in spiert da fraternited.", + "text": "Romansch (Puter) Tuot ils umauns naschan libers ed eguels in dignited ed in drets. Els sun dotos cun radschun e conscienza e dessan agir Ă¼n invers l’oter in spiert da fraternited.", "metadata": { "languages": [ "deu", @@ -8421,7 +8421,7 @@ { "type": "NarrativeText", "element_id": "a0daace15fe9f49d73fcdd9e3b86f001", - "text": "Romansch (Surmiran) Tot igls carstgangs neschan libers ed eguals an dignitad ed an dretgs. Els \u00e8n dotos cun raschung e schientscha e duessan ager l\u2019egn vers l\u2019oter an spiert da fraternitad.", + "text": "Romansch (Surmiran) Tot igls carstgangs neschan libers ed eguals an dignitad ed an dretgs. Els èn dotos cun raschung e schientscha e duessan ager l’egn vers l’oter an spiert da fraternitad.", "metadata": { "languages": [ "cat", @@ -8444,7 +8444,7 @@ { "type": "NarrativeText", "element_id": "57126ecde8022743581d3932507d8b63", - "text": "Romansch (Sursilvan) Tut ils humans neschan libers ed eguals en dignitad ed en dretgs. 
Els ein dotai cun raschun e cunscienzia e duein agir in viers l\u2019auter en sp\u00e9rt da fraternitad.", + "text": "Romansch (Sursilvan) Tut ils humans neschan libers ed eguals en dignitad ed en dretgs. Els ein dotai cun raschun e cunscienzia e duein agir in viers l’auter en spĂ©rt da fraternitad.", "metadata": { "languages": [ "deu", @@ -8469,7 +8469,7 @@ { "type": "NarrativeText", "element_id": "82fb166f28096b77e6b865ce44135e16", - "text": "Romansch (Sutsilvan) Tut igls humans neschan libers ad eguals an dignitad ad an dretgs. Els en dotos cun rasch\u00f9n a cunzienzia a den agir egn anviers l\u2019oter an spiert da fraternitad.", + "text": "Romansch (Sutsilvan) Tut igls humans neschan libers ad eguals an dignitad ad an dretgs. Els en dotos cun raschĂ¹n a cunzienzia a den agir egn anviers l’oter an spiert da fraternitad.", "metadata": { "languages": [ "cat", @@ -8491,7 +8491,7 @@ { "type": "NarrativeText", "element_id": "53246b60d8dbe52f7f323cfe27507738", - "text": "Romansch (Vallader) Tuot ils umans naschan libers ed eguals in dignit\u00e0 ed in drets. Els sun dotats cun radschun e conscienza e dessan agir \u00fcn invers l\u2019oter in \u00fcn spiert da fraternit\u00e0.", + "text": "Romansch (Vallader) Tuot ils umans naschan libers ed eguals in dignitĂ  ed in drets. Els sun dotats cun radschun e conscienza e dessan agir Ă¼n invers l’oter in Ă¼n spiert da fraternitĂ .", "metadata": { "languages": [ "cat", @@ -8534,7 +8534,7 @@ { "type": "NarrativeText", "element_id": "7b1fe5da3cfa2322dd960a870a966d3a", - "text": "Russian \u0412\u0441\u0435 \u043b\u044e\u0434\u0438 \u0440\u043e\u0436\u0434\u0430\u044e\u0442\u0441\u044f \u0441\u0432\u043e\u0431\u043e\u0434\u043d\u044b\u043c\u0438 \u0438 \u0440\u0430\u0432\u043d\u044b\u043c\u0438 \u0432 \u0441\u0432\u043e\u0435\u043c \u0434\u043e\u0441\u0442\u043e\u0438\u043d\u0441\u0442\u0432\u0435 \u0438 \u043f\u0440\u0430\u0432\u0430\u0445. 
\u041e\u043d\u0438 \u043d\u0430\u0434\u0435\u043b\u0435\u043d\u044b \u0440\u0430\u0437\u0443\u043c\u043e\u043c \u0438 \u0441\u043e\u0432\u0435\u0441\u0442\u044c\u044e \u0438 \u0434\u043e\u043b\u0436\u043d\u044b \u043f\u043e\u0441\u0442\u0443\u043f\u0430\u0442\u044c \u0432 \u043e\u0442\u043d\u043e\u0448\u0435\u043d\u0438\u0438 \u0434\u0440\u0443\u0433 \u0434\u0440\u0443\u0433\u0430 \u0432 \u0434\u0443\u0445\u0435 \u0431\u0440\u0430\u0442\u0441\u0442\u0432\u0430.", + "text": "Russian Đ’Ñе Đ»Ñди Ñ€Đ¾Đ¶Đ´Đ°ÑÑ‚ÑÑ ÑĐ²Đ¾Đ±Đ¾Đ´Đ½Ñ‹Đ¼Đ¸ и Ñ€Đ°Đ²Đ½Ñ‹Đ¼Đ¸ Đ² ÑĐ²Đ¾ĐµĐ¼ Đ´Đ¾ÑÑ‚Đ¾Đ¸Đ½ÑÑ‚Đ²Đµ и Đ¿Ñ€Đ°Đ²Đ°Ñ…. ĐĐ½Đ¸ Đ½Đ°Đ´ĐµĐ»ĐµĐ½Ñ‹ Ñ€Đ°Đ·ÑƒĐ¼Đ¾Đ¼ и ÑĐ¾Đ²ĐµÑÑ‚ÑŒÑ Đ¸ Đ´Đ¾Đ»Đ¶Đ½Ñ‹ Đ¿Đ¾ÑÑ‚ÑƒĐ¿Đ°Ñ‚ÑŒ Đ² Đ¾Ñ‚Đ½Đ¾ÑˆĐµĐ½Đ¸Đ¸ Đ´Ñ€ÑƒĐ³ Đ´Ñ€ÑƒĐ³Đ° Đ² Đ´ÑƒÑ…Đµ Đ±Ñ€Đ°Ñ‚ÑÑ‚Đ²Đ°.", "metadata": { "languages": [ "rus" @@ -8576,7 +8576,7 @@ { "type": "NarrativeText", "element_id": "48332b010fe58bc794e833308da30575", - "text": "Saami, North Buot olbmot leat rieg\u00e1dan friddjan ja olmmo\u0161\u00e1rvvu ja olmmo\u0161vuoigatvuo\u0111aid d\u00e1fus. Sii leat jierbmala\u0161 olbmot geain lea oamedovdu ja sii g\u00e1lgga\u0161e leat dego vielja\u010dagat.", + "text": "Saami, North Buot olbmot leat riegĂ¡dan friddjan ja olmmoÅ¡Ă¡rvvu ja olmmoÅ¡vuoigatvuoÄ‘aid dĂ¡fus. 
Sii leat jierbmalaÅ¡ olbmot geain lea oamedovdu ja sii gĂ¡lggaÅ¡e leat dego vieljaÄagat.", "metadata": { "languages": [ "est", @@ -8598,7 +8598,7 @@ { "type": "UncategorizedText", "element_id": "373656c2cab80370dd2768316c8a725e", - "text": "Salar Heme kishler h\u00fcr der, haysiyet ma haklarde adil der, mantik ma vicdan var, kardeshlikden davraneshge.", + "text": "Salar Heme kishler hĂ¼r der, haysiyet ma haklarde adil der, mantik ma vicdan var, kardeshlikden davraneshge.", "metadata": { "languages": [ "tur" @@ -8642,7 +8642,7 @@ { "type": "NarrativeText", "element_id": "ddfa143fc42a89f1e4f7b99ce0028962", - "text": "Sango Ad\u00fc \u00e2zo k\u00fb\u00ea yamba, ng\u00e2 \u00e2la l\u00eengbi ter\u00ea na l\u00eag\u00eb t\u00ee n\u00ebng\u00f6-ter\u00ea na t\u00ee \u00e2ngang\u00fc. Ala k\u00fb\u00ea awara ndar\u00e4 na b\u00f6r\u00f6-li s\u00ef \u00e2la l\u00eengbi t\u00ee dut\u00ef na \u00e2mb\u00e2 t\u00ee \u00e2la g\u00ef na l\u00eang\u00f6 s\u00f6ng\u00f6.", + "text": "Sango AdĂ¼ Ă¢zo kĂ»Ăª yamba, ngĂ¢ Ă¢la lĂ®ngbi terĂª na lĂªgĂ« tĂ® nĂ«ngö-terĂª na tĂ® Ă¢ngangĂ¼. 
Ala kĂ»Ăª awara ndarä na börö-li sĂ¯ Ă¢la lĂ®ngbi tĂ® dutĂ¯ na Ă¢mbĂ¢ tĂ® Ă¢la gĂ¯ na lĂªngö söngö.", "metadata": { "languages": [ "tgl", @@ -8664,7 +8664,7 @@ { "type": "UncategorizedText", "element_id": "ba8456690a521bd0fb0bb757c188f302", - "text": "Sanskrit \u0938\u0930\u094d\u0935\u0947 \u092e\u093e\u0928\u0935\u093e\u0903 \u0938\u094d\u0935\u0924\u0928\u094d\u0924\u094d\u0930\u093e\u0903 \u0938\u092e\u0941\u0924\u094d\u092a\u0928\u094d\u0928\u093e\u0903 \u0935\u0930\u094d\u0924\u0928\u094d\u0924\u0947 \u0905\u092a\u093f \u091a, \u0917\u094c\u0930\u0935\u0926\u0943\u0936\u093e \u0905\u0927\u093f\u0915\u093e\u0930\u0926\u0943\u0936\u093e \u091a \u0938\u092e\u093e\u0928\u093e\u0903 \u090f\u0935 \u0935\u0930\u094d\u0924\u0928\u094d\u0924\u0947\u0964 \u090f\u0924\u0947 \u0938\u0930\u094d\u0935\u0947 \u091a\u0947\u0924\u0928\u093e-\u0924\u0930\u094d\u0915-\u0936\u0915\u094d\u0924\u093f\u092d\u094d\u092f\u093e\u0902 \u0938\u0941\u0938\u092e\u094d\u092a\u0928\u094d\u0928\u093e\u0903 \u0938\u0928\u094d\u0924\u093f\u0964 \u0905\u092a\u093f \u091a, \u0938\u0930\u094d\u0935\u0947\u093d\u092a\u093f \u092c\u0928\u094d\u0927\u0941\u0924\u094d\u0935-\u092d\u093e\u0935\u0928\u092f\u093e \u092a\u0930\u0938\u094d\u092a\u0930\u0902 \u0935\u094d\u092f\u0935\u0939\u0930\u0928\u094d\u0924\u0941\u0964", + "text": "Sanskrit सरà¥à¤µà¥‡ मानवाः सà¥à¤µà¤¤à¤¨à¥à¤¤à¥à¤°à¤¾à¤ƒ समà¥à¤¤à¥à¤ªà¤¨à¥à¤¨à¤¾à¤ƒ वरà¥à¤¤à¤¨à¥à¤¤à¥‡ अपि à¤, गौरवदृशा अधिकारदृशा ठसमानाः à¤à¤µ वरà¥à¤¤à¤¨à¥à¤¤à¥‡à¥¤ à¤à¤¤à¥‡ सरà¥à¤µà¥‡ à¤à¥‡à¤¤à¤¨à¤¾-तरà¥à¤•-शकà¥à¤¤à¤¿à¤­à¥à¤¯à¤¾à¤‚ सà¥à¤¸à¤®à¥à¤ªà¤¨à¥à¤¨à¤¾à¤ƒ सनà¥à¤¤à¤¿à¥¤ अपि à¤, सरà¥à¤µà¥‡à¤½à¤ªà¤¿ बनà¥à¤§à¥à¤¤à¥à¤µ-भावनया परसà¥à¤ªà¤°à¤‚ वà¥à¤¯à¤µà¤¹à¤°à¤¨à¥à¤¤à¥à¥¤", "metadata": { "languages": [ "hin" @@ -8685,7 +8685,7 @@ { "type": "NarrativeText", "element_id": "7013f596e8a99afdd7965ac753815ad9", - "text": "Sanskrit (Grantha) \ud804\udf38\ud804\udf30\ud804\udf4d\ud804\udf35\ud804\udf47 \ud804\udf2e\ud804\udf3e\ud804\udf28\ud804\udf35\ud804\udf3e\ud804\udf03 
\ud804\udf38\ud804\udf4d\ud804\udf35\ud804\udf24\ud804\udf28\ud804\udf4d\ud804\udf24\ud804\udf4d\ud804\udf30\ud804\udf3e\ud804\udf03 \ud804\udf38\ud804\udf2e\ud804\udf41\ud804\udf24\ud804\udf4d\ud804\udf2a\ud804\udf28\ud804\udf4d\ud804\udf28\ud804\udf3e\ud804\udf03 \ud804\udf35\ud804\udf30\ud804\udf4d\ud804\udf24\ud804\udf28\ud804\udf4d\ud804\udf24\ud804\udf47 \ud804\udf05\ud804\udf2a\ud804\udf3f \ud804\udf1a, \ud804\udf17\ud804\udf4c\ud804\udf30\ud804\udf35\ud804\udf26\ud804\udf43\ud804\udf36\ud804\udf3e \ud804\udf05\ud804\udf27\ud804\udf3f\ud804\udf15\ud804\udf3e\ud804\udf30\ud804\udf26\ud804\udf43\ud804\udf36\ud804\udf3e \ud804\udf1a \ud804\udf38\ud804\udf2e\ud804\udf3e\ud804\udf28\ud804\udf3e\ud804\udf03 \ud804\udf0f\ud804\udf35 \ud804\udf35\ud804\udf30\ud804\udf4d\ud804\udf24\ud804\udf28\ud804\udf4d\ud804\udf24\ud804\udf47\u0964 \ud804\udf0f\ud804\udf24\ud804\udf47 \ud804\udf38\ud804\udf30\ud804\udf4d\ud804\udf35\ud804\udf47 \ud804\udf1a\ud804\udf47\ud804\udf24\ud804\udf28\ud804\udf3e-\ud804\udf24\ud804\udf30\ud804\udf4d\ud804\udf15-\ud804\udf36\ud804\udf15\ud804\udf4d\ud804\udf24\ud804\udf3f\ud804\udf2d\ud804\udf4d\ud804\udf2f\ud804\udf3e\ud804\udf02 \ud804\udf38\ud804\udf41\ud804\udf38\ud804\udf2e\ud804\udf4d\ud804\udf2a\ud804\udf28\ud804\udf4d\ud804\udf28\ud804\udf3e\ud804\udf03 \ud804\udf38\ud804\udf28\ud804\udf4d\ud804\udf24\ud804\udf3f\u0964 \ud804\udf05\ud804\udf2a\ud804\udf3f \ud804\udf1a, \ud804\udf38\ud804\udf30\ud804\udf4d\ud804\udf35\ud804\udf47\ud804\udf3d\ud804\udf2a\ud804\udf3f \ud804\udf2c\ud804\udf28\ud804\udf4d\ud804\udf27\ud804\udf41\ud804\udf24\ud804\udf4d\ud804\udf35-\ud804\udf2d\ud804\udf3e\ud804\udf35\ud804\udf28\ud804\udf2f\ud804\udf3e \ud804\udf2a\ud804\udf30\ud804\udf38\ud804\udf4d\ud804\udf2a\ud804\udf30\ud804\udf02 \ud804\udf35\ud804\udf4d\ud804\udf2f\ud804\udf35\ud804\udf39\ud804\udf30\ud804\udf28\ud804\udf4d\ud804\udf24\ud804\udf41\u0964", + "text": "Sanskrit (Grantha) đ‘Œ¸đ‘Œ°đ‘đ‘Œµđ‘‡ đ‘Œ®đ‘Œ¾đ‘Œ¨đ‘Œµđ‘Œ¾đ‘Œƒ 
đ‘Œ¸đ‘đ‘Œµđ‘Œ¤đ‘Œ¨đ‘đ‘Œ¤đ‘đ‘Œ°đ‘Œ¾đ‘Œƒ đ‘Œ¸đ‘Œ®đ‘đ‘Œ¤đ‘đ‘Œªđ‘Œ¨đ‘đ‘Œ¨đ‘Œ¾đ‘Œƒ đ‘Œµđ‘Œ°đ‘đ‘Œ¤đ‘Œ¨đ‘đ‘Œ¤đ‘‡ đ‘Œ…đ‘Œªđ‘Œ¿ đ‘Œ, đ‘Œ—đ‘Œđ‘Œ°đ‘Œµđ‘Œ¦đ‘ƒđ‘Œ¶đ‘Œ¾ đ‘Œ…đ‘Œ§đ‘Œ¿đ‘Œ•đ‘Œ¾đ‘Œ°đ‘Œ¦đ‘ƒđ‘Œ¶đ‘Œ¾ đ‘Œ đ‘Œ¸đ‘Œ®đ‘Œ¾đ‘Œ¨đ‘Œ¾đ‘Œƒ đ‘Œđ‘Œµ đ‘Œµđ‘Œ°đ‘đ‘Œ¤đ‘Œ¨đ‘đ‘Œ¤đ‘‡à¥¤ đ‘Œđ‘Œ¤đ‘‡ đ‘Œ¸đ‘Œ°đ‘đ‘Œµđ‘‡ đ‘Œđ‘‡đ‘Œ¤đ‘Œ¨đ‘Œ¾-đ‘Œ¤đ‘Œ°đ‘đ‘Œ•-đ‘Œ¶đ‘Œ•đ‘đ‘Œ¤đ‘Œ¿đ‘Œ­đ‘đ‘Œ¯đ‘Œ¾đ‘Œ‚ đ‘Œ¸đ‘đ‘Œ¸đ‘Œ®đ‘đ‘Œªđ‘Œ¨đ‘đ‘Œ¨đ‘Œ¾đ‘Œƒ đ‘Œ¸đ‘Œ¨đ‘đ‘Œ¤đ‘Œ¿à¥¤ đ‘Œ…đ‘Œªđ‘Œ¿ đ‘Œ, đ‘Œ¸đ‘Œ°đ‘đ‘Œµđ‘‡đ‘Œ½đ‘Œªđ‘Œ¿ đ‘Œ¬đ‘Œ¨đ‘đ‘Œ§đ‘đ‘Œ¤đ‘đ‘Œµ-đ‘Œ­đ‘Œ¾đ‘Œµđ‘Œ¨đ‘Œ¯đ‘Œ¾ đ‘Œªđ‘Œ°đ‘Œ¸đ‘đ‘Œªđ‘Œ°đ‘Œ‚ đ‘Œµđ‘đ‘Œ¯đ‘Œµđ‘Œ¹đ‘Œ°đ‘Œ¨đ‘đ‘Œ¤đ‘।", "metadata": { "languages": [ "nep" @@ -8706,7 +8706,7 @@ { "type": "NarrativeText", "element_id": "d9dd825f97644f9be308505d418e9ea9", - "text": "S\u00e3otomense Tudu ngu\u00ea di mundu ca nanc\u00ea livli e igual ni dignidade e ni dir\u00eatu. Punda nen ca pens\u00e1 e nen t\u00ea cunxensa, sel\u00e1 nen f\u00e9 tudu cu\u00e1 cu ten\u00e7\u00f3n de lum\u00f3n.", + "text": "SĂ£otomense Tudu nguĂª di mundu ca nancĂª livli e igual ni dignidade e ni dirĂªtu. Punda nen ca pensĂ¡ e nen tĂª cunxensa, selĂ¡ nen fĂ© tudu cuĂ¡ cu tenĂ§Ă³n de lumĂ³n.", "metadata": { "languages": [ "por", @@ -8728,7 +8728,7 @@ { "type": "NarrativeText", "element_id": "ea94e46fedb24cbbc337bb5d30608ead", - "text": "Sardinian, Logudorese Totu sos \u00e8sseres umanos naschint l\u00ecberos e eguales in dinnidade e in deretos. Issos tenent sa resone e sa cuss\u00e8ntzia e depent operare s'unu cun s'\u00e0teru cun isp\u00ecritu de fraternidade.", + "text": "Sardinian, Logudorese Totu sos èsseres umanos naschint lìberos e eguales in dinnidade e in deretos. Issos tenent sa resone e sa cussèntzia e depent operare s'unu cun s'Ă teru cun ispìritu de fraternidade.", "metadata": { "languages": [ "cat", @@ -8750,7 +8750,7 @@ { "type": "NarrativeText", "element_id": "135f949e79e915feb11563f40072624d", - "text": "Saxon, Low All de Minschen s\u00fcnd frie un gliek an W\u00fc\u00fcrd un Rechten baren. 
Se hebbt Vernunft un een Geweten un se sch\u00fcllt sik Br\u00f6der sien.", + "text": "Saxon, Low All de Minschen sĂ¼nd frie un gliek an WĂ¼Ă¼rd un Rechten baren. Se hebbt Vernunft un een Geweten un se schĂ¼llt sik Bröder sien.", "metadata": { "languages": [ "deu" @@ -8792,7 +8792,7 @@ { "type": "NarrativeText", "element_id": "49685f2659217462214b13c3594d1423", - "text": "Secoya Si'apai aide'oy\u00eb kua'ye peoye kui'ne siay\u00eb'k\u00eb maka pa'iye kui'ne tutupaye koni, jaje kuasase's\u00ebtepi kuaju'i'ne peoye \u00f1ese saiye pa'iji ko\u0331kaij\u00eb yek\u00eb paireje.", + "text": "Secoya Si'apai aide'oyĂ« kua'ye peoye kui'ne siayĂ«'kĂ« maka pa'iye kui'ne tutupaye koni, jaje kuasase'sĂ«tepi kuaju'i'ne peoye ñese saiye pa'iji kò±kaijĂ« yekĂ« paireje.", "metadata": { "languages": [ "sqi", @@ -8814,7 +8814,7 @@ { "type": "UncategorizedText", "element_id": "e0ca8f739a2a274e0e30bcd509b308e2", - "text": "Seraiki \u0633\u0627\u0631\u06d2 \u0627\u0646\u0633\u0627\u0646 \u0627\u0632\u0627\u062f\u0627 \u062a\u06d2 \u062d\u0642\u0648\u0642 \u062a\u06d2 \u0639\u0632\u062a \u062f\u06d2 \u0627\u0639\u062a\u0628\u0627\u0631 \u0646\u0627\u0644 \u06c1\u06a9\u0648 \u0684\u0626\u06d2 \u067e\u06cc\u062f\u0627 \u062a\u06be\u06cc\u0646\u062f\u0646 \u06d4 \u0642\u062f\u0631\u062a \u0648\u0644\u0648\u06ba \u0627\u0646\u06c1\u0627\u06ba \u06a9\u0648\u06ba \u0639\u0642\u0644 \u062a\u06d2 \u0633\u0645\u062c\u06be \u0639\u0637\u0627 \u062a\u06be\u06cc\u0646\u062f\u06cc \u0627\u06d2 \u06d4 \u06c1\u06cc\u06ba \u06a9\u06cc\u062a\u06d2 \u06c1\u06a9 \u068b\u0648\u062c\u06be\u06d2 \u0646\u0627\u0644 \u0628\u06be\u0631\u067e\u06cc \u062f\u0627\u0633\u0644\u0648\u06a9 \u06a9\u0631\u06bb\u0627 \u0686\u0627\u06c1\u06cc \u062f\u0627 \u0627\u06d2 \u06d4", + "text": "Seraiki سارے انسان ازادا تے حقوق تے عزت دے اعتبار نال ÛÚ©Ùˆ ڄئے پیدا تھیندن Û” قدرت ولوں Ø§Ù†ÛØ§Úº Ú©ÙˆÚº عقل تے سمجھ عطا تھیندی اے Û” Ûیں کیتے ÛÚ© ڋوجھے نال بھرپی داسلوک کرڻا چاÛÛŒ دا اے Û”", "metadata": { "languages": [ "urd" @@ -8835,7 
+8835,7 @@ { "type": "NarrativeText", "element_id": "f855b701f2717951ee7041f505936e9e", - "text": "Serbian (Cyrillic) \u0421\u0432\u0430 \u0459\u0443\u0434\u0441\u043a\u0430 \u0431\u0438\u045b\u0430 \u0440\u0430\u0452\u0430\u0458\u0443 \u0441\u0435 \u0441\u043b\u043e\u0431\u043e\u0434\u043d\u0430 \u0438 \u0458\u0435\u0434\u043d\u0430\u043a\u0430 \u0443 \u0434\u043e\u0441\u0442\u043e\u0458\u0430\u043d\u0441\u0442\u0432\u0443 \u0438 \u043f\u0440\u0430\u0432\u0438\u043c\u0430. \u041e\u043d\u0430 \u0441\u0443 \u043e\u0431\u0434\u0430\u0440\u0435\u043d\u0430 \u0440\u0430\u0437\u0443\u043c\u043e\u043c \u0438 \u0441\u0432\u0435\u0448\u045b\u0443 \u0438 \u0442\u0440\u0435\u0431\u0430 \u0458\u0435\u0434\u043d\u0438 \u043f\u0440\u0435\u043c\u0430 \u0434\u0440\u0443\u0433\u0438\u043c\u0430 \u0434\u0430 \u043f\u043e\u0441\u0442\u0443\u043f\u0430\u0458\u0443 \u0443 \u0434\u0443\u0445\u0443 \u0431\u0440\u0430\u0442\u0441\u0442\u0432\u0430.", + "text": "Serbian (Cyrillic) Đ¡Đ²Đ° Ñ™ÑƒĐ´ÑĐºĐ° Đ±Đ¸Ñ›Đ° Ñ€Đ°Ñ’Đ°Ñ˜Ñƒ Ñе ÑĐ»Đ¾Đ±Đ¾Đ´Đ½Đ° и Ñ˜ĐµĐ´Đ½Đ°ĐºĐ° у Đ´Đ¾ÑÑ‚Đ¾Ñ˜Đ°Đ½ÑÑ‚Đ²Ñƒ и Đ¿Ñ€Đ°Đ²Đ¸Đ¼Đ°. ĐĐ½Đ° Ñу Đ¾Đ±Đ´Đ°Ñ€ĐµĐ½Đ° Ñ€Đ°Đ·ÑƒĐ¼Đ¾Đ¼ и ÑĐ²ĐµÑˆÑ›Ñƒ и Ñ‚Ñ€ĐµĐ±Đ° Ñ˜ĐµĐ´Đ½Đ¸ Đ¿Ñ€ĐµĐ¼Đ° Đ´Ñ€ÑƒĐ³Đ¸Đ¼Đ° да Đ¿Đ¾ÑÑ‚ÑƒĐ¿Đ°Ñ˜Ñƒ у Đ´ÑƒÑ…Ñƒ Đ±Ñ€Đ°Ñ‚ÑÑ‚Đ²Đ°.", "metadata": { "languages": [ "mkd" @@ -8856,7 +8856,7 @@ { "type": "NarrativeText", "element_id": "1e1d32ffc1c937e2dc9b3b4e6b8a1453", - "text": "Serbian (Latin) Sva ljudska bi\u0107a ra\u0111aju se slobodna i jednaka u dostojanstvu i pravima. Ona su obdarena razumom i sve\u0161\u0107u i treba jedni prema drugima da postupaju u duhu bratstva.", + "text": "Serbian (Latin) Sva ljudska bića raÄ‘aju se slobodna i jednaka u dostojanstvu i pravima. 
Ona su obdarena razumom i svešću i treba jedni prema drugima da postupaju u duhu bratstva.", "metadata": { "languages": [ "hrv" @@ -8877,7 +8877,7 @@ { "type": "NarrativeText", "element_id": "6a973a162a71cdf61973afc03d10bb08", - "text": "Serer-Sine Wiin we naa \u00f1oowaa na adna, den fop mbodu no ke war na oxnu refna na den a jega o ngalaat umpi yiif um, le mbarin o me\u01adtootaa baa mbaag o \u00f1oow den fop no fog.", + "text": "Serer-Sine Wiin we naa ñoowaa na adna, den fop mbodu no ke war na oxnu refna na den a jega o ngalaat umpi yiif um, le mbarin o meÆ­tootaa baa mbaag o ñoow den fop no fog.", "metadata": { "languages": [ "som", @@ -8900,7 +8900,7 @@ { "type": "NarrativeText", "element_id": "201296ccbaf34300a62d4a087915bf84", - "text": "Seselwa Creole French Nou tou imen nou\u2019n ne dan laliberte ek legalite, dan nou dignite ek nou bann drwa. Nou tou nou annan kapasite pou rezonnen, e fodre nou azir anver lezot avek en lespri fraternel.", + "text": "Seselwa Creole French Nou tou imen nou’n ne dan laliberte ek legalite, dan nou dignite ek nou bann drwa. 
Nou tou nou annan kapasite pou rezonnen, e fodre nou azir anver lezot avek en lespri fraternel.", "metadata": { "languages": [ "fra" @@ -8921,7 +8921,7 @@ { "type": "UncategorizedText", "element_id": "f602d39c8cf6ba79e59adce09af30f26", - "text": "Shan \u1075\u1030\u107c\u103a\u1038\u1075\u1030\u108a\u1075\u1031\u1083\u1089\u107c\u1086\u1089 \u1015\u1035\u107c\u103a\u1022\u107c\u103a\u1075\u102d\u1030\u1010\u103a\u1087\u1019\u1083\u1038\u101c\u1030\u107a\u103a\u1088\u1075\u102f\u1004\u103a\u1087\u1019\u102f\u107c\u103a\u1022\u107c\u103a\u101c\u103d\u1010\u103a\u1088\u101c\u1085\u101d\u103a\u1038\u107d\u1035\u1004\u103a\u1087\u1015\u1035\u1004\u103a\u1038\u1075\u107c\u103a \u101c\u1084\u1088 \u101e\u102f\u107c\u103a\u1087\u101c\u1086\u1088\u1022\u107c\u103a \u101c\u103d\u1010\u103a\u1088\u101c\u1085\u101d\u103a\u1038\u107d\u1035\u1004\u103a\u1087 \u1015\u1035\u1004\u103a\u1038\u1075\u107c\u103a\u104b \u1076\u101d\u103a\u107c\u1086\u1089 \u1019\u102e\u1038\u107a\u1062\u107c\u103a\u1087\u1022\u107c\u103a\u1019\u1031\u1083\u1011\u1010\u103a\u1038\u101e\u1062\u1004\u103a \u101c\u1084\u1088 \u1078\u1082\u103a\u1022\u107c\u103a\u1081\u1030\u1089\u1078\u1075\u103a\u1038\u107e\u102d\u1004\u103a\u1088\u1010\u102d\u102f\u101d\u103a\u1038\u1075\u1019\u103a \u107c\u107c\u103a\u1089\u101c\u1084\u1088 \u1011\u102f\u1075\u103a\u1087\u101d\u1086\u1089\u1078\u1082\u103a\u1015\u102e\u1088\u1022\u103d\u1075\u103a\u1087 \u107c\u103d\u1004\u103a\u1089\u1076\u1086\u1087\u1075\u107c\u103a\u101e\u1031 \u1010\u102d\u1010\u103a\u1038\u1010\u1031\u1083\u1087\u1075\u107c\u103a\u104b", + "text": "Shan áµá€°á¼á€ºá€¸áµá€°á‚áµá€±á‚ƒá‚‰á¼á‚†á‚‰ ပဵá¼á€ºá€¢á¼á€ºáµá€­á€°á€á€ºá‚‡á€™á‚ƒá€¸á€œá€°áºá€ºá‚ˆáµá€¯á€„်ႇမုá¼á€ºá€¢á¼á€ºá€œá€½á€á€ºá‚ˆá€œá‚…á€á€ºá€¸á½á€µá€„်ႇပဵင်းáµá¼á€º လႄႈ á€á€¯á¼á€ºá‚‡á€œá‚†á‚ˆá€¢á¼á€º လွá€á€ºá‚ˆá€œá‚…á€á€ºá€¸á½á€µá€„်ႇ ပဵင်းáµá¼á€ºá‹ á¶á€á€ºá¼á‚†á‚‰ မီးáºá¢á¼á€ºá‚‡á€¢á¼á€ºá€™á€±á‚ƒá€‘á€á€ºá€¸á€á¢á€„် လႄႈ á¸á‚‚်ဢá¼á€ºá‚ူႉá¸áµá€ºá€¸á¾á€­á€„်ႈá€á€­á€¯á€á€ºá€¸áµá€™á€º 
á¼á¼á€ºá‚‰á€œá‚„ႈ ထုáµá€ºá‚‡á€á‚†á‚‰á¸á‚‚်ပီႈဢွáµá€ºá‚‡ á¼á€½á€„်ႉá¶á‚†á‚‡áµá¼á€ºá€á€± á€á€­á€á€ºá€¸á€á€±á‚ƒá‚‡áµá¼á€ºá‹", "metadata": { "filetype": "text/plain", "data_source": { @@ -8962,7 +8962,7 @@ { "type": "NarrativeText", "element_id": "20e37b3914fade183f3e76b200daccbd", - "text": "Shilluk Dhanh\u00f8 b\u00ebne ba anyw\u00f8l\u00f8 e path ki b\u00e4ng, ge p\u00e4r ki yij b\u00eb\u00ebd\u00f8 geki dy\u00ebr\u00f8. g\u00efn-a dwaddi kiper gen y\u00ef gen da rumi ki b\u00eb\u00ebd\u00f8 m\u00f8 g\u00f6\u00f6g gen ki py\u00ebw akyel ga nyim\u00ebgg.", + "text": "Shilluk Dhanhø bĂ«ne ba anywølø e path ki bäng, ge pär ki yij bëëdø geki dyĂ«rø. gĂ¯n-a dwaddi kiper gen yĂ¯ gen da rumi ki bëëdø mø göög gen ki pyĂ«w akyel ga nyimĂ«gg.", "metadata": { "languages": [ "ind", @@ -8986,7 +8986,7 @@ { "type": "NarrativeText", "element_id": "9be888269d99ba5b9d4200b2a6d65346", - "text": "Shipibo-Conibo Jat\u00edbi joninra huetsa jonibaon yoiai ninc\u00e1resti iqui, jahueraquibi jaconmai iamaquin; jainoash jahuen queena jacon jahu\u00e9quibo ati jahuequescamabi iqui, tsonbira amayamatima iqui. Jaticashbira jascara aresti jacon shinanya iti jahuequescamabi iqui, jahuequescarainoash picota joni inonbi. Huestiora huestiorabora jahu\u00e9qui ati shinanya iqui; jainshon onanribique jahueratoqui jacon iqui jainoash jaconma iqui ishon. Ja copira huetsa jonibires inonbi non jato jaconharesti iqui, non huetsabi non acai quescaaquin.", + "text": "Shipibo-Conibo JatĂ­bi joninra huetsa jonibaon yoiai nincĂ¡resti iqui, jahueraquibi jaconmai iamaquin; jainoash jahuen queena jacon jahuĂ©quibo ati jahuequescamabi iqui, tsonbira amayamatima iqui. Jaticashbira jascara aresti jacon shinanya iti jahuequescamabi iqui, jahuequescarainoash picota joni inonbi. Huestiora huestiorabora jahuĂ©qui ati shinanya iqui; jainshon onanribique jahueratoqui jacon iqui jainoash jaconma iqui ishon. 
Ja copira huetsa jonibires inonbi non jato jaconharesti iqui, non huetsabi non acai quescaaquin.", "metadata": { "languages": [ "cat", @@ -9030,7 +9030,7 @@ { "type": "NarrativeText", "element_id": "98765accca3aa276e32acc6ddb665f01", - "text": "Shor \u041f\u0430\u0440\u0447\u044b\u043d \u043a\u0438\u0436\u0438, \u043f\u043e \u0447\u0430\u0440\u044b\u049b\u049b\u0430 \u0442\u0443\u0493\u0447\u0430\u0434\u044b\u043f, \u0442\u0435\u04a3, \u043f\u043e\u0448 \u0442\u0443\u0493\u0447\u0430. \u041a\u0438\u0436\u0438\u043b\u0435\u0440 \u0441\u0430\u0493\u044b\u0448\u0442\u044b\u0493, \u0430\u049b\u0442\u044b\u0493 \u0442\u0443\u0493\u0447\u0430\u043b\u0430\u0440, \u043a\u0438\u0436\u0438\u043b\u0435\u0440\u0433\u0435 \u043f\u0430\u0448\u049b\u0430 \u043a\u0438\u0436\u0438\u043b\u0435\u0440\u0431\u0435 \u0430\u0440\u0493\u044b\u0448\u0442\u0430\u043d\u044b\u0448\u0442\u0430\u0440\u0493\u0430 \u043a\u0435\u0440\u0435\u043a.", + "text": "Shor ĐŸĐ°Ñ€Ñ‡Ñ‹Đ½ ĐºĐ¸Đ¶Đ¸, Đ¿Đ¾ Ñ‡Đ°Ñ€Ñ‹̉›̉›Đ° ту̉“Ñ‡Đ°Đ´Ñ‹Đ¿, Ñ‚Đµ̉£, Đ¿Đ¾Ñˆ ту̉“Ñ‡Đ°. ĐĐ¸Đ¶Đ¸Đ»ĐµÑ€ Ñа̉“ышты̉“, а̉›Ñ‚Ñ‹̉“ ту̉“Ñ‡Đ°Đ»Đ°Ñ€, ĐºĐ¸Đ¶Đ¸Đ»ĐµÑ€Đ³Đµ Đ¿Đ°Ñˆ̉›Đ° ĐºĐ¸Đ¶Đ¸Đ»ĐµÑ€Đ±Đµ Đ°Ñ€̉“Ñ‹ÑˆÑ‚Đ°Đ½Ñ‹ÑˆÑ‚Đ°Ñ€̉“а ĐºĐµÑ€ĐµĐº.", "metadata": { "languages": [ "rus" @@ -9051,7 +9051,7 @@ { "type": "NarrativeText", "element_id": "06b44e2713d2ab9cbfdbffecc788465a", - "text": "Shuar Aents yaj\u00e1 nunkanam ak\u00ednia asamtaish, metekrak ainiaji. Tumasha ni chichamenka tuke amiktin a\u00edniawai. Ni iniakmamuri, n\u00ed chichamejaituke aniakmamsar chichakartin a\u00edniawai. Tuma asamtai aents mash nekawar, penker metekrak, nuamtak war\u00e1 warat shiir pujusarmi tusar a\u00e1rma awai.", + "text": "Shuar Aents yajĂ¡ nunkanam akĂ­nia asamtaish, metekrak ainiaji. Tumasha ni chichamenka tuke amiktin aĂ­niawai. Ni iniakmamuri, nĂ­ chichamejaituke aniakmamsar chichakartin aĂ­niawai. 
Tuma asamtai aents mash nekawar, penker metekrak, nuamtak warĂ¡ warat shiir pujusarmi tusar aĂ¡rma awai.", "metadata": { "languages": [ "ind", @@ -9073,7 +9073,7 @@ { "type": "NarrativeText", "element_id": "8e0cb1b65226a998ba0e2831e44dbe49", - "text": "Sidamo Manchi beetti kalaqamunni wolaphinoho. Ayirrinyunninna qoossotennino taaloho. Huwatanno tiiano kalaqamunni ba\u2019raarinoha ikkasinni mittu wolu ledo rodiimmate ayyaaninni hee\u2019ra noosi.", + "text": "Sidamo Manchi beetti kalaqamunni wolaphinoho. Ayirrinyunninna qoossotennino taaloho. Huwatanno tiiano kalaqamunni ba’raarinoha ikkasinni mittu wolu ledo rodiimmate ayyaaninni hee’ra noosi.", "metadata": { "languages": [ "fin", @@ -9096,7 +9096,7 @@ { "type": "NarrativeText", "element_id": "1129172b2baa1c40a3ab800d0d28f02b", - "text": "Sinhala \u0dc3\u0dd2\u0dba\u0dbd\u0dd4 \u0db8\u0db1\u0dd4\u0dc2\u0dca\u200d\u0dba\u0dba\u0ddd \u0db1\u0dd2\u0daf\u0dc4\u0dc3\u0dca\u0dc0 \u0d8b\u0db4\u0dad \u0dbd\u0db6\u0dcf \u0d87\u0dad. \u0d9c\u0dbb\u0dd4\u0dad\u0dca\u0dc0\u0dba\u0dd9\u0db1\u0dca \u0dc4\u0dcf \u0d85\u0dba\u0dd2\u0dad\u0dd2\u0dc0\u0dcf\u0dc3\u0dd2\u0d9a\u0db8\u0dca\u0dc0\u0dbd\u0dd2\u0db1\u0dca \u0dc3\u0db8\u0dcf\u0db1 \u0dc0\u0dd9\u0dad\u0dd2. \u0dba\u0dd4\u0d9a\u0dca\u0dad\u0dd2 \u0d85\u0dba\u0dd4\u0d9a\u0dca\u0dad\u0dd2 \u0db4\u0dd2\u0dc5\u0dd2\u0db6\u0db3 \u0dc4\u0dd0\u0d9f\u0dd3\u0db8\u0dd9\u0db1\u0dca \u0dc4\u0dcf \u0dc4\u0dd8\u0daf\u0dba \u0dc3\u0dcf\u0d9a\u0dca\u0dc2\u0dd2\u0dba\u0dd9\u0db1\u0dca \u0dba\u0dd4\u0dad\u0dca \u0d94\u0dc0\u0dd4\u0db1\u0dca, \u0d94\u0dc0\u0dd4\u0db1\u0ddc\u0dc0\u0dd4\u0db1\u0dca\u0da7 \u0dc3\u0dd0\u0dc5\u0d9a\u0dd2\u0dba \u0dba\u0dd4\u0dad\u0dca\u0dad\u0dda \u0dc3\u0dc4\u0ddd\u0daf\u0dbb\u0dad\u0dca\u0dc0\u0dba \u0db4\u0dd2\u0dc5\u0dd2\u0db6\u0db3 \u0dc4\u0dd0\u0d9f\u0dd3\u0db8\u0dd9\u0db1\u0dd2.", + "text": "Sinhala සියලු මනුෂà·â€à¶ºà¶ºà· නිදහසà·à·€ à¶‹à¶´à¶­ ලබ෠ඇත. ගරුතà·à·€à¶ºà·™à¶±à· හ෠අයිතිවà·à·ƒà·’à¶à¶¸à·à·€à¶½à·’න෠සමà·à¶± වෙති. 
යුà¶à·à¶­à·’ අයුà¶à·à¶­à·’ à¶´à·’à·…à·’à¶¶à¶³ à·„à·à¶Ÿà·“මෙන෠හ෠හෘදය à·ƒà·à¶à·à·‚ියෙන෠යුත෠ඔවුනà·, ඔවුනොවුනà·à¶§ à·ƒà·à·…à¶à·’ය යුතà·à¶­à· සහà·à¶¯à¶»à¶­à·à·€à¶º à¶´à·’à·…à·’à¶¶à¶³ à·„à·à¶Ÿà·“මෙනි.", "metadata": { "filetype": "text/plain", "data_source": { @@ -9114,7 +9114,7 @@ { "type": "NarrativeText", "element_id": "7f18ad35feab9b6f20b97d87856143c8", - "text": "Siona Sia'bai\u0331 aideo'y\u00eb goa'ye beoye gu\u0331i'ne sia'y\u00eb'qu\u00eb maca bai'ye gu\u0331i'ne qu\u00ebco baye co\u0331ni, ja\u0331je\u0331 goachase's\u00ebte goa'ju\u0331i'\u00f1e beoye \u00f1ese saiye bai'ji co\u0331caij\u00eb yequ\u00eb bai\u0331reje.", + "text": "Siona Sia'baì± aideo'yĂ« goa'ye beoye gù±i'ne sia'yĂ«'quĂ« maca bai'ye gù±i'ne quĂ«co baye cò±ni, jà±jè± goachase'sĂ«te goa'jù±i'ñe beoye ñese saiye bai'ji cò±caijĂ« yequĂ« baì±reje.", "metadata": { "languages": [ "sqi", @@ -9137,7 +9137,7 @@ { "type": "NarrativeText", "element_id": "c82f4633a9724d1de7dfe866d1429080", - "text": "Slovak V\u0161etci \u013eudia sa rodia slobodn\u00ed a sebe rovn\u00ed , \u010do sa t\u00fdka ich dostojnosti a pr\u00e1v. S\u00fa obdaren\u00ed rozumom a maj\u00fa navz\u00e1jom jedna\u0165 v bratskom duchu.", + "text": "Slovak VÅ¡etci ľudia sa rodia slobodnĂ­ a sebe rovnĂ­ , Äo sa tĂ½ka ich dostojnosti a prĂ¡v. SĂº obdarenĂ­ rozumom a majĂº navzĂ¡jom jednaÅ¥ v bratskom duchu.", "metadata": { "languages": [ "slk" @@ -9200,7 +9200,7 @@ { "type": "NarrativeText", "element_id": "5d86d8cbc9dda45558ccf60a3974e66a", - "text": "Soninke Haadama renme su saareyen \u014ba an na du-kitten \u00f1a, an nta sere komaaxu, an do soron su yan yekka dorontaaxu do taqu. Haqilen, wa sere su, a do soro kuttu nan siri terene doome kappalengaaxu kanma.", + "text": "Soninke Haadama renme su saareyen Å‹a an na du-kitten ña, an nta sere komaaxu, an do soron su yan yekka dorontaaxu do taqu. 
Haqilen, wa sere su, a do soro kuttu nan siri terene doome kappalengaaxu kanma.", "metadata": { "languages": [ "som", @@ -9223,7 +9223,7 @@ { "type": "NarrativeText", "element_id": "2254a39b8eef4c825a973c26eb9364c9", - "text": "Sorbian, Upper W\u0161itcy \u010d\u0142owjekojo su wot naroda swobodni a su jenacy po dostojnos\u0107i a prawach. Woni su z rozumom a sw\u011bdomjom wobdarjeni a maja mjezsobu w duchu bratrowstwa wobchad\u017ae\u0107.", + "text": "Sorbian, Upper WÅ¡itcy ÄÅ‚owjekojo su wot naroda swobodni a su jenacy po dostojnosći a prawach. Woni su z rozumom a swÄ›domjom wobdarjeni a maja mjezsobu w duchu bratrowstwa wobchadźeć.", "metadata": { "languages": [ "pol", @@ -9245,7 +9245,7 @@ { "type": "NarrativeText", "element_id": "f6b37545577a2f9471636b40acbc5bf3", - "text": "Sotho, Northern Batho ka moka ba belegwe ba lokologile le gona ba na le seriti sa go lekana le ditokelo. Ba filwe monagano le letswalo mme ba swanet\u0161e go swarana ka moya wa bana ba mpa.", + "text": "Sotho, Northern Batho ka moka ba belegwe ba lokologile le gona ba na le seriti sa go lekana le ditokelo. 
Ba filwe monagano le letswalo mme ba swanetÅ¡e go swarana ka moya wa bana ba mpa.", "metadata": { "languages": [ "tgl", @@ -9292,7 +9292,7 @@ { "type": "UncategorizedText", "element_id": "51733b425e93924dbea419a28d2ee3d2", - "text": "South Azerbaijani Tu\u0308m insanlar hu\u0308r do\u0308g\u0306arlar, hak ve onur bak\u0131m\u0131ndan es\u0327it do\u0308g\u0306arlar, onlar ak\u0131l ve vicdana sahiptirler ve birbirlerine kars\u0327\u0131 kardes\u0327lik ruhu ic\u0327inde davranmal\u0131lar.", + "text": "South Azerbaijani Tùˆm insanlar hùˆr dòˆg̀†arlar, hak ve onur bakımından es̀§it dòˆg̀†arlar, onlar akıl ve vicdana sahiptirler ve birbirlerine kars̀§Ä± kardes̀§lik ruhu ic̀§inde davranmalılar.", "metadata": { "languages": [ "tur" @@ -9313,7 +9313,7 @@ { "type": "NarrativeText", "element_id": "7c2e8d871037d3d152d88dc5510cb236", - "text": "Spanish Todos los seres humanos nacen libres e iguales en dignidad y derechos y, dotados como est\u00e1n de raz\u00f3n y conciencia, deben comportarse fraternalmente los unos con los otros.", + "text": "Spanish Todos los seres humanos nacen libres e iguales en dignidad y derechos y, dotados como estĂ¡n de razĂ³n y conciencia, deben comportarse fraternalmente los unos con los otros.", "metadata": { "languages": [ "spa" @@ -9334,7 +9334,7 @@ { "type": "NarrativeText", "element_id": "816bdd2e0af6f8cc514fe60150f4714b", - "text": "Spanish (resolution) Todos los seres humanos nacen libres e iguales en dignidad y derechos y, dotados como est\u00e1n de raz\u00f3n y conciencia, deben comportarse fraternalmente los unos con los otros.", + "text": "Spanish (resolution) Todos los seres humanos nacen libres e iguales en dignidad y derechos y, dotados como estĂ¡n de razĂ³n y conciencia, deben comportarse fraternalmente los unos con los otros.", "metadata": { "languages": [ "spa" @@ -9398,7 +9398,7 @@ { "type": "NarrativeText", "element_id": "cf93d32f84284c9d205953f2720290ba", - "text": "Susu Adamadie birin barixin\u025b e lan y\u025bt\u025bralui 
kui, y\u025bt\u025b kolonyi nun y\u025bt\u025b suxu kima. Fondoe nun faxamui na e b\u025b boresuxu kima bariboreya fanyi kui.", + "text": "Susu Adamadie birin barixinÉ› e lan yÉ›tÉ›ralui kui, yÉ›tÉ› kolonyi nun yÉ›tÉ› suxu kima. Fondoe nun faxamui na e bÉ› boresuxu kima bariboreya fanyi kui.", "metadata": { "languages": [ "som", @@ -9464,7 +9464,7 @@ { "type": "NarrativeText", "element_id": "962be1c35a09978ec0be3e93852b6925", - "text": "Swedish Alla m\u00e4nniskor \u00e4ro f\u00f6dda fria och lika i v\u00e4rde och r\u00e4ttigheter. De \u00e4ro utrustade med f\u00f6rnuft och samvete och b\u00f6ra handla gentemot varandra i en anda av broderskap.", + "text": "Swedish Alla människor äro födda fria och lika i värde och rättigheter. De äro utrustade med förnuft och samvete och böra handla gentemot varandra i en anda av broderskap.", "metadata": { "languages": [ "swe" @@ -9506,7 +9506,7 @@ { "type": "NarrativeText", "element_id": "1fbce46911c4817cf2f0bf0db19d2f32", - "text": "Tagalog (Tagalog) \u1700\u1705 \u170e\u1711\u1706\u1714 \u1705 \u1706\u1702\u170c\u1714 \u1701\u1710\u1712\u1708\u1712\u170e\u1705 \u1708 \u170b\u170e\u170c \u1700\u1706\u1714 \u1709\u1708\u1714\u1706\u170c\u1714 \u1709\u1708\u1714\u1706\u170c\u1714 \u1710 \u1703\u1707\u1705\u170e\u1708\u1714 \u1700\u1706\u1714 \u170b\u1714\u1704 \u1703\u1707\u1713\u1709\u1706\u1708\u1714\u1736 \u1710\u1712\u170e\u170c\u1714 \u1709\u1712\u1708\u1704\u1714\u1703\u170e\u1713\u170a\u1708\u1714 \u1705 \u1703\u1706\u1714\u170f\u1712\u1707\u1708\u1714 \u1700\u1706\u1714 \u170a\u1713\u1707\u1714\u1711\u1712 \u1700\u1706\u1714 \u1707\u1709\u1706\u1714 \u170b\u1704\u1714\u1709\u170e\u1704\u170c\u1708\u1714 \u1700\u1705 \u1701\u1710\u1706\u1714 \u1701\u1710 \u1710 \u1707\u1712\u170f \u1705 \u1709\u1704\u1714\u1703\u1703\u1709\u1706\u1712\u1707\u1708\u1714\u1736", + "text": "Tagalog (Tagalog) ᜀᜅ áœáœ‘ᜆ᜔ ᜅ ᜆᜂᜌ᜔ áœáœáœ’ᜈᜒáœáœ… ᜈ ᜋáœáœŒ ᜀᜆ᜔ ᜉᜈ᜔ᜆᜌ᜔ ᜉᜈ᜔ᜆᜌ᜔ ᜠᜃᜇᜅáœáœˆáœ” ᜀᜆ᜔ ᜋ᜔ᜄ ᜃᜇᜓᜉᜆᜈ᜔᜶ áœáœ’áœáœŒáœ” ᜉᜒᜈᜄ᜔ᜃáœáœ“áœáœˆáœ” ᜅ 
ᜃᜆ᜔áœáœ’ᜇᜈ᜔ ᜀᜆ᜔ áœáœ“ᜇ᜔ᜑᜒ ᜀᜆ᜔ ᜇᜉᜆ᜔ ᜋᜄ᜔ᜉáœáœ„ᜌᜈ᜔ ᜀᜅ áœáœáœ†áœ” áœáœ ᜠᜇᜒᜠᜅ ᜉᜄ᜔ᜃᜃᜉᜆᜒᜇᜈ᜔᜶", "metadata": { "filetype": "text/plain", "data_source": { @@ -9524,7 +9524,7 @@ { "type": "NarrativeText", "element_id": "f80202b3162be68cd2957c5c564ddc03", - "text": "Tahitian E fanauhia te t\u0101'\u0101to'ara'a o te ta'ata-tupu ma te ti'am\u0101 e te ti'amanara'a 'aifaito. Ua '\u012b te mana'o pa'ari e i te manava e ma te 'a'au taea'e 'oia ta ratou ha'a i rotop\u016b ia ratou iho, e ti'a ai;", + "text": "Tahitian E fanauhia te tÄ'Äto'ara'a o te ta'ata-tupu ma te ti'amÄ e te ti'amanara'a 'aifaito. Ua 'Ä« te mana'o pa'ari e i te manava e ma te 'a'au taea'e 'oia ta ratou ha'a i rotopÅ« ia ratou iho, e ti'a ai;", "metadata": { "languages": [ "ita" @@ -9545,7 +9545,7 @@ { "type": "UncategorizedText", "element_id": "b5b3558a1982151293ab4f2c745e943b", - "text": "Tai Dam \uaab9\uaa95\uaab8\uaa89 \uaa80\uaab1 \uaa8b\uaab4 \uaadb \uaa8e\uaab2\uaa89 \uaaae\uaaae\uaa80 \uaaa3\uaab1 \uaabb\uaaa0 \uaa81\uaab7 \uaabb\uaaac \uaabc\uaa92 \uaa95\uaab3 \uaa95\uaab1\uaa89 \uaa80\uaabe\uaa9a \uaab9\uaa8b\uaab7\uaa89 \uaa9d\uaab8\uaa89 \uaa95\uaaae\uaaa5 \uaaa9\uaabe \uaadb \uaab6\uaa94\uaa99 \uaaa0\uaab4 - \uaa8b\uaab4 \uaaac\uaaba \uaadb \uaabb\uaaa0 \uaa81\uaab7 \uaabb\uaaac \uaaa3\uaab2 \uaa81\uaaab\uaab8\uaa99 \uaa8e\uaab1\uaa89 \uaab6\uaa8e\uaaa3 \uaaa9\uaaba\uaa89 \uaab9\uaaa5\uaab8\uaa92 \uaadb \uaa80\uaabe\uaa9a \uaab9\uaaa5\uaab8\uaa92 \uaabb\uaa8a \uaa9a\uaab4\uaa99 \uaa80\uaabe\uaa9a \uaabc\uaa92 \uaab9\uaa9a\uaab7\uaa89 \uaa92\uaab2 \uaa80\uaabe\uaa9a \uaaab\uaab8\uaa80 \uaaad\uaab0\uaa80 \uaab5\uaa9d\uaa89 \uaab9\uaa8f\uaa89 \uaab9\uaaad\uaa99 \uaa92\uaab8\uaaab.", + "text": "Tai Dam ꪹꪕꪸꪉ ꪀꪱ ꪋꪴ ê«› êªêª²êª‰ ꪮꪮꪀ ꪣꪱ ꪻꪠ êªêª· ꪻꪬ ꪼꪒ ꪕꪳ ꪕꪱꪉ êª€êª¾êª êª¹êª‹êª·êª‰ êªêª¸êª‰ ꪕꪮꪥ ꪩꪾ ê«› ꪶꪔꪙ ꪠꪴ - ꪋꪴ ꪬꪺ ê«› ꪻꪠ êªêª· ꪻꪬ ꪣꪲ êªêª«êª¸êª™ êªêª±êª‰ ꪶêªêª£ ꪩꪺꪉ ꪹꪥꪸꪒ ê«› êª€êª¾êª êª¹êª¥êª¸êª’ êª»êª êªêª´êª™ êª€êª¾êª êª¼êª’ ꪹêªêª·êª‰ ꪒꪲ êª€êª¾êª êª«êª¸êª€ ꪭꪰꪀ ꪵêªêª‰ ꪹêªêª‰ ꪹꪭꪙ ꪒꪸꪫ.", "metadata": { 
"filetype": "text/plain", "data_source": { @@ -9563,7 +9563,7 @@ { "type": "NarrativeText", "element_id": "424be8d53e2447fd43a7df9c88610eb3", - "text": "Tajiki \u0422\u0430\u043c\u043e\u043c\u0438 \u043e\u0434\u0430\u043c\u043e\u043d \u043e\u0437\u043e\u0434 \u0432\u0430 \u0430\u0437 \u043b\u0438\u04b3\u043e\u0437\u0438 \u0448\u0430\u0440\u0430\u0444\u0443 \u04b3\u0443\u049b\u0443\u049b \u0431\u0430 \u04b3\u0430\u043c \u0431\u0430\u0440\u043e\u0431\u0430\u0440 \u0431\u0430 \u0434\u0443\u043d\u0451 \u043c\u0435\u043e\u044f\u043d\u0434. \u041e\u043d\u04b3\u043e \u0441\u043e\u04b3\u0438\u0431\u0438 \u0430\u049b\u043b\u0443 \u0432\u0438\u04b7\u0434\u043e\u043d\u0430\u043d\u0434 \u0432\u0430 \u0431\u043e\u044f\u0434 \u0431\u043e \u044f\u043a\u0434\u0438\u0433\u0430\u0440 \u043c\u0443\u043d\u043e\u0441\u0438\u0431\u0430\u0442\u0438 \u0431\u0430\u0440\u043e\u0434\u0430\u0440\u043e\u043d\u0430 \u0434\u043e\u0448\u0442\u0430 \u0431\u043e\u0448\u0430\u043d\u0434.", + "text": "Tajiki Đ¢Đ°Đ¼Đ¾Đ¼Đ¸ Đ¾Đ´Đ°Đ¼Đ¾Đ½ Đ¾Đ·Đ¾Đ´ Đ²Đ° аз ли̉³Đ¾Đ·Đ¸ ÑˆĐ°Ñ€Đ°Ñ„Ñƒ ̉³Ñƒ̉›Ñƒ̉› ба ̉³Đ°Đ¼ Đ±Đ°Ñ€Đ¾Đ±Đ°Ñ€ ба Đ´ÑƒĐ½Ñ‘ Đ¼ĐµĐ¾ÑĐ½Đ´. ĐĐ½̉³Đ¾ ÑĐ¾̉³Đ¸Đ±Đ¸ а̉›Đ»Ñƒ Đ²Đ¸̉·Đ´Đ¾Đ½Đ°Đ½Đ´ Đ²Đ° Đ±Đ¾ÑĐ´ Đ±Đ¾ ÑĐºĐ´Đ¸Đ³Đ°Ñ€ Đ¼ÑƒĐ½Đ¾ÑĐ¸Đ±Đ°Ñ‚Đ¸ Đ±Đ°Ñ€Đ¾Đ´Đ°Ñ€Đ¾Đ½Đ° Đ´Đ¾ÑˆÑ‚Đ° Đ±Đ¾ÑˆĐ°Đ½Đ´.", "metadata": { "languages": [ "mkd", @@ -9586,7 +9586,7 @@ { "type": "NarrativeText", "element_id": "30aa2c0edeca02853a028f15110a6827", - "text": "Talysh H\u0259mm\u0259 insonon b\u0259\u015ft\u0259 l\u0259yo\u011f\u0259ti iy\u0259n h\u0259xonro ozod iy\u0259n b\u0259rob\u0259r movard\u0259 bed\u0259n. \u00c7\u0259von \u015fuur iy\u0259n vicdon hese, \u0259ve ki, dey\u0259nd\u0131 m\u0131nasib\u0259t\u0259d\u0259 b\u0259n\u0259 b\u0131v\u0259 r\u0259ftor kard\u0259ninin.", + "text": "Talysh HÉ™mmÉ™ insonon bəştÉ™ lÉ™yoÄŸÉ™ti iyÉ™n hÉ™xonro ozod iyÉ™n bÉ™robÉ™r movardÉ™ bedÉ™n. 
Çəvon ÅŸuur iyÉ™n vicdon hese, É™ve ki, deyÉ™ndı mınasibÉ™tÉ™dÉ™ bÉ™nÉ™ bıvÉ™ rÉ™ftor kardÉ™ninin.", "metadata": { "languages": [ "tur" @@ -9607,7 +9607,7 @@ { "type": "UncategorizedText", "element_id": "615dde6386c8f1b795ccd07901216ce7", - "text": "Tamang, Eastern \u092e\u094d\u0939\u094b\u0915\u094d\u0915\u094b\u0928 (\u0917\u094b\u0926\u094b\u092a) \u0928\u094b\u0928 \u092e\u094d\u0939\u0940\u092e \u0915\u0947\u092a\u093e\u0928\u094d\u0939\u093e\u092a\u093e \u0939\u0947\u0928\u094d\u091b\u0947 \u0928\u0941\u0928 \u0939\u093e\u0919\u092a\u093e\u0919\u0935\u093e (\u0938\u094d\u0935\u0924\u0928\u094d\u0924\u094d\u0930) \u092f\u093e\u0919\u0935\u093e \u0939\u0940\u0928\u094d\u0928\u093e \u0964 \u0925\u0947 \u092e\u094d\u0939\u094b\u0915\u094d\u0915\u094b\u0928\u0932\u093e (\u0917\u094b\u0926\u094b\u092a\u0932\u093e) \u091a\u094d\u092f\u094b\u091a\u094d\u092f\u094b \u092f\u093e\u0919\u0924\u093e\u092e \u0925\u0947\u0928 \u092e\u0939\u0924\u094d\u0935 \u092e\u0941\u0932\u093e \u0964 \u0925\u0947\u0928\u0940\u0915\u093e\u0926\u0947\u0930\u0940 \u0938\u0947\u092e\u092c\u093e\u0919 (\u0935\u093f\u091a\u093e\u0930 \u0936\u0915\u094d\u0924\u093f) \u0926\u0947\u0928 \u0925\u0941-\u0938\u0947\u092e\u0938\u093e\u0919 \u092e\u0941\u092c\u093e\u0938\u0947 \u0925\u0947\u0928\u0940\u091c\u0941\u0917\u0941\u0938\u0947 \u0939\u094d\u0930\u093e\u0919\u0928\u094d\u0939\u093e\u0919\u0930\u0940 \u0928\u0941\u0928 \u0925\u0947\u0924\u094d\u092e\u093e\u0932\u093e \u0938\u0947\u092e\u0932\u0947\u0919\u092e\u094b\u0917\u094d\u092f\u093e\u092e\u094d\u0938\u0947 (\u092d\u0935\u0928\u093e\u092c\u093e\u091f) \u0917\u094d\u092f\u0947 \u0932\u093e\u0924\u094b\u092c\u093e\u0928 \u092e\u0941\u0932\u093e \u0964", + "text": "Tamang, Eastern मà¥à¤¹à¥‹à¤•à¥à¤•ोन (गोदोप) नोन मà¥à¤¹à¥€à¤® केपानà¥à¤¹à¤¾à¤ªà¤¾ हेनà¥à¤›à¥‡ नà¥à¤¨ हाङपाङवा (सà¥à¤µà¤¤à¤¨à¥à¤¤à¥à¤°) याङवा हीनà¥à¤¨à¤¾ । थे मà¥à¤¹à¥‹à¤•à¥à¤•ोनला (गोदोपला) à¤à¥à¤¯à¥‹à¤à¥à¤¯à¥‹ याङताम थेन महतà¥à¤µ मà¥à¤²à¤¾ । थेनीकादेरी सेमबाङ (विà¤à¤¾à¤° 
शकà¥à¤¤à¤¿) देन थà¥-सेमसाङ मà¥à¤¬à¤¾à¤¸à¥‡ थेनीजà¥à¤—à¥à¤¸à¥‡ हà¥à¤°à¤¾à¤™à¤¨à¥à¤¹à¤¾à¤™à¤°à¥€ नà¥à¤¨ थेतà¥à¤®à¤¾à¤²à¤¾ सेमलेङमोगà¥à¤¯à¤¾à¤®à¥à¤¸à¥‡ (भवनाबाट) गà¥à¤¯à¥‡ लातोबान मà¥à¤²à¤¾ ।", "metadata": { "languages": [ "nep" @@ -9628,7 +9628,7 @@ { "type": "NarrativeText", "element_id": "f484ee723443631e755f61ec59737260", - "text": "Tamazight, Central Atlas Imdanen, akken ma llan ttlalen d ilelliyen msawan di lh\u0323wer\u0323ma d yizerfan- ghur sen tamsakwit d l\u00e2quel u yessefk ad-tili tegmatt gar asen.", + "text": "Tamazight, Central Atlas Imdanen, akken ma llan ttlalen d ilelliyen msawan di lh̀£wer̀£ma d yizerfan- ghur sen tamsakwit d lĂ¢quel u yessefk ad-tili tegmatt gar asen.", "metadata": { "languages": [ "tur", @@ -9650,7 +9650,7 @@ { "type": "UncategorizedText", "element_id": "4fa699fe9b09ce455b4b7a0eceac23a4", - "text": "Tamazight, Central Atlas (Tifinagh) \u2d49\u2d4e\u2d37\u2d30\u2d4f\u2d3b\u2d4f, \u2d30\u2d3d\u2d3d\u2d3b\u2d4f \u2d4e\u2d30 \u2d4d\u2d4d\u2d30\u2d4f \u2d5c\u2d5c\u2d4d\u2d30\u2d4d\u2d3b\u2d4f \u2d37 \u2d49\u2d4d\u2d3b\u2d4d\u2d4d\u2d49\u2d62\u2d3b\u2d4f \u2d4e\u2d59\u2d30\u2d61\u2d30\u2d4f \u2d37\u2d49 \u2d4d\u2d43\u2d61\u2d3b\u2d55\u2d4e\u2d30 \u2d37 \u2d62\u2d49\u2d63\u2d3b\u2d54\u2d3c\u2d30\u2d4f-\u2d56\u2d53\u2d54 \u2d59\u2d3b\u2d4f \u2d5c\u2d30\u2d4e\u2d59\u2d30\u2d3d\u2d61\u2d49\u2d5c \u2d37 \u2d4d\u2d30\u2d47\u2d53\u2d3b\u2d4d \u2d53 \u2d62\u2d3b\u2d59\u2d59\u2d3b\u2d3c\u2d3d \u2d30\u2d37-\u2d5c\u2d49\u2d4d\u2d49 \u2d5c\u2d3b\u2d33\u2d4e\u2d30\u2d5c\u2d5c \u2d33\u2d30\u2d54 \u2d30\u2d59\u2d3b\u2d4f.", + "text": "Tamazight, Central Atlas (Tifinagh) ⵉâµâ´·â´°âµâ´»âµ, ⴰⴽⴽⴻⵠâµâ´° âµâµâ´°âµ ⵜⵜâµâ´°âµâ´»âµ â´· ⵉâµâ´»âµâµâµ‰âµ¢â´»âµ âµâµ™â´°âµ¡â´°âµ ⴷⵉ âµâµƒâµ¡â´»âµ•âµâ´° â´· ⵢⵉⵣⴻⵔⴼⴰâµ-ⵖⵓⵔ ⵙⴻⵠⵜⴰâµâµ™â´°â´½âµ¡âµ‰âµœ â´· âµâ´°âµ‡âµ“ⴻⵠⵓ ⵢⴻⵙⵙⴻⴼⴽ â´°â´·-ⵜⵉâµâµ‰ ⵜⴻⴳâµâ´°âµœâµœ ⴳⴰⵔ ⴰⵙⴻâµ.", "metadata": { "filetype": "text/plain", "data_source": { @@ -9668,7 +9668,7 @@ { "type": "UncategorizedText", "element_id": 
"c36059cd99076234366c10f07f278260", - "text": "Tamazight, Standard Morocan \u2d30\u2d54 \u2d37 \u2d5c\u2d5c\u2d4d\u2d30\u2d4d\u2d30\u2d4f \u2d4e\u2d49\u2d37\u2d37\u2d4f \u2d33\u2d30\u2d4f \u2d49\u2d4d\u2d3b\u2d4d\u2d4d\u2d49\u2d5c\u2d4f \u2d4e\u2d33\u2d30\u2d37\u2d37\u2d30\u2d4f \u2d56 \u2d61\u2d30\u2d37\u2d37\u2d53\u2d54 \u2d37 \u2d49\u2d63\u2d54\u2d3c\u2d30\u2d4f, \u2d62\u2d49\u2d4d\u2d49 \u2d30\u2d3d\u2d6f \u2d37\u2d30\u2d54\u2d59\u2d4f \u2d53\u2d4f\u2d4d\u2d4d\u2d49 \u2d37 \u2d53\u2d3c\u2d54\u2d30\u2d3d, \u2d49\u2d4d\u2d4d\u2d30 \u2d3c\u2d4d\u2d4d\u2d30 \u2d59\u2d4f \u2d30\u2d37 \u2d5c\u2d5c\u2d4e\u2d62\u2d30\u2d61\u2d30\u2d59\u2d4f \u2d4f\u2d33\u2d54\u2d30\u2d5c\u2d59\u2d4f \u2d59 \u2d5c\u2d30\u2d33\u2d4e\u2d30\u2d5c.", + "text": "Tamazight, Standard Morocan â´°âµ” â´· ⵜⵜâµâ´°âµâ´°âµ âµâµ‰â´·â´·âµ ⴳⴰⵠⵉâµâ´»âµâµâµ‰âµœâµ âµâ´³â´°â´·â´·â´°âµ âµ– ⵡⴰⴷⴷⵓⵔ â´· ⵉⵣⵔⴼⴰâµ, ⵢⵉâµâµ‰ ⴰⴽⵯ ⴷⴰⵔⵙⵠⵓâµâµâµâµ‰ â´· ⵓⴼⵔⴰⴽ, ⵉâµâµâ´° â´¼âµâµâ´° ⵙⵠⴰⴷ ⵜⵜâµâµ¢â´°âµ¡â´°âµ™âµ âµâ´³âµ”ⴰⵜⵙⵠⵙ ⵜⴰⴳâµâ´°âµœ.", "metadata": { "filetype": "text/plain", "data_source": { @@ -9686,7 +9686,7 @@ { "type": "NarrativeText", "element_id": "703b672337c499aededf6f6696d6522f", - "text": "Tamil \u0bae\u0ba9\u0bbf\u0ba4\u0baa\u0bcd \u0baa\u0bbf\u0bb1\u0bbf\u0bb5\u0bbf\u0baf\u0bbf\u0ba9\u0bb0\u0bcd \u0b9a\u0b95\u0bb2\u0bb0\u0bc1\u0bae\u0bcd \u0b9a\u0bc1\u0ba4\u0ba8\u0bcd\u0ba4\u0bbf\u0bb0\u0bae\u0bbe\u0b95\u0bb5\u0bc7 \u0baa\u0bbf\u0bb1\u0b95\u0bcd\u0b95\u0bbf\u0ba9\u0bcd\u0bb1\u0ba9\u0bb0\u0bcd; \u0b85\u0bb5\u0bb0\u0bcd\u0b95\u0bb3\u0bcd \u0bae\u0ba4\u0bbf\u0baa\u0bcd\u0baa\u0bbf\u0bb2\u0bc1\u0bae\u0bcd, \u0b89\u0bb0\u0bbf\u0bae\u0bc8\u0b95\u0bb3\u0bbf\u0bb2\u0bc1\u0bae\u0bcd \u0b9a\u0bae\u0bae\u0bbe\u0ba9\u0bb5\u0bb0\u0bcd\u0b95\u0bb3\u0bcd, \u0b85\u0bb5\u0bb0\u0bcd\u0b95\u0bb3\u0bcd \u0ba8\u0bbf\u0baf\u0bbe\u0baf\u0ba4\u0bcd\u0ba4\u0bc8\u0baf\u0bc1\u0bae\u0bcd \u0bae\u0ba9\u0b9a\u0bcd\u0b9a\u0bbe\u0b9f\u0bcd\u0b9a\u0bbf\u0baf\u0bc8\u0baf\u0bc1\u0bae\u0bcd 
\u0b87\u0baf\u0bb1\u0bcd\u0baa\u0ba3\u0bcd\u0baa\u0bbe\u0b95\u0baa\u0bcd \u0baa\u0bc6\u0bb1\u0bcd\u0bb1\u0bb5\u0bb0\u0bcd\u0b95\u0bb3\u0bcd. \u0b85\u0bb5\u0bb0\u0bcd\u0b95\u0bb3\u0bcd \u0b92\u0bb0\u0bc1\u0bb5\u0bb0\u0bc1\u0b9f\u0ba9\u0bca\u0bb0\u0bc1\u0bb5\u0bb0\u0bcd \u0b9a\u0b95\u0bcb\u0ba4\u0bb0 \u0b89\u0ba3\u0bb0\u0bcd\u0bb5\u0bc1\u0baa\u0bcd \u0baa\u0bbe\u0b99\u0bcd\u0b95\u0bbf\u0bb2\u0bcd \u0ba8\u0b9f\u0ba8\u0bcd\u0ba4\u0bc1\u0b95\u0bca\u0bb3\u0bcd\u0bb3\u0bb2\u0bcd \u0bb5\u0bc7\u0ba3\u0bcd\u0b9f\u0bc1\u0bae\u0bcd.", + "text": "Tamil மனிதப௠பிறிவியினர௠à®à®•லரà¯à®®à¯ à®à¯à®¤à®¨à¯à®¤à®¿à®°à®®à®¾à®•வே பிறகà¯à®•ினà¯à®±à®©à®°à¯; அவரà¯à®•ள௠மதிபà¯à®ªà®¿à®²à¯à®®à¯, உரிமைகளிலà¯à®®à¯ à®à®®à®®à®¾à®©à®µà®°à¯à®•ளà¯, அவரà¯à®•ள௠நியாயதà¯à®¤à¯ˆà®¯à¯à®®à¯ மனà®à¯à®à®¾à®Ÿà¯à®à®¿à®¯à¯ˆà®¯à¯à®®à¯ இயறà¯à®ªà®£à¯à®ªà®¾à®•ப௠பெறà¯à®±à®µà®°à¯à®•ளà¯. அவரà¯à®•ள௠ஒரà¯à®µà®°à¯à®Ÿà®©à¯à®°à¯à®µà®°à¯ à®à®•ோதர உணரà¯à®µà¯à®ªà¯ பாஙà¯à®•ில௠நடநà¯à®¤à¯à®•à¯à®³à¯à®³à®²à¯ வேணà¯à®Ÿà¯à®®à¯.", "metadata": { "languages": [ "tam" @@ -9707,7 +9707,7 @@ { "type": "NarrativeText", "element_id": "cd3e1810510aee192781e40eae1b0ddc", - "text": "Tamil (Sri Lanka) \u0bae\u0ba9\u0bbf\u0ba4\u0baa\u0bcd \u0baa\u0bbf\u0bb1\u0bbf\u0bb5\u0bbf\u0baf\u0bbf\u0ba9\u0bb0\u0bcd \u0b9a\u0b95\u0bb2\u0bb0\u0bc1\u0bae\u0bcd \u0b9a\u0bc1\u0ba4\u0ba8\u0bcd\u0ba4\u0bbf\u0bb0\u0bae\u0bbe\u0b95\u0bb5\u0bc7 \u0baa\u0bbf\u0bb1\u0b95\u0bcd\u0b95\u0bbf\u0ba9\u0bcd\u0bb1\u0ba9\u0bb0\u0bcd; \u0b85\u0bb5\u0bb0\u0bcd\u0b95\u0bb3\u0bcd \u0bae\u0ba4\u0bbf\u0baa\u0bcd\u0baa\u0bbf\u0bb2\u0bc1\u0bae\u0bcd, \u0b89\u0bb0\u0bbf\u0bae\u0bc8\u0b95\u0bb3\u0bbf\u0bb2\u0bc1\u0bae\u0bcd \u0b9a\u0bae\u0bae\u0bbe\u0ba9\u0bb5\u0bb0\u0bcd\u0b95\u0bb3\u0bcd, \u0b85\u0bb5\u0bb0\u0bcd\u0b95\u0bb3\u0bcd \u0ba8\u0bbf\u0baf\u0bbe\u0baf\u0ba4\u0bcd\u0ba4\u0bc8\u0baf\u0bc1\u0bae\u0bcd \u0bae\u0ba9\u0b9a\u0bcd\u0b9a\u0bbe\u0b9f\u0bcd\u0b9a\u0bbf\u0baf\u0bc8\u0baf\u0bc1\u0bae\u0bcd \u0b87\u0baf\u0bb1\u0bcd\u0baa\u0ba3\u0bcd\u0baa\u0bbe\u0b95\u0baa\u0bcd 
\u0baa\u0bc6\u0bb1\u0bcd\u0bb1\u0bb5\u0bb0\u0bcd\u0b95\u0bb3\u0bcd. \u0b85\u0bb5\u0bb0\u0bcd\u0b95\u0bb3\u0bcd \u0b92\u0bb0\u0bc1\u0bb5\u0bb0\u0bc1\u0b9f\u0ba9\u0bca\u0bb0\u0bc1\u0bb5\u0bb0\u0bcd \u0b9a\u0b95\u0bcb\u0ba4\u0bb0 \u0b89\u0ba3\u0bb0\u0bcd\u0bb5\u0bc1\u0baa\u0bcd \u0baa\u0bbe\u0b99\u0bcd\u0b95\u0bbf\u0bb2\u0bcd \u0ba8\u0b9f\u0ba8\u0bcd\u0ba4\u0bc1\u0b95\u0bca\u0bb3\u0bcd\u0bb3\u0bb2\u0bcd \u0bb5\u0bc7\u0ba3\u0bcd\u0b9f\u0bc1\u0bae\u0bcd.", + "text": "Tamil (Sri Lanka) மனிதப௠பிறிவியினர௠à®à®•லரà¯à®®à¯ à®à¯à®¤à®¨à¯à®¤à®¿à®°à®®à®¾à®•வே பிறகà¯à®•ினà¯à®±à®©à®°à¯; அவரà¯à®•ள௠மதிபà¯à®ªà®¿à®²à¯à®®à¯, உரிமைகளிலà¯à®®à¯ à®à®®à®®à®¾à®©à®µà®°à¯à®•ளà¯, அவரà¯à®•ள௠நியாயதà¯à®¤à¯ˆà®¯à¯à®®à¯ மனà®à¯à®à®¾à®Ÿà¯à®à®¿à®¯à¯ˆà®¯à¯à®®à¯ இயறà¯à®ªà®£à¯à®ªà®¾à®•ப௠பெறà¯à®±à®µà®°à¯à®•ளà¯. அவரà¯à®•ள௠ஒரà¯à®µà®°à¯à®Ÿà®©à¯à®°à¯à®µà®°à¯ à®à®•ோதர உணரà¯à®µà¯à®ªà¯ பாஙà¯à®•ில௠நடநà¯à®¤à¯à®•à¯à®³à¯à®³à®²à¯ வேணà¯à®Ÿà¯à®®à¯.", "metadata": { "languages": [ "tam" @@ -9728,7 +9728,7 @@ { "type": "NarrativeText", "element_id": "9e55ede50aefd9018f64126e5d20a259", - "text": "Tatar \u0411\u0430\u0440\u043b\u044b\u043a \u043a\u0435\u0448\u0435\u043b\u04d9\u0440 \u0434\u04d9 \u0430\u0437\u0430\u0442 \u04bb\u04d9\u043c \u04af\u0437 \u0430\u0431\u0440\u0443\u0439\u043b\u0430\u0440\u044b \u04bb\u04d9\u043c \u0445\u043e\u043a\u0443\u043a\u043b\u0430\u0440\u044b \u044f\u0433\u044b\u043d\u043d\u0430\u043d \u0442\u0438\u04a3 \u0431\u0443\u043b\u044b\u043f \u0442\u0443\u0430\u043b\u0430\u0440. 
\u0410\u043b\u0430\u0440\u0433\u0430 \u0430\u043a\u044b\u043b \u04bb\u04d9\u043c \u0432\u04e9\u0497\u0434\u0430\u043d \u0431\u0438\u0440\u0435\u043b\u0433\u04d9\u043d \u04bb\u04d9\u043c \u0431\u0435\u0440-\u0431\u0435\u0440\u0441\u0435\u043d\u04d9 \u043a\u0430\u0440\u0430\u0442\u0430 \u0442\u0443\u0433\u0430\u043d\u0430\u0440\u0447\u0430 [\u0442\u0443\u0433\u0430\u043d\u043d\u0430\u0440\u0447\u0430] \u043c\u04e9\u043d\u04d9\u0441\u04d9\u0431\u04d9\u0442\u0442\u04d9 \u0431\u0443\u043b\u044b\u0440\u0433\u0430 \u0442\u0438\u0435\u0448\u043b\u04d9\u0440.", + "text": "Tatar Đ‘Đ°Ñ€Đ»Ñ‹Đº ĐºĐµÑˆĐµĐ»Ó™Ñ€ Đ´Ó™ Đ°Đ·Đ°Ñ‚ ̉»Ó™Đ¼ ̉¯Đ· Đ°Đ±Ñ€ÑƒĐ¹Đ»Đ°Ñ€Ñ‹ ̉»Ó™Đ¼ Ñ…Đ¾ĐºÑƒĐºĐ»Đ°Ñ€Ñ‹ ÑĐ³Ñ‹Đ½Đ½Đ°Đ½ Ñ‚Đ¸̉£ Đ±ÑƒĐ»Ñ‹Đ¿ Ñ‚ÑƒĐ°Đ»Đ°Ñ€. ĐĐ»Đ°Ñ€Đ³Đ° Đ°ĐºÑ‹Đ» ̉»Ó™Đ¼ Đ²Ó©̉—Đ´Đ°Đ½ Đ±Đ¸Ñ€ĐµĐ»Đ³Ó™Đ½ ̉»Ó™Đ¼ Đ±ĐµÑ€-Đ±ĐµÑ€ÑĐµĐ½Ó™ ĐºĐ°Ñ€Đ°Ñ‚Đ° Ñ‚ÑƒĐ³Đ°Đ½Đ°Ñ€Ñ‡Đ° [Ñ‚ÑƒĐ³Đ°Đ½Đ½Đ°Ñ€Ñ‡Đ°] Đ¼Ó©Đ½Ó™ÑÓ™Đ±Ó™Ñ‚Ñ‚Ó™ Đ±ÑƒĐ»Ñ‹Ñ€Đ³Đ° Ñ‚Đ¸ĐµÑˆĐ»Ó™Ñ€.", "metadata": { "languages": [ "rus" @@ -9749,7 +9749,7 @@ { "type": "NarrativeText", "element_id": "ca7b2ef61ad3e52b7b7873feb9ba85c1", - "text": "Telugu \u0c2a\u0c4d\u0c30\u0c24\u0c3f\u0c2a\u0c24\u0c4d\u0c24\u0c3f\u0c38\u0c4d\u0c35\u0c24\u0c4d\u0c35\u0c2e\u0c41\u0c32 \u0c35\u0c3f\u0c37\u0c2f\u0c2e\u0c41\u0c28 \u0c2e\u0c3e\u0c28\u0c35\u0c41\u0c32\u0c46\u0c32\u0c4d\u0c32\u0c30\u0c41\u0c28\u0c41 \u0c1c\u0c28\u0c4d\u0c2e\u0c24\u0c03 \u0c38\u0c4d\u0c35\u0c24\u0c02\u0c24\u0c4d\u0c30\u0c41\u0c32\u0c41\u0c28\u0c41 \u0c38\u0c2e\u0c3e\u0c28\u0c41\u0c32\u0c41\u0c28\u0c41 \u0c28\u0c17\u0c41\u0c26\u0c41\u0c30\u0c41. 
\u0c35\u0c3e\u0c30\u0c41 \u0c35\u0c3f\u0c35\u0c47\u0c1a\u0c28-\u0c05\u0c02\u0c24\u0c03\u0c15\u0c30\u0c23 \u0c38\u0c02\u0c2a\u0c28\u0c4d\u0c28\u0c41\u0c32\u0c17\u0c41\u0c1f\u0c1a\u0c47 \u0c2a\u0c30\u0c38\u0c4d\u0c2a\u0c30\u0c2e\u0c41 \u0c2d\u0c4d\u0c30\u0c3e\u0c24\u0c43\u0c2d\u0c3e\u0c35\u0c2e\u0c41\u0c24\u0c4b \u0c35\u0c30\u0c4d\u0c24\u0c3f\u0c02\u0c2a\u0c35\u0c32\u0c2f\u0c41\u0c28\u0c41.", + "text": "Telugu à°ªà±à°°à°¤à°¿à°ªà°¤à±à°¤à°¿à°¸à±à°µà°¤à±à°µà°®à±à°² విషయమà±à°¨ మానవà±à°²à±†à°²à±à°²à°°à±à°¨à± జనà±à°®à°¤à°ƒ à°¸à±à°µà°¤à°‚à°¤à±à°°à±à°²à±à°¨à± సమానà±à°²à±à°¨à± నగà±à°¦à±à°°à±. వారౠవివేà°à°¨-అంతఃకరణ సంపనà±à°¨à±à°²à°—à±à°Ÿà°à±‡ పరసà±à°ªà°°à°®à± à°­à±à°°à°¾à°¤à±ƒà°­à°¾à°µà°®à±à°¤à±‹ వరà±à°¤à°¿à°‚పవలయà±à°¨à±.", "metadata": { "languages": [ "tel" @@ -9770,7 +9770,7 @@ { "type": "NarrativeText", "element_id": "8947e9ec5ba76eabce3e2d1e59437be7", - "text": "Tem B\u00e1nl\u028ar\u028a\u0301\u028a \u0269r\u028a\u0301 b\u00e1a ween\u00ed na kez\u00e9\u0144b\u00ed\u00eddi g\u025b b\u0269ka b\u025bd\u025b\u0301\u025b \u0256\u0254\u0254z\u0269\u0301t\u0269 na y\u00edkow\u00e1 k\u025bg\u025b\u0301\u025b \u0256\u00e9y\u00ed-\u0256\u00e9y\u00ed g\u025b. B\u0254w\u025bn\u00e1 laak\u00e1r\u0269 na \u0269r\u028a\u0301t\u0269 b\u0269ka b\u0269\u0269b\u0254\u0301\u0254\u0301z\u0269 b\u0254c\u0254\u0254n\u00e1 \u0256am\u00e1 koob\u00edre c\u0254w\u028ar\u025b.", + "text": "Tem BĂ¡nlÊrỀÊ É©rỀ bĂ¡a weenĂ­ na kezĂ©Å„bĂ­Ă­di gÉ› bÉ©ka bÉ›dÉ›̀É› ɖɔɔzÉ©̀tÉ© na yĂ­kowĂ¡ kÉ›gÉ›̀É› É–Ă©yĂ­-É–Ă©yĂ­ gÉ›. 
BÉ”wÉ›nĂ¡ laakĂ¡rÉ© na É©rỀtÉ© bÉ©ka bɩɩbÉ”̀É”̀zÉ© bÉ”cɔɔnĂ¡ É–amĂ¡ koobĂ­re cÉ”wÊrÉ›.", "metadata": { "languages": [ "ces" @@ -9857,7 +9857,7 @@ { "type": "Title", "element_id": "70fb4fd148b0adc870bad4cf3a004e9e", - "text": "\u0e21\u0e19\u0e38\u0e29\u0e22\u0e4c\u0e17\u0e31\u0e49\u0e07\u0e2b\u0e25\u0e32\u0e22\u0e40\u0e01\u0e34\u0e14\u0e21\u0e32\u0e21\u0e35\u0e2d\u0e34\u0e2a\u0e23\u0e30\u0e41\u0e25\u0e30\u0e40\u0e2a\u0e21\u0e2d\u0e20\u0e32\u0e04\u0e01\u0e31\u0e19\u0e43\u0e19\u0e40\u0e01\u0e35\u0e22\u0e23\u0e15\u0e34\u0e28\u0e31\u0e01\u0e14[\u0e40\u0e01\u0e35\u0e22\u0e23\u0e15\u0e34\u0e28\u0e31\u0e01\u0e14\u0e34\u0e4c]\u0e41\u0e25\u0e30\u0e2a\u0e34\u0e17\u0e18\u0e34 \u0e15\u0e48\u0e32\u0e07\u0e21\u0e35\u0e40\u0e2b\u0e15\u0e38\u0e1c\u0e25\u0e41\u0e25\u0e30\u0e21\u0e42\u0e19\u0e18\u0e23\u0e23\u0e21 \u0e41\u0e25\u0e30\u0e04\u0e27\u0e23\u0e1b\u0e0f\u0e34\u0e1a\u0e31\u0e15\u0e34\u0e15\u0e48\u0e2d\u0e01\u0e31\u0e19\u0e14\u0e49\u0e27\u0e22\u0e40\u0e08\u0e15\u0e19\u0e32\u0e23\u0e21\u0e13\u0e4c\u0e41\u0e2b\u0e48\u0e07\u0e20\u0e23\u0e32\u0e14\u0e23\u0e20\u0e32\u0e1e", + "text": "มนุษย์ทั้งหลายเà¸à¸´à¸”มามีอิสระà¹à¸¥à¸°à¹€à¸ªà¸¡à¸­à¸ à¸²à¸„à¸à¸±à¸™à¹ƒà¸™à¹€à¸à¸µà¸¢à¸£à¸•ิศัà¸à¸”[เà¸à¸µà¸¢à¸£à¸•ิศัà¸à¸”ิ์]à¹à¸¥à¸°à¸ªà¸´à¸—ธิ ต่างมีเหตุผลà¹à¸¥à¸°à¸¡à¹‚นธรรม à¹à¸¥à¸°à¸„วรปà¸à¸´à¸à¸±à¸•ิต่อà¸à¸±à¸™à¸”้วยเจตนารมณ์à¹à¸«à¹ˆà¸‡à¸ à¸£à¸²à¸”รภาà¸", "metadata": { "languages": [ "tha" @@ -9899,7 +9899,7 @@ { "type": "Title", "element_id": "a4b136507e5ed6666129c7a44794fd18", - "text": "\u0e21\u0e19\u0e38\u0e29\u0e22\u0e4c\u0e17\u0e31\u0e49\u0e07\u0e1b\u0e27\u0e07\u0e40\u0e01\u0e34\u0e14\u0e21\u0e32\u0e21\u0e35\u0e2d\u0e34\u0e2a\u0e23\u0e30\u0e41\u0e25\u0e30\u0e40\u0e2a\u0e21\u0e2d\u0e20\u0e32\u0e04\u0e01\u0e31\u0e19\u0e43\u0e19\u0e28\u0e31\u0e01\u0e14\u0e34\u0e4c\u0e28\u0e23\u0e35\u0e41\u0e25\u0e30\u0e2a\u0e34\u0e17\u0e18\u0e34 \u0e15\u0e48\u0e32\u0e07\u0e43\u0e19\u0e15\u0e19\u0e21\u0e35\u0e40\u0e2b\u0e15\u0e38\u0e1c\u0e25\u0e41\u0e25\u0e30\u0e21\u0e42\u0e19\u0e18\u0e23\u0e23\u0e21 
\u0e41\u0e25\u0e30\u0e04\u0e27\u0e23\u0e1b\u0e0f\u0e34\u0e1a\u0e31\u0e15\u0e34\u0e15\u0e48\u0e2d\u0e01\u0e31\u0e19\u0e14\u0e49\u0e27\u0e22\u0e08\u0e34\u0e15\u0e27\u0e34\u0e0d\u0e0d\u0e32\u0e13\u0e41\u0e2b\u0e48\u0e07\u0e20\u0e23\u0e32\u0e14\u0e23\u0e20\u0e32\u0e1e", + "text": "มนุษย์ทั้งปวงเà¸à¸´à¸”มามีอิสระà¹à¸¥à¸°à¹€à¸ªà¸¡à¸­à¸ à¸²à¸„à¸à¸±à¸™à¹ƒà¸™à¸¨à¸±à¸à¸”ิ์ศรีà¹à¸¥à¸°à¸ªà¸´à¸—ธิ ต่างในตนมีเหตุผลà¹à¸¥à¸°à¸¡à¹‚นธรรม à¹à¸¥à¸°à¸„วรปà¸à¸´à¸à¸±à¸•ิต่อà¸à¸±à¸™à¸”้วยจิตวิà¸à¸à¸²à¸“à¹à¸«à¹ˆà¸‡à¸ à¸£à¸²à¸”รภาà¸", "metadata": { "languages": [ "tha" @@ -9920,7 +9920,7 @@ { "type": "NarrativeText", "element_id": "8f52798dd21c8472bda701088f7e82ca", - "text": "Themne A kom a\u014bf\u0259m ak\u0259pet b\u025b \u014ba ath\u0259n\u028cn\u025b yi r\u028cwankom. \u0186wa a\u014b ba m\u0259mari m\u0259th\u0259n\u028cn\u025b. \u0186wa a\u014b ba m\u0259fith yi t\u0259chemp. Chiya\u014b, a\u014b yi t\u0259k\u0259 gbasi a\u014bkos \u014ba\u014b m\u0254 k\u0259pa \u014ba t\u0259kom.", + "text": "Themne A kom aÅ‹fÉ™m akÉ™pet bÉ› Å‹a athÉ™nÊŒnÉ› yi rÊŒwankom. Ɔwa aÅ‹ ba mÉ™mari mÉ™thÉ™nÊŒnÉ›. Ɔwa aÅ‹ ba mÉ™fith yi tÉ™chemp. 
ChiyaÅ‹, aÅ‹ yi tÉ™kÉ™ gbasi aÅ‹kos Å‹aÅ‹ mÉ” kÉ™pa Å‹a tÉ™kom.", "metadata": { "languages": [ "swa", @@ -9963,7 +9963,7 @@ { "type": "Title", "element_id": "9ff7c25da02c27eefccdaca502af53c1", - "text": "\u0f60\u0f42\u0fb2\u0f7c\u0f0b\u0f56\u0f0b\u0f58\u0f72\u0f60\u0f72\u0f0b\u0f62\u0f72\u0f42\u0f66\u0f0b\u0f62\u0f92\u0fb1\u0f74\u0f51\u0f0b\u0f61\u0f7c\u0f44\u0f66\u0f0b\u0f63\u0f0b\u0f66\u0f90\u0fb1\u0f7a\u0f66\u0f0b\u0f59\u0f58\u0f0b\u0f49\u0f72\u0f51\u0f0b\u0f53\u0f66\u0f0b\u0f46\u0f7a\u0f0b\u0f58\u0f50\u0f7c\u0f44\u0f66\u0f0b\u0f51\u0f44\u0f0c\u0f0d \u0f50\u0f7c\u0f56\u0f0b\u0f50\u0f44\u0f42\u0f72\u0f0b\u0f62\u0f44\u0f0b\u0f51\u0f56\u0f44\u0f0b\u0f60\u0f51\u0fb2\u0f0b\u0f58\u0f49\u0f58\u0f0b\u0f51\u0f74\u0f0b\u0f61\u0f7c\u0f51\u0f0b\u0f63\u0f0d \u0f41\u0f7c\u0f44\u0f0b\u0f5a\u0f7c\u0f62\u0f0b\u0f62\u0f44\u0f0b\u0f56\u0fb1\u0f74\u0f44\u0f0b\u0f42\u0f72\u0f0b\u0f56\u0fb3\u0f7c\u0f0b\u0f62\u0fa9\u0f63\u0f0b\u0f51\u0f44\u0f0b\u0f56\u0f66\u0f58\u0f0b\u0f5a\u0f74\u0f63\u0f0b\u0f56\u0f5f\u0f44\u0f0b\u0f54\u0f7c\u0f0b\u0f60\u0f51\u0f7c\u0f53\u0f0b\u0f54\u0f60\u0f72\u0f0b\u0f60\u0f7c\u0f66\u0f0b\u0f56\u0f56\u0f66\u0f0b\u0f40\u0fb1\u0f44\u0f0b\u0f61\u0f7c\u0f51\u0f0d \u0f51\u0f7a\u0f0b\u0f56\u0f5e\u0f72\u0f53\u0f0b\u0f55\u0f53\u0f0b\u0f5a\u0f74\u0f53\u0f0b\u0f42\u0f45\u0f72\u0f42\u0f0b\u0f42\u0f72\u0f66\u0f0b\u0f42\u0f45\u0f72\u0f42\u0f0b\u0f63\u0f0b\u0f56\u0f74\u0f0b\u0f66\u0fa4\u0f74\u0f53\u0f0b\u0f42\u0fb1\u0f72\u0f0b\u0f60\u0f51\u0f74\u0f0b\u0f64\u0f7a\u0f66\u0f0b\u0f60\u0f5b\u0f72\u0f53\u0f0b\u0f54\u0f60\u0f72\u0f0b\u0f56\u0fb1\u0f0b\u0f66\u0fa4\u0fb1\u0f7c\u0f51\u0f0b\u0f40\u0fb1\u0f44\u0f0b\u0f63\u0f42\u0f0b\u0f63\u0f7a\u0f53\u0f0b\u0f56\u0f66\u0f9f\u0f62\u0f0b\u0f51\u0f42\u0f7c\u0f66\u0f0b\u0f54\u0f0b\u0f61\u0f72\u0f53\u0f0e", + "text": 
"འགྲོ་བ་མིའི་རིགས་རྒྱུད་ཡོངས་ལ་སà¾à¾±à½ºà½¦à¼‹à½™à½˜à¼‹à½‰à½²à½‘་ནས་ཆེ་མà½à½¼à½„ས་དང༌༠à½à½¼à½–་à½à½„གི་རང་དབང་འདྲ་མཉམ་དུ་ཡོད་ལ༠à½à½¼à½„་à½à½¼à½¢à¼‹à½¢à½„་བྱུང་གི་བློ་རྩལ་དང་བསམ་à½à½´à½£à¼‹à½–ཟང་པོ་འདོན་པའི་འོས་བབས་ཀྱང་ཡོད༠དེ་བà½à½²à½“་ཕན་à½à½´à½“་གཅིག་གིས་གཅིག་ལ་བུ་སྤུན་གྱི་འདུ་ཤེས་འཛིན་པའི་བྱ་སྤྱོད་ཀྱང་ལག་ལེན་བསྟར་དགོས་པ་ཡིནà¼", "metadata": { "filetype": "text/plain", "data_source": { @@ -9981,7 +9981,7 @@ { "type": "NarrativeText", "element_id": "8af88623529d7fac1f9e181cf1759b64", - "text": "Ticuna Ng\u1ebdxguma nabuxgu i du\u00fc\u0303x\u00fc\u0303g\u00fc r\u00fc gux\u00fc\u0303ma naw\u00fcxigu, r\u00fc tataxuma ya tex\u00e9 ya tog\u00fcar\u00fc yexera ix\u0129s\u1ebd. R\u00fc gux\u00fc\u0303ma nax\u00e3\u00e3\u1ebdg\u00fc r\u00fc ng\u1ebdmaca\u0331x r\u00fc name nix\u0129 na n\u00fcg\u00fcma\u00e3 namec\u00fcmax\u00fc\u0303 \u0129 gux\u00fc\u0303ma \u0129 du\u00fc\u0303x\u00fc\u0303g\u00fc.", + "text": "Ticuna Ngẽxguma nabuxgu i duĂ¼̀ƒxĂ¼̀ƒgĂ¼ rĂ¼ guxĂ¼̀ƒma nawĂ¼xigu, rĂ¼ tataxuma ya texĂ© ya togĂ¼arĂ¼ yexera ixÄ©sẽ. 
RĂ¼ guxĂ¼̀ƒma naxĂ£Ă£áº½gĂ¼ rĂ¼ ngẽmacà±x rĂ¼ name nixÄ© na nĂ¼gĂ¼maĂ£ namecĂ¼maxĂ¼̀ƒ Ä© guxĂ¼̀ƒma Ä© duĂ¼̀ƒxĂ¼̀ƒgĂ¼.", "metadata": { "languages": [ "tur", @@ -10005,7 +10005,7 @@ { "type": "UncategorizedText", "element_id": "3a1e54e52c1e8f2960b9f52ba81d5b61", - "text": "Tigrigna \u1265\u1218\u1295\u1345\u122d \u12ad\u1265\u122d\u1295 \u1218\u1230\u120d\u1295 \u12a9\u120e\u121d \u1230\u1263\u1275 \u12a5\u1295\u1275\u12cd\u1208\u12f1 \u1290\u1343\u1295 \u121b\u12d5\u122a\u1295 \u12a5\u12ee\u121d\u1361\u1361 \u121d\u1235\u1275\u12cd\u12d3\u120d\u1295 \u1215\u120d\u1293\u1295 \u12dd\u1270\u12d3\u12f0\u120e\u121d \u1265\u121d\u12c3\u1296\u121d \u1295\u1215\u12f5\u1215\u12f6\u121d \u1265\u1215\u12cd\u1290\u1273\u12ca \u1218\u1295\u1348\u1235 \u12ad\u1270\u1213\u120b\u1208\u12e9 \u12a6\u1208\u12ce\u121d\u1361\u1361", + "text": "Tigrigna ብመá•á…ር á­á‰¥áˆ­á• መሰáˆá• á©áˆáˆ ሰባት á¥á•ትá‹áˆˆá‹± ááƒá• ማዕሪᕠá¥á‹®áˆá¡á¡ áˆáˆµá‰µá‹á‹“áˆá• ሕáˆá“á• á‹á‰°á‹“á‹°áˆáˆ ብáˆá‹ƒá–ሠá•ሕድሕዶሠብሕá‹áታዠመá•áˆáˆµ á­á‰°áˆ“ላለዩ á¦áˆˆá‹áˆá¡á¡", "metadata": { "filetype": "text/plain", "data_source": { @@ -10045,7 +10045,7 @@ { "type": "NarrativeText", "element_id": "dce66eb1491ee0e05782cd7b4060bdf1", - "text": "Toba 'Enauac na naaxat shi\u1ef9axauapi na mayipi huesochiguii qataq 'eeta'a't da l'amaqchic qataq da 'enec qataq \u1ef9ataqta \u1ef9a\u1ef9ate'n naua lataxaco qataq nua no'o'n nvil\u1ef9axaco, qaq \u1ef9oqo'oyi iuen da i 'oonolec \u1ef9ataqta itauan ichoxoden ca l\u1ef9a", + "text": "Toba 'Enauac na naaxat shiỹaxauapi na mayipi huesochiguii qataq 'eeta'a't da l'amaqchic qataq da 'enec qataq ỹataqta ỹaỹate'n naua lataxaco qataq nua no'o'n nvilỹaxaco, qaq ỹoqo'oyi iuen da i 'oonolec ỹataqta itauan ichoxoden ca lỹa", "metadata": { "languages": [ "som", @@ -10067,7 +10067,7 @@ { "type": "NarrativeText", "element_id": "d4b675c94f0bd52682c828f5060488a5", - "text": "Tojolabal Spetsanal ja swinkil ja lu\u2019um k\u2019inali junxta wax jul schonjel, sok ja sijpanub\u2019ali, ja yuj ojni b\u2019ob\u2019 
sk\u2019u\u2019luk ja jas sk\u2019ana-i ja b\u2019as lekilali, ja yuj ja ay sk\u2019ujoli sok ay spensari t\u2019ilan oj yilsb\u2019aje lek sok ja smoj jumasa.", + "text": "Tojolabal Spetsanal ja swinkil ja lu’um k’inali junxta wax jul schonjel, sok ja sijpanub’ali, ja yuj ojni b’ob’ sk’u’luk ja jas sk’ana-i ja b’as lekilali, ja yuj ja ay sk’ujoli sok ay spensari t’ilan oj yilsb’aje lek sok ja smoj jumasa.", "metadata": { "languages": [ "slv", @@ -10135,7 +10135,7 @@ { "type": "NarrativeText", "element_id": "11c1506a0e4eb0a3616787ebc32828da", - "text": "Tongan Ko e kotoa \u2018o ha\u2019a tangata \u2018oku fanau\u2019i mai \u2018oku tau\u2019ataina pea tatau \u2018i he ngeia mo e ngaahi totonu. Na\u2019e fakanaunau\u2019i kinautolu \u2018aki \u2018a e \u2018atamai mo e konisenisi pea \u2018oku totonu ke nau feohi \u2018i he laumalie \u2018o e nofo fakatautehina.", + "text": "Tongan Ko e kotoa ‘o ha’a tangata ‘oku fanau’i mai ‘oku tau’ataina pea tatau ‘i he ngeia mo e ngaahi totonu. Na’e fakanaunau’i kinautolu ‘aki ‘a e ‘atamai mo e konisenisi pea ‘oku totonu ke nau feohi ‘i he laumalie ‘o e nofo fakatautehina.", "metadata": { "languages": [ "swa", @@ -10180,7 +10180,7 @@ { "type": "NarrativeText", "element_id": "49ac7c418a1a33c64e2c3e228669acea", - "text": "Tsonga (Mozambique) Vanhu hin'kwavu va psaliwili na va khululek\u00ecle, funthsi va fana hi lisima ni tinfaneno. V\u00e0 psaliwili ni nyiko ya ku pimisa ni ku yehleketa; hi kolahu, va fanela ku hanya hi moya wa umb\u00eclu ni unghani.", + "text": "Tsonga (Mozambique) Vanhu hin'kwavu va psaliwili na va khululekìle, funthsi va fana hi lisima ni tinfaneno. 
VĂ  psaliwili ni nyiko ya ku pimisa ni ku yehleketa; hi kolahu, va fanela ku hanya hi moya wa umbìlu ni unghani.", "metadata": { "languages": [ "swa" @@ -10245,7 +10245,7 @@ { "type": "NarrativeText", "element_id": "3ecfed863a5eed35ac7bcdc4f1ebcf6d", - "text": "Turkish B\u00fct\u00fcn insanlar h\u00fcr, haysiyet ve haklar bak\u0131m\u0131ndan e\u015fit do\u011farlar. Ak\u0131l ve vicdana sahiptirler ve birbirlerine kar\u015f\u0131 karde\u015flik zihniyeti ile hareket etmelidirler.", + "text": "Turkish BĂ¼tĂ¼n insanlar hĂ¼r, haysiyet ve haklar bakımından eÅŸit doÄŸarlar. Akıl ve vicdana sahiptirler ve birbirlerine karşı kardeÅŸlik zihniyeti ile hareket etmelidirler.", "metadata": { "languages": [ "tur" @@ -10266,7 +10266,7 @@ { "type": "NarrativeText", "element_id": "ec6b4429d4b16c9725f0f1420314a928", - "text": "Turkmen (Cyrillic) \u0425\u0435\u043c\u043c\u0435 \u0430\u0434\u0430\u043c\u043b\u0430\u0440 \u04e9\u0437 \u043c\u0435\u0440\u0442\u0435\u0431\u0435\u0441\u0438 \u0432\u0435 \u0445\u0443\u043a\u0443\u043a\u043b\u0430\u0440\u044b \u0431\u043e\u044e\u043d\u0447\u0430 \u0434\u0435\u04a3 \u044f\u0433\u0434\u0430\u0439\u0434\u0430 \u0434\u04af\u043d\u0439\u04d9 \u0438\u043d\u0439\u04d9\u0440\u043b\u0435\u0440. \u041e\u043b\u0430\u0440\u0430 \u0430\u04a3 \u0445\u0435\u043c \u0432\u044b\u0497\u0434\u0430\u043d \u0431\u0435\u0440\u043b\u0435\u043d\u0434\u0438\u0440 \u0432\u0435 \u043e\u043b\u0430\u0440 \u0431\u0438\u0440\u2010\u0431\u0438\u0440\u043b\u0435\u0440\u0438 \u0431\u0438\u043b\u0435\u043d \u0434\u043e\u0433\u0430\u043d\u043b\u044b\u043a \u0440\u0443\u0445\u0443\u043d\u0434\u0430\u043a\u044b \u0433\u0430\u0440\u0430\u0439\u044b\u0448\u0434\u0430 \u0431\u043e\u043b\u043c\u0430\u043b\u044b\u0434\u044b\u0440\u043b\u0430\u0440.", + "text": "Turkmen (Cyrillic) Đ¥ĐµĐ¼Đ¼Đµ Đ°Đ´Đ°Đ¼Đ»Đ°Ñ€ Ó©Đ· Đ¼ĐµÑ€Ñ‚ĐµĐ±ĐµÑи Đ²Đµ Ñ…ÑƒĐºÑƒĐºĐ»Đ°Ñ€Ñ‹ Đ±Đ¾ÑĐ½Ñ‡Đ° де̉£ ÑĐ³Đ´Đ°Đ¹Đ´Đ° Đ´̉¯Đ½Đ¹Ó™ Đ¸Đ½Đ¹Ó™Ñ€Đ»ĐµÑ€. 
ĐĐ»Đ°Ñ€Đ° а̉£ Ñ…ĐµĐ¼ Đ²Ñ‹̉—Đ´Đ°Đ½ Đ±ĐµÑ€Đ»ĐµĐ½Đ´Đ¸Ñ€ Đ²Đµ Đ¾Đ»Đ°Ñ€ Đ±Đ¸Ñ€â€Đ±Đ¸Ñ€Đ»ĐµÑ€Đ¸ Đ±Đ¸Đ»ĐµĐ½ Đ´Đ¾Đ³Đ°Đ½Đ»Ñ‹Đº Ñ€ÑƒÑ…ÑƒĐ½Đ´Đ°ĐºÑ‹ Đ³Đ°Ñ€Đ°Đ¹Ñ‹ÑˆĐ´Đ° Đ±Đ¾Đ»Đ¼Đ°Đ»Ñ‹Đ´Ñ‹Ñ€Đ»Đ°Ñ€.", "metadata": { "languages": [ "rus" @@ -10287,7 +10287,7 @@ { "type": "NarrativeText", "element_id": "27683edb29bca811bea3008052c0fc9f", - "text": "Turkmen (Latin) Adamlary\u0148 hemmesi azat dogul\u00fdarlar we \u00f6z mertebesi hem\u2010de hukuklary bo\u00fdun\u00e7a ilkiba\u015fdan de\u0148dirler. Olara ozal\u2010ba\u015fdan a\u0148, ynsap berlendir we biri\u2010birine \u00f6zara doganlyk ruhunda \u00e7emele\u015fmek olary\u0148 \u00fdara\u015fygydyr.", + "text": "Turkmen (Latin) Adamlaryň hemmesi azat dogulĂ½arlar we öz mertebesi hemâ€de hukuklary boĂ½unça ilkibaÅŸdan deňdirler. Olara ozalâ€baÅŸdan aň, ynsap berlendir we biriâ€birine özara doganlyk ruhunda çemeleÅŸmek olaryň Ă½araÅŸygydyr.", "metadata": { "languages": [ "tur" @@ -10308,7 +10308,7 @@ { "type": "NarrativeText", "element_id": "6b9f05c9e0fdf0e6de36b54f1c82f5d0", - "text": "Tuva \u0411\u04af\u0433\u04af \u043a\u0438\u0436\u0438\u043b\u0435\u0440 \u0445\u043e\u0441\u0442\u0443\u0433 \u0431\u0430\u0437\u0430 \u043c\u04e9\u0437\u04af\u0437\u04af \u0431\u043e\u043b\u0433\u0430\u0448 \u044d\u0440\u0433\u0435\u043b\u0435\u0440\u0438 \u0434\u0435\u04a3 \u043a\u044b\u043b\u0434\u044b\u0440 \u0442\u04e9\u0440\u04af\u0442\u0442\u04af\u043d\u0435\u0440. 
\u041e\u043b\u0430\u0440\u0433\u0430 \u0443\u0433\u0430\u0430\u043d\u0441\u0430\u0440\u044b\u044b\u043b \u0431\u043e\u043b\u0433\u0430\u0448 \u0430\u0440\u044b\u043d-\u043d\u04af\u04af\u0440 \u0431\u0435\u0440\u0434\u0438\u043d\u0433\u0435\u043d \u0431\u043e\u043b\u0443\u0440 \u0431\u043e\u043b\u0433\u0430\u0448 \u043e\u043b\u0430\u0440 \u0431\u043e\u0442-\u0431\u043e\u0442\u0442\u0430\u0440\u044b\u043d\u0433\u0430 \u0430\u043a\u044b-\u0434\u0443\u04a3\u043c\u0430\u043b\u044b\u0448\u043a\u044b \u0445\u0430\u043c\u0430\u0430\u0440\u044b\u043b\u0433\u0430\u043d\u044b \u043a\u04e9\u0440\u0433\u04af\u0437\u0435\u0440 \u0443\u0436\u0443\u0440\u043b\u0443\u0433.", + "text": "Tuva Đ‘̉¯Đ³̉¯ ĐºĐ¸Đ¶Đ¸Đ»ĐµÑ€ Ñ…Đ¾ÑÑ‚ÑƒĐ³ база Đ¼Ó©Đ·̉¯Đ·̉¯ Đ±Đ¾Đ»Đ³Đ°Ñˆ ÑÑ€Đ³ĐµĐ»ĐµÑ€Đ¸ де̉£ ĐºÑ‹Đ»Đ´Ñ‹Ñ€ төр̉¯Ñ‚Ñ‚̉¯Đ½ĐµÑ€. ĐĐ»Đ°Ñ€Đ³Đ° ÑƒĐ³Đ°Đ°Đ½ÑĐ°Ñ€Ñ‹Ñ‹Đ» Đ±Đ¾Đ»Đ³Đ°Ñˆ Đ°Ñ€Ñ‹Đ½-Đ½̉¯̉¯Ñ€ Đ±ĐµÑ€Đ´Đ¸Đ½Đ³ĐµĐ½ Đ±Đ¾Đ»ÑƒÑ€ Đ±Đ¾Đ»Đ³Đ°Ñˆ Đ¾Đ»Đ°Ñ€ Đ±Đ¾Ñ‚-Đ±Đ¾Ñ‚Ñ‚Đ°Ñ€Ñ‹Đ½Đ³Đ° Đ°ĐºÑ‹-Đ´Ñƒ̉£Đ¼Đ°Đ»Ñ‹ÑˆĐºÑ‹ Ñ…Đ°Đ¼Đ°Đ°Ñ€Ñ‹Đ»Đ³Đ°Đ½Ñ‹ ĐºÓ©Ñ€Đ³̉¯Đ·ĐµÑ€ ÑƒĐ¶ÑƒÑ€Đ»ÑƒĐ³.", "metadata": { "languages": [ "rus" @@ -10329,7 +10329,7 @@ { "type": "NarrativeText", "element_id": "527f7d8b2d19b7c6c3f2fadc70ada262", - "text": "Twi (Akuapem) W\u0254awo adesamma nyinaa s\u025b nnipa a w\u0254w\u0254 ahofadi. W\u0254n nyinaa w\u0254 nidi ne ky\u025bfa koro. W\u0254w\u0254 adwene ne ahonim, na \u025bs\u025b s\u025b wobu w\u0254n ho w\u0254n ho s\u025b anuanom.", + "text": "Twi (Akuapem) WÉ”awo adesamma nyinaa sÉ› nnipa a wÉ”wÉ” ahofadi. WÉ”n nyinaa wÉ” nidi ne kyÉ›fa koro. WÉ”wÉ” adwene ne ahonim, na É›sÉ› sÉ› wobu wÉ”n ho wÉ”n ho sÉ› anuanom.", "metadata": { "languages": [ "swa", @@ -10351,7 +10351,7 @@ { "type": "NarrativeText", "element_id": "aefbdde1da2ecc73208751b3c330bb3e", - "text": "Twi (Asante) Nnipa nyinaa y\u025b p\u025b. Na w\u0254de adwene ne nyansa na ab\u0254 obiara. \u0190no nti, \u025bs\u025b s\u025b obiara d\u0254 ne y\u0254nko, bu ne y\u0254nko, di ne y\u0254nko ni.", + "text": "Twi (Asante) Nnipa nyinaa yÉ› pÉ›. 
Na wÉ”de adwene ne nyansa na abÉ” obiara. Æno nti, É›sÉ› sÉ› obiara dÉ” ne yÉ”nko, bu ne yÉ”nko, di ne yÉ”nko ni.", "metadata": { "languages": [ "swa", @@ -10373,7 +10373,7 @@ { "type": "NarrativeText", "element_id": "4b0bd8eaae3f12feed9188c010027eb7", - "text": "Tzeltal, Oxchuc Spisil winiketik te ya xbejk\u00b4ajik ta k\u00b4inalil ay jrerechotik, mayuk mach\u00b4a chukul ya xbejka, ya jnatik stojol te jpisiltik ay snopibal sok sbijil joltik, ja\u00b4 me k\u00b4ux ya kaibatik ta jujun tul.", + "text": "Tzeltal, Oxchuc Spisil winiketik te ya xbejk´ajik ta k´inalil ay jrerechotik, mayuk mach´a chukul ya xbejka, ya jnatik stojol te jpisiltik ay snopibal sok sbijil joltik, ja´ me k´ux ya kaibatik ta jujun tul.", "metadata": { "languages": [ "ind", @@ -10395,7 +10395,7 @@ { "type": "NarrativeText", "element_id": "3a1d8b7b6302ae4de3c1c05a5c4f8fc7", - "text": "Tzotzil (Chamula) Skotol vinik o ants ta spejel balumile k\u2019olem x-hayan i ko\u2019ol ta sch\u2019ulal i sderechoetik i, skotol k\u2019ux-elan oyike oy srasonik y slekilalik, sventa skuxijik lekn\u00f3o ta ju jun ju ju vo.", + "text": "Tzotzil (Chamula) Skotol vinik o ants ta spejel balumile k’olem x-hayan i ko’ol ta sch’ulal i sderechoetik i, skotol k’ux-elan oyike oy srasonik y slekilalik, sventa skuxijik leknĂ³o ta ju jun ju ju vo.", "metadata": { "languages": [ "hrv", @@ -10418,7 +10418,7 @@ { "type": "NarrativeText", "element_id": "9c8ce1a1d4b031909f2b8d5c31bc3084", - "text": "Uduk Aris \u2019kwaniny\u2019ceshi \u2019baar mo dho\u2019thkunu \u2019ba\u1e35any mo dhali mmomiiya \u1e6fu\u2019c imon\u1e6fal \u2019de/ mo dhali mii ma \u1e35ar/e mo. Uni mini ta gi gwo mo dhali mii mo dhali uni mini mii ka karambuye/ \u2019kup\u0331 ki cin tiya mo e shi/in mo dhali mii kun tanu ikam mo.", + "text": "Uduk Aris ’kwaniny’ceshi ’baar mo dho’thkunu ’baḵany mo dhali mmomiiya ṯu’c imonṯal ’de/ mo dhali mii ma ḵar/e mo. 
Uni mini ta gi gwo mo dhali mii mo dhali uni mini mii ka karambuye/ ’kup̀± ki cin tiya mo e shi/in mo dhali mii kun tanu ikam mo.", "metadata": { "languages": [ "swa", @@ -10440,7 +10440,7 @@ { "type": "NarrativeText", "element_id": "35ad852b028b17863397cd23a741e776", - "text": "Ukrainian \u0412\u0441\u0456 \u043b\u044e\u0434\u0438 \u043d\u0430\u0440\u043e\u0434\u0436\u0443\u044e\u0442\u044c\u0441\u044f \u0432\u0456\u043b\u044c\u043d\u0438\u043c\u0438 \u0456 \u0440\u0456\u0432\u043d\u0438\u043c\u0438 \u0443 \u0441\u0432\u043e\u0457\u0439 \u0433\u0456\u0434\u043d\u043e\u0441\u0442\u0456 \u0442\u0430 \u043f\u0440\u0430\u0432\u0430\u0445. \u0412\u043e\u043d\u0438 \u043d\u0430\u0434\u0456\u043b\u0435\u043d\u0456 \u0440\u043e\u0437\u0443\u043c\u043e\u043c \u0456 \u0441\u043e\u0432\u0456\u0441\u0442\u044e \u0456 \u043f\u043e\u0432\u0438\u043d\u043d\u0456 \u0434\u0456\u044f\u0442\u0438 \u0443 \u0432\u0456\u0434\u043d\u043e\u0448\u0435\u043d\u043d\u0456 \u043e\u0434\u0438\u043d \u0434\u043e \u043e\u0434\u043d\u043e\u0433\u043e \u0432 \u0434\u0443\u0441\u0456 \u0431\u0440\u0430\u0442\u0435\u0440\u0441\u0442\u0432\u0430.", + "text": "Ukrainian Đ’ÑÑ– Đ»Ñди Đ½Đ°Ñ€Đ¾Đ´Đ¶ÑƒÑтьÑÑ Đ²Ñ–Đ»ÑŒĐ½Đ¸Đ¼Đ¸ Ñ– Ñ€Ñ–Đ²Đ½Đ¸Đ¼Đ¸ у ÑĐ²Đ¾Ñ—Đ¹ Đ³Ñ–Đ´Đ½Đ¾Ñті Ñ‚Đ° Đ¿Ñ€Đ°Đ²Đ°Ñ…. Đ’Đ¾Đ½Đ¸ Đ½Đ°Đ´Ñ–Đ»ĐµĐ½Ñ– Ñ€Đ¾Đ·ÑƒĐ¼Đ¾Đ¼ Ñ– ÑĐ¾Đ²Ñ–ÑÑ‚Ñ Ñ– Đ¿Đ¾Đ²Đ¸Đ½Đ½Ñ– Đ´Ñ–ÑÑ‚Đ¸ у Đ²Ñ–Đ´Đ½Đ¾ÑˆĐµĐ½Đ½Ñ– Đ¾Đ´Đ¸Đ½ Đ´Đ¾ Đ¾Đ´Đ½Đ¾Đ³Đ¾ Đ² Đ´ÑƒÑÑ– Đ±Ñ€Đ°Ñ‚ĐµÑ€ÑÑ‚Đ²Đ°.", "metadata": { "languages": [ "ukr" @@ -10461,7 +10461,7 @@ { "type": "NarrativeText", "element_id": "2da70f2c0e7850d3cb64606cb0479fc9", - "text": "Umbundu Omanu vosi vacitiwa valipwa kwenda valisoka kovina vyosikwenda komoko. Ovo vakwete esunga kwenda, kwenda olondunge kwenje ovo vat\u00eala okuliteywila kuvamwe kwenda vakwavo vesokolwilo lyocisola.", + "text": "Umbundu Omanu vosi vacitiwa valipwa kwenda valisoka kovina vyosikwenda komoko. 
Ovo vakwete esunga kwenda, kwenda olondunge kwenje ovo vatĂªla okuliteywila kuvamwe kwenda vakwavo vesokolwilo lyocisola.", "metadata": { "languages": [ "swa", @@ -10529,7 +10529,7 @@ { "type": "UncategorizedText", "element_id": "17e2b5b5c80c984c98843bbed39884c4", - "text": "Urdu \u062a\u0645\u0627\u0645 \u0627\u0646\u0633\u0627\u0646 \u0622\u0632\u0627\u062f \u0627\u0648\u0631 \u062d\u0642\u0648\u0642 \u0648 \u0639\u0632\u062a \u06a9\u06d2 \u0627\u0639\u062a\u0628\u0627\u0631 \u0633\u06d2 \u0628\u0631\u0627\u0628\u0631 \u067e\u06cc\u062f\u0627 \u06c1\u0648\u0626\u06d2 \u06c1\u06cc\u06ba\u06d4 \u0627\u0646\u06c1\u06cc\u06ba \u0636\u0645\u06cc\u0631 \u0627\u0648\u0631 \u0639\u0642\u0644 \u0648\u062f\u06cc\u0639\u062a \u06c1\u0648\u0626\u06cc \u06c1\u06d2\u06d4 \u0627\u0633 \u0644\u0626\u06d2 \u0627\u0646\u06c1\u06cc\u06ba \u0627\u06cc\u06a9 \u062f\u0648\u0633\u0631\u06d2 \u06a9\u06d2 \u0633\u0627\u062a\u06be \u0628\u06be\u0627\u0626\u06cc \u0686\u0627\u0631\u06d2 \u06a9\u0627 \u0633\u0644\u0648\u06a9 \u06a9\u0631\u0646\u0627 \u0686\u0627\u06c1\u06cc\u0626\u06d2\u06d4", + "text": "Urdu تمام انسان آزاد اور حقوق Ùˆ عزت Ú©Û’ اعتبار سے برابر پیدا Ûوئے Ûیں۔ انÛیں ضمیر اور عقل ودیعت Ûوئی ÛÛ’Û” اس لئے انÛیں ایک دوسرے Ú©Û’ ساتھ بھائی چارے کا سلوک کرنا چاÛیئے۔", "metadata": { "languages": [ "urd" @@ -10550,7 +10550,7 @@ { "type": "UncategorizedText", "element_id": "64062747e4a49e81a0ff7fe76c935f92", - "text": "Urdu (2) \u062a\u0645\u0627\u0645 \u0627\u0646\u0633\u0627\u0646 \u0622\u0632\u0627\u062f \u0627\u0648\u0631 \u062d\u0642\u0648\u0642 \u0648 \u0639\u0632\u062a \u06a9\u06d2 \u0627\u0639\u062a\u0628\u0627\u0631 \u0633\u06d2 \u0628\u0631\u0627\u0628\u0631 \u067e\u06cc\u062f\u0627 \u06c1\u0648\u0626\u06d2 \u06c1\u06cc\u06ba\u06d4 \u0627\u0646\u06c1\u06cc\u06ba \u0636\u0645\u06cc\u0631 \u0627\u0648\u0631 \u0639\u0642\u0644 \u0648\u062f\u06cc\u0639\u062a \u06c1\u0648\u0626\u06cc \u06c1\u06d2\u06d4 \u0627\u0633 \u0644\u06cc\u06d2 \u0627\u0646\u06c1\u06cc\u06ba 
\u0627\u06cc\u06a9 \u062f\u0648\u0633\u0631\u06d2 \u06a9\u06d2 \u0633\u0627\u062a\u06be \u0628\u06be\u0627\u0626\u06cc \u0686\u0627\u0631\u06d2 \u06a9\u0627 \u0633\u0644\u0648\u06a9 \u06a9\u0631\u0646\u0627 \u0686\u0627\u06c1\u06cc\u06d2\u06d4", + "text": "Urdu (2) تمام انسان آزاد اور حقوق Ùˆ عزت Ú©Û’ اعتبار سے برابر پیدا Ûوئے Ûیں۔ انÛیں ضمیر اور عقل ودیعت Ûوئی ÛÛ’Û” اس لیے انÛیں ایک دوسرے Ú©Û’ ساتھ بھائی چارے کا سلوک کرنا چاÛیے۔", "metadata": { "languages": [ "urd" @@ -10571,7 +10571,7 @@ { "type": "NarrativeText", "element_id": "c0f369076ccc7b4f6949b46f78e9c721", - "text": "Uyghur (Arabic) \u06be\u06d5\u0645\u0645\u06d5 \u0626\u0627\u062f\u06d5\u0645 \u0632\u0627\u0646\u0649\u062f\u0649\u0646\u0644\u0627 \u0626\u06d5\u0631\u0643\u0649\u0646\u060c \u0626\u0649\u0632\u0632\u06d5\u062a-\u06be\u06c6\u0631\u0645\u06d5\u062a \u06cb\u06d5 \u06be\u0648\u0642\u06c7\u0642\u062a\u0627 \u0628\u0627\u067e\u0628\u0627\u0631\u0627\u06cb\u06d5\u0631 \u0628\u0648\u0644\u06c7\u067e \u062a\u06c7\u063a\u06c7\u0644\u063a\u0627\u0646. \u0626\u06c7\u0644\u0627\u0631 \u0626\u06d5\u0642\u0649\u0644\u063a\u06d5 \u06cb\u06d5 \u06cb\u0649\u062c\u062f\u0627\u0646\u063a\u0627 \u0626\u0649\u06af\u06d5 \u06be\u06d5\u0645\u062f\u06d5 \u0628\u0649\u0631-\u0628\u0649\u0631\u0649\u06af\u06d5 \u0642\u06d0\u0631\u0649\u0646\u062f\u0627\u0634\u0644\u0649\u0642 \u0645\u06c7\u0646\u0627\u0633\u0649\u06cb\u0649\u062a\u0649\u06af\u06d5 \u062e\u0627\u0633 \u0631\u0648\u06be \u0628\u0649\u0644\u06d5\u0646 \u0645\u0648\u0626\u0627\u0645\u0649\u0644\u06d5 \u0642\u0649\u0644\u0649\u0634\u0649 \u0643\u06d0\u0631\u06d5\u0643.", + "text": "Uyghur (Arabic) ھەممە ئادەم زانىدىنلا ئەركىن، ئىززەت-ھۆرمەت Û‹Û• ھوقۇقتا باپباراۋەر بولۇپ تۇغۇلغان. 
ئۇلار ئەقىلغە Û‹Û• ۋىجدانغا ئىگە ھەمدە بىر-بىرىگە Ù‚ÛØ±Ù‰Ù†Ø¯Ø§Ø´Ù„ىق مۇناسىۋىتىگە خاس روھ بىلەن موئامىلە قىلىشى ÙƒÛØ±Û•Ùƒ.", "metadata": { "languages": [ "ara" @@ -10592,7 +10592,7 @@ { "type": "NarrativeText", "element_id": "c9695addaae400cf93180490aae4c5b8", - "text": "Uyghur (Latin) hemme adem zatidinla erkin, izzet-h\u00f6rmet we hoquqta babbarawer bolup tughulghan. ular eqilghe we wijdan'gha ige hemde bir-birige q\u00e9rindashliq munasiwitige xas roh bilen muamile qilishi k\u00e9rek.", + "text": "Uyghur (Latin) hemme adem zatidinla erkin, izzet-hörmet we hoquqta babbarawer bolup tughulghan. ular eqilghe we wijdan'gha ige hemde bir-birige qĂ©rindashliq munasiwitige xas roh bilen muamile qilishi kĂ©rek.", "metadata": { "languages": [ "nld", @@ -10616,7 +10616,7 @@ { "type": "NarrativeText", "element_id": "cf037543ae7e29089220134bd8d9fc80", - "text": "Uzbek, Northern (Cyrillic) \u0411\u0430\u0440\u0447\u0430 \u043e\u0434\u0430\u043c\u043b\u0430\u0440 \u044d\u0440\u043a\u0438\u043d, \u049b\u0430\u0434\u0440\u2010\u049b\u0438\u043c\u043c\u0430\u0442 \u0432\u0430 \u04b3\u0443\u049b\u0443\u049b\u043b\u0430\u0440\u0434\u0430 \u0442\u0435\u043d\u0433 \u0431\u045e\u043b\u0438\u0431 \u0442\u0443\u0493\u0438\u043b\u0430\u0434\u0438\u043b\u0430\u0440. \u0423\u043b\u0430\u0440 \u0430\u049b\u043b \u0432\u0430 \u0432\u0438\u0436\u0434\u043e\u043d \u0441\u043e\u04b3\u0438\u0431\u0438\u0434\u0438\u0440\u043b\u0430\u0440 \u0432\u0430 \u0431\u0438\u0440\u2010\u0431\u0438\u0440\u043b\u0430\u0440\u0438\u0433\u0430 \u0431\u0438\u0440\u043e\u0434\u0430\u0440\u043b\u0430\u0440\u0447\u0430 \u043c\u0443\u043e\u043c\u0430\u043b\u0430 \u049b\u0438\u043b\u0438\u0448\u043b\u0430\u0440\u0438 \u0437\u0430\u0440\u0443\u0440.", + "text": "Uzbek, Northern (Cyrillic) Đ‘Đ°Ñ€Ñ‡Đ° Đ¾Đ´Đ°Đ¼Đ»Đ°Ñ€ ÑÑ€ĐºĐ¸Đ½, ̉›Đ°Đ´Ñ€â€̉›Đ¸Đ¼Đ¼Đ°Ñ‚ Đ²Đ° ̉³Ñƒ̉›Ñƒ̉›Đ»Đ°Ñ€Đ´Đ° Ñ‚ĐµĐ½Đ³ бÑлиб ту̉“Đ¸Đ»Đ°Đ´Đ¸Đ»Đ°Ñ€. 
Đ£Đ»Đ°Ñ€ а̉›Đ» Đ²Đ° Đ²Đ¸Đ¶Đ´Đ¾Đ½ ÑĐ¾̉³Đ¸Đ±Đ¸Đ´Đ¸Ñ€Đ»Đ°Ñ€ Đ²Đ° Đ±Đ¸Ñ€â€Đ±Đ¸Ñ€Đ»Đ°Ñ€Đ¸Đ³Đ° Đ±Đ¸Ñ€Đ¾Đ´Đ°Ñ€Đ»Đ°Ñ€Ñ‡Đ° Đ¼ÑƒĐ¾Đ¼Đ°Đ»Đ° ̉›Đ¸Đ»Đ¸ÑˆĐ»Đ°Ñ€Đ¸ Đ·Đ°Ñ€ÑƒÑ€.", "metadata": { "languages": [ "mkd" @@ -10637,7 +10637,7 @@ { "type": "NarrativeText", "element_id": "f96f007fae71f3dbb5cf107a67339f62", - "text": "Uzbek, Northern (Latin) Barcha odamlar erkin, qadr\u2010qimmat va huquqlarda teng bo\u02bblib tug\u02bbiladilar. Ular aql va vijdon sohibidirlar va bir\u2010birlariga birodarlarcha muomala qilishlari zarur.", + "text": "Uzbek, Northern (Latin) Barcha odamlar erkin, qadrâ€qimmat va huquqlarda teng boÊ»lib tugÊ»iladilar. Ular aql va vijdon sohibidirlar va birâ€birlariga birodarlarcha muomala qilishlari zarur.", "metadata": { "languages": [ "tur", @@ -10659,7 +10659,7 @@ { "type": "NarrativeText", "element_id": "4309a801882998d4a87ec4393c62eb5b", - "text": "Vai \ua549\ua55c\ua56e \ua514\ua60b \ua5b8 \ua530 \ua5cb\ua60b \ua56e\ua568 \ua514\ua60b \ua5b8 \ua54e \ua549\ua5b8\ua54a \ua574\ua583 \ua543\ua524\ua602 \ua5f1, \ua549\ua5b7 \ua5ea\ua5e1 \ua53b\ua524 \ua5cf\ua5d2\ua5e1 \ua54e \ua5ea \ua549\ua5b8\ua54a \ua58f\ua54e. \ua549\ua561 \ua58f \ua5f3\ua56e\ua54a \ua5cf \ua56a \ua5d3 \ua549\ua5b7 \ua549\ua5b8 \ua558\ua55e \ua5ea. \ua58f\ua5b7 \ua549\ua5b8\ua527 \ua58f \ua5b8 \ua55a\ua54c\ua602 \ua5f7\ua524 \ua55e \ua603\ua5b7 \ua609\ua527 \ua5e0\ua5bb \ua55e \ua5b4\ua60b \ua533\ua569 \ua549\ua5b8 \ua5f3.", + "text": "Vai ꕉꕜꕮ ꔔꘋ ê–¸ ê”° ꗋꘋ ꕮꕨ ꔔꘋ ê–¸ ê• ê•‰ê–¸ê• ê•´ê–ƒ ꕃꔤꘂ ê—±, ꕉꖷ ꗪꗡ ꔻꔤ ê—ê—’ê—¡ ê• ê—ª ê•‰ê–¸ê• ê–ê•. ꕉꕡ ê– ê—³ê•®ê• ê— ê•ª ê—“ ꕉꖷ ꕉꖸ ê•˜ê• ê—ª. ê–ê–· ꕉꖸꔧ ê– ê–¸ ê•ꕌꘂ ꗷꔤ ê• ê˜ƒê–· ꘉꔧ ê— ê–» ê• ê–´ê˜‹ ꔳꕩ ꕉꖸ ê—³.", "metadata": { "filetype": "text/plain", "data_source": { @@ -10677,7 +10677,7 @@ { "type": "NarrativeText", "element_id": "8874ff5275f95f22ade2d05b19b84596", - "text": "Venda Vhathu vho\u1e71he vha bebwa vhe na mbofholowo nahone vha tshi lingana siani \u1e3da tshirunzi na pfanelo. 
Vhathu vho\u1e71he vho \u1e4bewa mihumbulo na mvalo ngauralo vha tea u konou farana sa vhathu vhathihi.", + "text": "Venda Vhathu vhoá¹±he vha bebwa vhe na mbofholowo nahone vha tshi lingana siani ḽa tshirunzi na pfanelo. Vhathu vhoá¹±he vho ṋewa mihumbulo na mvalo ngauralo vha tea u konou farana sa vhathu vhathihi.", "metadata": { "languages": [ "swa" @@ -10698,7 +10698,7 @@ { "type": "NarrativeText", "element_id": "1b4e3e7ad00ef96ec0938e98c22ac4d7", - "text": "Venda Vhathu vho\u1e71he vha bebwa vhe na mbofholowo nahone vha tshi lingana siani \u1e3da tshirunzi na pfanelo. Vhathu vho\u1e71he vho \u1e4bewa mihumbulo na mvalo ngauralo vha tea u konou farana sa vhathu vhathihi.", + "text": "Venda Vhathu vhoá¹±he vha bebwa vhe na mbofholowo nahone vha tshi lingana siani ḽa tshirunzi na pfanelo. Vhathu vhoá¹±he vho ṋewa mihumbulo na mvalo ngauralo vha tea u konou farana sa vhathu vhathihi.", "metadata": { "languages": [ "swa" @@ -10719,7 +10719,7 @@ { "type": "NarrativeText", "element_id": "57f8d88a5300439c2e78d95d9954dd1b", - "text": "Venetian Tuti i \u00e8sari umani i nase \u0142\u00ecbari e conpanji par dinjit\u00e0 e deriti. I ze dot\u00e0i de rajon e de cosiensa e i ga da conportarse intr\u00e0 de \u0142ori co sp\u00ecrito de frade\u0142i.", + "text": "Venetian Tuti i èsari umani i nase Å‚Ă¬bari e conpanji par dinjitĂ  e deriti. I ze dotĂ i de rajon e de cosiensa e i ga da conportarse intrĂ  de Å‚ori co spìrito de fradeÅ‚i.", "metadata": { "languages": [ "ita", @@ -10741,7 +10741,7 @@ { "type": "NarrativeText", "element_id": "bde94a10001841ef9fad0f19311e6fa9", - "text": "Veps Kaik mehed su\u0308nduba joudajin i kohtai\u017ein, u\u0308hteji\u010d\u010din i\u010deze arvokahudes i oiktusi\u0161. Heile om anttud mel\u2019 i huiktusentund i heile tari\u017e ko\u017euda toine toi\u017eenke kut vel\u2019l\u2019kundad.", + "text": "Veps Kaik mehed sùˆnduba joudajin i kohtaižin, ùˆhtejiÄÄin iÄeze arvokahudes i oiktusiÅ¡. 
Heile om anttud mel’ i huiktusentund i heile tariž kožuda toine toiženke kut vel’l’kundad.", "metadata": { "languages": [ "est", @@ -10763,7 +10763,7 @@ { "type": "NarrativeText", "element_id": "c6836fc94a9a2261da5605eae88ea21f", - "text": "Vietnamese T\u00e2\u0301t ca\u0309 mo\u0323i ng\u01b0\u01a1\u0300i sinh ra \u0111\u00ea\u0300u \u0111\u01b0\u01a1\u0323c t\u01b0\u0323 do va\u0300 bi\u0300nh \u0111\u0103\u0309ng v\u00ea\u0300 nh\u00e2n ph\u00e2\u0309m va\u0300 quy\u00ea\u0300n. Mo\u0323i con ng\u01b0\u01a1\u0300i \u0111\u00ea\u0300u \u0111\u01b0\u01a1\u0323c ta\u0323o hoa\u0301 ban cho ly\u0301 tri\u0301 va\u0300 l\u01b0\u01a1ng t\u00e2m va\u0300 c\u00e2\u0300n pha\u0309i \u0111\u00f4\u0301i x\u01b0\u0309 v\u01a1\u0301i nhau trong ti\u0300nh b\u0103\u0300ng h\u01b0\u0303u.", + "text": "Vietnamese TĂ¢̀t cà‰ mò£i ngườ€i sinh ra Ä‘Ăª̀€u đườ£c từ£ do và€ bì€nh đằ‰ng vĂª̀€ nhĂ¢n phĂ¢̀‰m và€ quyĂª̀€n. Mò£i con ngườ€i Ä‘Ăª̀€u đườ£c tà£o hoà ban cho lỳ trì và€ lương tĂ¢m và€ cĂ¢̀€n phà‰i đồi xừ‰ vÆ¡̀i nhau trong tì€nh bằ€ng hừƒu.", "metadata": { "languages": [ "vie" @@ -10805,7 +10805,7 @@ { "type": "Title", "element_id": "294055dfb0c1131395070d727e81fde6", - "text": "\u7562\u54ff\u6bcf\ud840\ude9b\u751f\ud841\udea2\u8abf\u5f97\u81ea\u7531\u5427\u5e73\u7b49\ud85d\uddf1\u4eba\u54c1\u5427\u6b0a\u3002\u6bcf\ud846\udd75\ud840\ude9b\u8abf\u5f97\u9020\u5316\u9812\u6731\u7406\u667a\u5427\u826f\u5fc3\u5427\u52e4\u6c9b\u5c0d\u8655\ud84a\udf72\u81ae\ud856\ude9d\u60c5\u670b\u53cb\u3002", + "text": "畢哿æ¯đ ›ç”Ÿđ ¢èª¿å¾—自由å§å¹³ç­‰đ§—±äººå“å§æ¬ă€‚æ¯đ¡¥µđ ›èª¿å¾—é€ åŒ–é ’æœ±ç†æ™ºå§è‰¯å¿ƒå§å‹¤æ²›å°è™•đ¢­²è†®đ¥ªæƒ…朋å‹ă€‚", "metadata": { "languages": [ "kor", @@ -10827,7 +10827,7 @@ { "type": "NarrativeText", "element_id": "4ab64de143568003ad62ca2cf3c8cda3", - "text": "Waama Yiriba na b\u00e0 sikindo dare b\u00e0 m\u025b\u025bri, da seena yirimma mii b\u00e0 ta da i n\u025bki b\u00e0 t\u0254\u0254ba.", + "text": "Waama Yiriba na bĂ  sikindo dare bĂ  mɛɛri, da seena yirimma mii bĂ  ta da 
i nÉ›ki bĂ  tɔɔba.", "metadata": { "languages": [ "som", @@ -10849,7 +10849,7 @@ { "type": "NarrativeText", "element_id": "88700f6c9f719c0f7ad537b0fe24d46d", - "text": "Walloon Tos l\u00e8s-omes vin\u00e8t-st-\u00e5 monde l\u00eebes, \u00e8t so-l'minme p\u00eed po \u00e7ou qu'\u00e8nn'\u00e8st d'leu dignit\u00e9 \u00e8t d'leus dre\u00fbts. I n'sont nin fo\u00fb r\u00eazon \u00e8t-z-ont-i le\u00fb consyince po z\u00e8ls, \u00e7ou qu'\u00e8lz\u00e8s de\u00fbt miner a s'kid\u00fbre onk' po l'\u00f4te tot come d\u00e8s fr\u00e9s.", + "text": "Walloon Tos lès-omes vinèt-st-Ă¥ monde lĂ®bes, èt so-l'minme pĂ®d po çou qu'ènn'èst d'leu dignitĂ© èt d'leus dreĂ»ts. I n'sont nin foĂ» rĂªzon èt-z-ont-i leĂ» consyince po zèls, çou qu'èlzès deĂ»t miner a s'kidĂ»re onk' po l'Ă´te tot come dès frĂ©s.", "metadata": { "languages": [ "fra" @@ -10913,7 +10913,7 @@ { "type": "NarrativeText", "element_id": "25c9bb862536e9e520792ea8724608de", - "text": "Wayuu Naa wayuukana jemeishi s\u00fcp\u00fcla taashi s\u00fcma wanawa s\u00fclu'u nakua'ipa, aka m\u00fcin yaa epijainjana s\u00fcnain anajiranawaa a'in nama nap\u00fcshi.", + "text": "Wayuu Naa wayuukana jemeishi sĂ¼pĂ¼la taashi sĂ¼ma wanawa sĂ¼lu'u nakua'ipa, aka mĂ¼in yaa epijainjana sĂ¼nain anajiranawaa a'in nama napĂ¼shi.", "metadata": { "languages": [ "swa" @@ -10934,7 +10934,7 @@ { "type": "NarrativeText", "element_id": "b4265fbb8924aeeb84569e7b2e4e3197", - "text": "Welsh Genir pawb yn rhydd ac yn gydradd \u00e2\u2019i gilydd mewn urddas a hawliau. Fe\u2019u cynysgaeddir \u00e2 rheswm a chydwybod, a dylai pawb ymddwyn y naill at y llall mewn ysbryd cymodlon.", + "text": "Welsh Genir pawb yn rhydd ac yn gydradd Ă¢â€™i gilydd mewn urddas a hawliau. 
Fe’u cynysgaeddir Ă¢ rheswm a chydwybod, a dylai pawb ymddwyn y naill at y llall mewn ysbryd cymodlon.", "metadata": { "languages": [ "cym" @@ -10955,7 +10955,7 @@ { "type": "NarrativeText", "element_id": "8799ac3c8264dbd02b24e5484e28ea2d", - "text": "Wolof Doomi aadama y\u00e9pp danuy juddu, yam ci tawfeex ci sag ak sa\u00f1-sa\u00f1. Nekk na it ku xam d\u00ebgg te \u00e0nd na ak xelam, te war naa j\u00ebflante ak nawleen, te teg ko ci w\u00e0llu mbokk.", + "text": "Wolof Doomi aadama yĂ©pp danuy juddu, yam ci tawfeex ci sag ak sañ-sañ. Nekk na it ku xam dĂ«gg te Ă nd na ak xelam, te war naa jĂ«flante ak nawleen, te teg ko ci wĂ llu mbokk.", "metadata": { "languages": [ "ind", @@ -11022,7 +11022,7 @@ { "type": "NarrativeText", "element_id": "b1da3b28878be3ee9c9045f0c9223c84", - "text": "Yakut \u0414\u044c\u043e\u043d \u0431\u0430\u0440\u044b\u0442\u0430 \u0431\u044d\u0439\u044d \u0441\u0443\u043e\u043b\u0442\u0430\u0442\u044b\u0433\u0430\u0440 \u0443\u043e\u043d\u043d\u0430 \u0431\u044b\u0440\u0430\u0430\u0431\u044b\u0433\u0430\u0440 \u0442\u044d\u04a5 \u0431\u0443\u043e\u043b\u0430\u043d \u0442\u04e9\u0440\u04af\u04af\u043b\u043b\u044d\u0440. 
\u041a\u0438\u043d\u0438\u043b\u044d\u0440 \u0431\u0430\u0440\u044b \u04e9\u0440\u043a\u04e9\u043d \u04e9\u0439\u0434\u04e9\u04e9\u0445, \u0441\u0443\u043e\u0431\u0430\u0441\u0442\u0430\u0430\u0445 \u0431\u0443\u043e\u043b\u0430\u043d \u0442\u04e9\u0440\u04af\u04af\u043b\u043b\u044d\u0440, \u0443\u043e\u043d\u043d\u0430 \u0431\u044d\u0439\u044d \u0431\u044d\u0439\u044d\u043b\u044d\u0440\u0438\u0433\u044d\u0440 \u0442\u044b\u043b\u0433\u0430 \u043a\u0438\u0438\u0440\u0438\u043d\u0438\u0433\u044d\u0441 \u0431\u044b\u04bb\u044b\u044b\u043b\u0430\u0440\u0430 \u0434\u043e\u0495\u043e\u0440\u0434\u043e\u04bb\u0443\u0443 \u0442\u044b\u044b\u043d\u043d\u0430\u0430\u0445 \u0431\u0443\u043e\u043b\u0443\u043e\u0445\u0442\u0430\u0430\u0445.", + "text": "Yakut Đ”ÑŒĐ¾Đ½ Đ±Đ°Ñ€Ñ‹Ñ‚Đ° бÑĐ¹Ñ ÑÑƒĐ¾Đ»Ñ‚Đ°Ñ‚Ñ‹Đ³Đ°Ñ€ ÑƒĐ¾Đ½Đ½Đ° Đ±Ñ‹Ñ€Đ°Đ°Đ±Ñ‹Đ³Đ°Ñ€ Ñ‚Ñ̉¥ Đ±ÑƒĐ¾Đ»Đ°Đ½ төр̉¯̉¯Đ»Đ»ÑÑ€. ĐĐ¸Đ½Đ¸Đ»ÑÑ€ Đ±Đ°Ñ€Ñ‹ Ó©Ñ€ĐºÓ©Đ½ Ó©Đ¹Đ´Ó©Ó©Ñ…, ÑÑƒĐ¾Đ±Đ°ÑÑ‚Đ°Đ°Ñ… Đ±ÑƒĐ¾Đ»Đ°Đ½ төр̉¯̉¯Đ»Đ»ÑÑ€, ÑƒĐ¾Đ½Đ½Đ° бÑĐ¹Ñ Đ±ÑĐ¹ÑĐ»ÑÑ€Đ¸Đ³ÑÑ€ Ñ‚Ñ‹Đ»Đ³Đ° ĐºĐ¸Đ¸Ñ€Đ¸Đ½Đ¸Đ³ÑÑ Đ±Ñ‹̉»Ñ‹Ñ‹Đ»Đ°Ñ€Đ° Đ´Đ¾̉•Đ¾Ñ€Đ´Đ¾̉»ÑƒÑƒ Ñ‚Ñ‹Ñ‹Đ½Đ½Đ°Đ°Ñ… Đ±ÑƒĐ¾Đ»ÑƒĐ¾Ñ…Ñ‚Đ°Đ°Ñ….", "metadata": { "languages": [ "rus" @@ -11043,7 +11043,7 @@ { "type": "NarrativeText", "element_id": "53f4d4779755796c4b53e9945f211ced", - "text": "Yanesha\u02bc Allohueney \u00f1e\u00f1tey arrom\u00f1atey att\u0303o ye'\u00f1alletyesa arr patsro e'\u00f1e att\u0303ecma cohuen yesherb\u0303a'yen. \u00d1am\u0303a yechyen allpon derechos att\u0303och e'\u00f1ech cohueno'tsa'yeney arr patsro. \u00d1am\u0303a allohuen att\u0303ecma yechyen alloch yoct\u0303ape' chyen cohuen \u00f1am\u0303a ye\u00f1otyen yeyoc\u0308hro \u00f1e\u00f1t \u0303e'ne pocte' enten ache\u00f1enesha' \u00f1am\u0303a \u00f1e\u00f1t \u0303ama pocteye' enteneto. 
Ye\u00f1ote\u00f1 a\u00f1 poctetsa e'\u00f1e yemo'nashe\u00f1 yep\u0303annena ama't ora allohuen allpon ache\u00f1enesha' \u00f1e\u00f1t \u0303a\u00f1e patsro'tsa'yeney.", + "text": "Yaneshaʼ Allohueney ñeñtey arromñatey att̀ƒo ye'ñalletyesa arr patsro e'ñe att̀ƒecma cohuen yesherb̀ƒa'yen. Ă‘am̀ƒa yechyen allpon derechos att̀ƒoch e'ñech cohueno'tsa'yeney arr patsro. Ă‘am̀ƒa allohuen att̀ƒecma yechyen alloch yoct̀ƒape' chyen cohuen ñam̀ƒa yeñotyen yeyoc̀ˆhro ñeñt ̀ƒe'ne pocte' enten acheñenesha' ñam̀ƒa ñeñt ̀ƒama pocteye' enteneto. Yeñoteñ añ poctetsa e'ñe yemo'nasheñ yep̀ƒannena ama't ora allohuen allpon acheñenesha' ñeñt ̀ƒañe patsro'tsa'yeney.", "metadata": { "languages": [ "spa", @@ -11065,7 +11065,7 @@ { "type": "NarrativeText", "element_id": "1484d1c7c562268257922f9f0522d183", - "text": "Yanomam\u00f6 K\u00f5mi th\u00eb p\u00eb r\u00eb p\u00ebripraw\u00eb r\u00eb piy\u00ebk\u00ebi, he usukuw\u00eb th\u00eb p\u00eb keprou ai th\u00eb \u00e3 r\u00ebamaih\u00e3 no \u00e3 heparohow\u00eb, totihitaw\u00eb th\u00eb p\u00eb ri\u00e3 r\u1ebd thaiwehei hami, th\u00eb p\u00eb puhi tao k\u00e3i p\u00ebrihiw\u00ebha, th\u00eb p\u00eb puhi k\u00e3i katehew\u00ebha haw\u00eb kama th\u00eb p\u00eb mashi sh\u0129ro p\u00ebrihimop\u00eb.", + "text": "Yanomamö Kõmi thĂ« pĂ« rĂ« pĂ«riprawĂ« rĂ« piyĂ«kĂ«i, he usukuwĂ« thĂ« pĂ« keprou ai thĂ« Ă£ rĂ«amaihĂ£ no Ă£ heparohowĂ«, totihitawĂ« thĂ« pĂ« riĂ£ rẽ thaiwehei hami, thĂ« pĂ« puhi tao kĂ£i pĂ«rihiwĂ«ha, thĂ« pĂ« puhi kĂ£i katehewĂ«ha hawĂ« kama thĂ« pĂ« mashi shÄ©ro pĂ«rihimopĂ«.", "metadata": { "languages": [ "sqi" @@ -11107,7 +11107,7 @@ { "type": "NarrativeText", "element_id": "6e2772e24613e482dbe3ec725643ea7a", - "text": "Yapese Gubine gidii mani gargeleg nga faileng nibapuf matt\u02bcawen nge rogon. Bay laniyan nipii e nam, ere ngauda ted matt\u02bcaawen e chaa niba chugur ngoded nimod walag dad.", + "text": "Yapese Gubine gidii mani gargeleg nga faileng nibapuf mattʼawen nge rogon. 
Bay laniyan nipii e nam, ere ngauda ted mattʼaawen e chaa niba chugur ngoded nimod walag dad.", "metadata": { "languages": [ "tgl", @@ -11129,7 +11129,7 @@ { "type": "NarrativeText", "element_id": "dd0ec8c9f26cfc60d56857c55e78705f", - "text": "Yiddish, Eastern \u05d9\u05e2\u05d3\u05e2\u05e8 \u05de\u05e2\u05e0\u05d8\u05e9 \u05f0\u05e2\u05e8\u05d8 \u05d2\u05e2\u05d1\u05f1\u05e8\u05df \u05e4\u05bf\u05e8\u05f2\u05b7 \u05d0\u05d5\u05df \u05d2\u05dc\u05f2\u05b7\u05da \u05d0\u05d9\u05df \u05db\u05bc\u05d1\u05bf\u05d5\u05d3 \u05d0\u05d5\u05df \u05e8\u05e2\u05db\u05d8. \u05d9\u05e2\u05d3\u05e2\u05e8 \u05f0\u05e2\u05e8\u05d8 \u05d1\u05d0\u05b7\u05e9\u05d0\u05b8\u05e0\u05e7\u05df \u05de\u05d9\u05d8 \u05e4\u05bf\u05d0\u05b7\u05e8\u05e9\u05d8\u05d0\u05b7\u05e0\u05d3 \u05d0\u05d5\u05df \u05d2\u05e2\u05f0\u05d9\u05e1\u05df; \u05d9\u05e2\u05d3\u05e2\u05e8 \u05d6\u05d0\u05b8\u05dc \u05d6\u05d9\u05da \u05e4\u05bf\u05d9\u05e8\u05df \u05de\u05d9\u05d8 \u05d0\u05b7 \u05e6\u05f0\u05f2\u05d8\u05df \u05d0\u05d9\u05df \u05d0\u05b7 \u05d2\u05e2\u05de\u05d9\u05d8 \u05e4\u05bf\u05d5\u05df \u05d1\u05e8\u05d5\u05d3\u05e2\u05e8\u05e9\u05d0\u05b7\u05e4\u05bf\u05d8.", + "text": "Yiddish, Eastern יעדער ×ענטש װערט געבױרן פֿרײַ ×ון ×’×œ×²Ö·× ×ין כּבֿוד ×ון רעכט. יעדער װערט ב×ַש×ָנקן ×יט פֿ×ַרשט×ַנד ×ון געװיסן; יעדער ×–×ָל ×–×™× ×¤Ö¿×™×¨×Ÿ ×יט ×Ö· צװײטן ×ין ×Ö· ×’×¢×יט פֿון ברודערש×ַפֿט.", "metadata": { "languages": [ "heb" @@ -11150,7 +11150,7 @@ { "type": "NarrativeText", "element_id": "33533cecec6c5714680925cbc9d55bb1", - "text": "Yoruba Gbogbo \u00e8n\u00ecy\u00e0n ni a b\u00ed n\u00ed \u00f2m\u00ecnira; iy\u00ec \u00e0ti \u1eb9\u0300t\u1ecd\u0301 k\u1ecd\u0300\u1ecd\u0300kan s\u00ec d\u1ecd\u0301gba. 
W\u1ecd\u0301n n\u00ed \u1eb9\u0300b\u00f9n ti l\u00e0\u00e1k\u00e0y\u00e8 \u00e0ti ti \u1eb9\u0300r\u00ed\u2010\u1ecdk\u00e0n, \u00f3 s\u00ec y\u1eb9 k\u00ed w\u1ecdn \u00f3 m\u00e1a h\u00f9w\u00e0 s\u00ed ara w\u1ecdn g\u1eb9\u0301g\u1eb9\u0301 b\u00ed \u1ecdm\u1ecd \u00ecy\u00e1.", + "text": "Yoruba Gbogbo ènìyĂ n ni a bĂ­ nĂ­ Ă²mìnira; iyì Ă ti ẹ̀€tá»̀ ká»̀€á»̀€kan sì dá»̀gba. Wá»̀n nĂ­ ẹ̀€bĂ¹n ti lĂ Ă¡kĂ yè Ă ti ti ẹ̀€rĂ­â€á»kĂ n, Ă³ sì yẹ kĂ­ wá»n Ă³ mĂ¡a hĂ¹wĂ  sĂ­ ara wá»n gẹ̀gẹ̀ bĂ­ á»mỠìyĂ¡.", "metadata": { "languages": [ "vie" @@ -11171,7 +11171,7 @@ { "type": "NarrativeText", "element_id": "263ae4a61b51cca14085f92de5a8cfa5", - "text": "Yukaghir, Northern \u041a\u04e9\u0434\u044d\u04a5 \u0442\u044d\u043d - \u043d\u044c\u0438\u0434\u0438\u0442\u044d \u0431\u0430\u043d\u0434\u044c\u044d \u043f\u0430\u0440\u0430\u051d\u0430\u0430\u043d\u044c\u044d\u0440\u044d\u04a5 \u0442\u0443\u0434\u044d \u0447\u0443\u04a5\u0434\u044d\u043d \u043d\u044c\u0438\u043b\u0434\u044c\u0438\u043b\u044d\u043a \u044d\u043d\u043d\u0443\u043b\u04a5\u0438\u043d\u044c-\u043c\u044d\u0434\u044c\u0443\u043e\u043b\u043d\u0443\u043d\u0438. \u041a\u04e9\u0434\u044d\u04a5 \u044d\u043d\u043c\u0443\u043d \u0447\u0443\u043d\u0434\u044d \u043c\u044d \u043b\u044c\u044d\u0439, \u0442\u0430\u0430\u0442\u043b\u044c\u044d\u0440 \u043b\u0443\u043a\u0443\u043d\u0434\u044c\u0438\u0438 \u043d\u044c\u0438\u043d\u044d\u043c\u0434\u044c\u0438\u0439\u0438\u043b\u043f\u044d \u0434\u0438\u0442\u044d \u044d\u043d\u043d\u0443\u0439\u0443\u043e\u043b-\u043c\u043e\u0440\u0430\u051d\u043d\u044c\u044d\u04a5\u0438.", + "text": "Yukaghir, Northern ĐÓ©Đ´Ñ̉¥ Ñ‚ÑĐ½ - Đ½ÑŒĐ¸Đ´Đ¸Ñ‚Ñ Đ±Đ°Đ½Đ´ÑŒÑ Đ¿Đ°Ñ€Đ°ÔĐ°Đ°Đ½ÑŒÑÑ€Ñ̉¥ Ñ‚ÑƒĐ´Ñ Ñ‡Ñƒ̉¥Đ´ÑĐ½ Đ½ÑŒĐ¸Đ»Đ´ÑŒĐ¸Đ»ÑĐº ÑĐ½Đ½ÑƒĐ»̉¥Đ¸Đ½ÑŒ-Đ¼ÑĐ´ÑŒÑƒĐ¾Đ»Đ½ÑƒĐ½Đ¸. 
ĐÓ©Đ´Ñ̉¥ ÑĐ½Đ¼ÑƒĐ½ Ñ‡ÑƒĐ½Đ´Ñ Đ¼Ñ Đ»ÑŒÑĐ¹, Ñ‚Đ°Đ°Ñ‚Đ»ÑŒÑÑ€ Đ»ÑƒĐºÑƒĐ½Đ´ÑŒĐ¸Đ¸ Đ½ÑŒĐ¸Đ½ÑĐ¼Đ´ÑŒĐ¸Đ¹Đ¸Đ»Đ¿Ñ Đ´Đ¸Ñ‚Ñ ÑĐ½Đ½ÑƒĐ¹ÑƒĐ¾Đ»-Đ¼Đ¾Ñ€Đ°ÔĐ½ÑŒÑ̉¥Đ¸.", "metadata": { "languages": [ "rus" @@ -11192,7 +11192,7 @@ { "type": "UncategorizedText", "element_id": "5d93ef013b9a5b75709657ba49153ed9", - "text": "Z\u00e1paro Kawiriaja kayapuina ichaukui ta nuka pucha panicha kupanimajicha cha nuka nishima ikicha kiniana panicha tamanuka kanata ikimajicha.", + "text": "ZĂ¡paro Kawiriaja kayapuina ichaukui ta nuka pucha panicha kupanimajicha cha nuka nishima ikicha kiniana panicha tamanuka kanata ikimajicha.", "metadata": { "languages": [ "swa" @@ -11213,7 +11213,7 @@ { "type": "NarrativeText", "element_id": "7d1772a7cde57cf4033fb6ecd38d611b", - "text": "Zapotec, G\u00fcil\u00e1 Ra'ta ra bu:unny ra:aaly liebr c\u00ebhnn te'bloh deree'ch c\u00ebhnn dignidaa. Ra:alyne:erih gahll ri:e:eny c\u00ebhnn saalyb, chiru' na:a pahr ga:annza'crih loh sa'rih.", + "text": "Zapotec, GĂ¼ilĂ¡ Ra'ta ra bu:unny ra:aaly liebr cĂ«hnn te'bloh deree'ch cĂ«hnn dignidaa. Ra:alyne:erih gahll ri:e:eny cĂ«hnn saalyb, chiru' na:a pahr ga:annza'crih loh sa'rih.", "metadata": { "languages": [ "cym", @@ -11237,7 +11237,7 @@ { "type": "NarrativeText", "element_id": "efe41cb241efcd0774cf2f9bd328b778", - "text": "Zapotec, Miahuatl\u00e1n Diti mien ndied xa yent kuan nkie xa nak rieti xa diba xa rola.", + "text": "Zapotec, MiahuatlĂ¡n Diti mien ndied xa yent kuan nkie xa nak rieti xa diba xa rola.", "metadata": { "languages": [ "afr", @@ -11262,7 +11262,7 @@ { "type": "NarrativeText", "element_id": "b1bf6eb1c62dbb55df63d0dcd8595d2a", - "text": "Zarma Fayanka kulu no si adamayzey nda care game ra i burcintara nda i alhakey cediraw kayandiya\u014b fondo ra da i na i hay. I gonda lakkal, nda laasaabu, ka\u014b ga na\u014b i ma baafunay \u0272ayzetaray haali ra.", + "text": "Zarma Fayanka kulu no si adamayzey nda care game ra i burcintara nda i alhakey cediraw kayandiyaÅ‹ fondo ra da i na i hay. 
I gonda lakkal, nda laasaabu, kaÅ‹ ga naÅ‹ i ma baafunay ɲayzetaray haali ra.", "metadata": { "languages": [ "som" diff --git a/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.json b/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.json index 55c78ea5c0..b70eb15b34 100644 --- a/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.json @@ -222,7 +222,7 @@ { "type": "ListItem", "element_id": "1a174e104169cb41cf69393a9cdc0872", - "text": "4. Team science and scientific communication: \u201csoft\u201d skills, like the ability to work well on teams and communicate effectively in both verbal and written venues, may be as important as the more technical skills typically associated with data science.", + "text": "4. Team science and scientific communication: “soft†skills, like the ability to work well on teams and communicate effectively in both verbal and written venues, may be as important as the more technical skills typically associated with data science.", "metadata": { "languages": [ "eng" @@ -310,7 +310,7 @@ { "type": "NarrativeText", "element_id": "690b79e1d449426afb07ed40866a6bb6", - "text": "Training a biomedical data science (BDS) workforce is a central theme in NLM\u2019s Strategic Plan for the coming decade. That commitment is echoed in the NIH-wide Big Data to Knowledge (BD2K) initiative, which invested $61 million between FY2014 and FY2017 in training programs for the development and use of biomedical big data science methods and tools. In line with", + "text": "Training a biomedical data science (BDS) workforce is a central theme in NLM’s Strategic Plan for the coming decade. 
That commitment is echoed in the NIH-wide Big Data to Knowledge (BD2K) initiative, which invested $61 million between FY2014 and FY2017 in training programs for the development and use of biomedical big data science methods and tools. In line with", "metadata": { "languages": [ "eng" diff --git a/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/biomed-api/65/11/main.PMC6312790.pdf.json b/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/biomed-api/65/11/main.PMC6312790.pdf.json deleted file mode 100644 index fdb1b1ff86..0000000000 --- a/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/biomed-api/65/11/main.PMC6312790.pdf.json +++ /dev/null @@ -1,4195 +0,0 @@ -[ - { - "type": "Header", - "element_id": "782cf07be8b3ab8f05188e479edb7f61", - "text": "Data in Brief 22 (2019) 451\u2013457", - "metadata": { - "languages": [ - "eng" - ], - "links": [ - { - "text": "Data in Brief 22 ( 2019 ) 451 \u2013 457", - "url": "https://doi.org/10.1016/j.dib.2018.11.134", - "start_index": 0 - } - ], - "page_number": 1, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "c3e4ba0411db419c34f27ae55762b1c1", - "text": "Contents lists available at ScienceDirect", - "metadata": { - "languages": [ - "eng" - ], - "links": [ - { - "text": "ScienceDirect", - "url": "www.sciencedirect.com/science/journal/23523409", - "start_index": 28 - } - ], - "page_number": 1, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": 
"a983d2e46059a8605ebb1077994e6fa3", - "text": "Data in Brief", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 1, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "354cd2b49c1a201a5e91177a17f9b2a3", - "text": "journal homepage: www.elsevier.com/locate/dib", - "metadata": { - "languages": [ - "eng" - ], - "links": [ - { - "text": "www . elsevier . com / locate / dib", - "url": "www.elsevier.com/locate/dib", - "start_index": 18 - } - ], - "page_number": 1, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "c1c1eeb08eba1d16beccf2034fc87bc8", - "text": "Data Article", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 1, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "f1b37e8056f39eb82901f43f4fe0a239", - "text": "Data on environmental sustainable corrosion inhibitor for stainless steel in aggressive environment", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 1, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - 
"element_id": "1a4fcf35fcd5d2be9f843f0fb93f3d3e", - "text": "Omotayo Sanni n, Abimbola Patricia I. Popoola", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 1, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "418af174cd1457a5db9b88c3c4a33ce3", - "text": "Department of Chemical, Metallurgical and Materials Engineering, Tshwane University of Technology, Pretoria, South Africa", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 1, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "698747e1178c3e0ec15b2eb293e58565", - "text": "a r t i c l e i n f o", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 1, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "19e64efbeabe463d8d8a6f577d4c6be7", - "text": "a b s t r a c t", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 1, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "8e23ddc47eb2833b067fe61c9c413955", - 
"text": "Article history: Received 31 August 2018 Received in revised form 17 November 2018 Accepted 27 November 2018 Available online 30 November 2018", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 1, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "2b0eb4fb8b32b5944bcf711f448ef19a", - "text": "Keywords: Corrosion Stainless steel Inhibitor Sulphuric acid", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 1, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "8930d3f5d6929e72cbe35523538fc807", - "text": "This data article contains data related to the research article entitled \u201cenhanced corrosion resistance of stainless steel Type 316 in sulphuric acid solution using eco-friendly waste product\u201d (Sanni et al., 2018). In this data article, a comprehensive effect of waste product and optimized process parameter of the inhibitor in 0.5 M H2SO4 solution was presented using weight loss and potentiody- the inhibitor namic polarization techniques. The presence of (egg shell powder) in\ufb02uenced corrosion resistance of stainless steel. 
Inhibition ef\ufb01ciency value of 94.74% was recorded as a result of inhibition of the steel by the ionized molecules of the inhibiting compound of the egg shell powder in\ufb02uencing the redox mechan- ism reactions responsible for corrosion and surface deterioration.", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 1, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "aa8a123d8b7bf47bd15c389a6685d405", - "text": "& 2018 Published by Elsevier Inc. This is an open access article under the CC BY-NC-ND license (http://creativecommons.org/licenses/by-nc-nd/4.0/).", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 1, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "0757794849e2cca941b30b4e1e82cd4b", - "text": "Speci\ufb01cation table", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 1, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "bab7909d0362404432e0cc4f90049b3a", - "text": "Subject area More speci\ufb01c subject area Surface science and engineering Type of data", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 1, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": 
"/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "227863137634b2d549494fac759af715", - "text": "Materials engineering", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 1, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "3f88b0d8c42101ff25aeb213051cf81f", - "text": "Table and \ufb01gure", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 1, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "b6664d832b0c853cff911e63ce738371", - "text": "n Corresponding author. tayo.sanni@yahoo.com; SanniO@tut.ac.za", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 1, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "9b655d4b82dc2b1d75b9c21c7b0fc7f8", - "text": "E-mail address: tayo.sanni@yahoo.com (O. Sanni).", - "metadata": { - "languages": [ - "eng" - ], - "links": [ - { - "text": "tayo . sanni @ yahoo . 
com", - "url": "mailto:tayo.sanni@yahoo.com", - "start_index": 16 - } - ], - "page_number": 1, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "96e9fe2b2775d750918a6f92f0d3ad95", - "text": "https://doi.org/10.1016/j.dib.2018.11.134 2352-3409/& 2018 Published by Elsevier Inc. This is an open access article under the CC BY-NC-ND license (http://creativecommons.org/licenses/by-nc-nd/4.0/).", - "metadata": { - "languages": [ - "eng" - ], - "links": [ - { - "text": "https", - "url": "https://doi.org/10.1016/j.dib.2018.11.134", - "start_index": 0 - }, - { - "text": "https", - "url": "https://doi.org/10.1016/j.dib.2018.11.134", - "start_index": 0 - }, - { - "text": "https :// doi . org / 10 . 1016 / j . dib . 2018 . 11 . 134", - "url": "https://doi.org/10.1016/j.dib.2018.11.134", - "start_index": 0 - } - ], - "page_number": 1, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "757b62f5ce8ceee7150b7ce16ea16c93", - "text": "452", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Header", - "element_id": "fb14c87d94f1676010e46b776d688612", - "text": "O. Sanni, A.P.I. 
Popoola / Data in Brief 22 (2019) 451\u2013457", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "72155e648a45896b081904929fc91cc6", - "text": "How data were acquired", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "a577cc1dfaa481812a9cff86c06d9835", - "text": "Data format Experimental factors", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "9b9d298aef0e8b4a83bca09152a07128", - "text": "Experimental features Data source location", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "6f850529ced475435229c193a8ee7938", - "text": "Accessibility Related research article", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 2, - "filetype": "application/pdf", - 
"data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "c1c91f3ea75c102b6ed42b94530cbafe", - "text": "The cleaned and weighed specimen was suspended in beakers con- taining 0.5 M H2SO4 solution of different concentrations of egg shell powder. The pre-weighed stainless steel samples were retrieved from the test solutions after every 24 h, cleaned appropriately, dried and reweighed. Raw, analyzed The difference between the weight at a given time and the initial weight of the specimen was taken as the weight loss, which was used to calculate the corrosion rate and inhibition ef\ufb01ciency. Inhibitor concentration, exposure time Department of Chemical, Metallurgical and Materials Engineering, Tshwane University of Technology, Pretoria, South Africa Data are available within this article O. Sanni, A. P. I. Popoola, and O. S. I. 
Fayomi, Enhanced corrosion resistance of stainless steel type 316 in sulphuric acid solution using eco-friendly waste product, Results in Physics, 9 (2018) 225\u2013230.", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "a5dd74871d789945bd8a9c352d4817fb", - "text": "Value of the data", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "9bed69cd8287b2725bd845ca61ebb3cd", - "text": "(cid:1) Data presented here provide optimum conditions of waste material as inhibitor for stainless steel Type 316 in 0.5 M H2SO4 medium. 
The given data describe the inhibitive performance of eco-friendly egg shell powder on austenitic stainless steel Type 316 corrosion in sulphuric acid environment.", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "2ac3a042a8c89fd81718d1fda7ae576b", - "text": "(cid:1) The data obtained for the inhibition of waste product (egg shell powder) on stainless steel Type 316 can be used as basis in determining the inhibitive performance of the same inhibitor in other environments.", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "4962aa80bf0712155f4b781df06b4f1a", - "text": "(cid:1) The data can be used to examine the relationship between the process variable as it affect the", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "3b419c2d586d0eaf047f939c9e41b30f", - "text": "nature of inhibition of metals.", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": 
"/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "ListItem", - "element_id": "f742be9cbb2d0697a88a9f749bf3185c", - "text": "1. Data", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "28d5b195997810a34c2aa96c9f357de2", - "text": "The results of the experiment are presented in this session. The results obtained from weight loss method for stainless steel Type 316 immersed in 0.5 M H2SO4 solution in the absence and presence of different concentrations of egg shell powder (ES) are presented in Figs.1\u20133 respectively. 
It can be seen clearly from these Figures that the ef\ufb01ciency of egg shell powder increase with the inhibitor con- centration, The increase in its ef\ufb01ciency could be as a result of increase in the constituent molecule", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "f3a850e6bd8c0557408ad59167f5461e", - "text": ") g m", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "3cb4a395dab98ecdc71ad325411cf150", - "text": "(", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "2b2ff92863f302ae630dc410b945333a", - "text": "s s o", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "0da3f5fd0fd07fc182d371760d9da3c0", - "text": "l", - "metadata": { - "languages": [ - "eng" - 
], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "f929b69f05a08ec2b940c9b531740326", - "text": "t h g e W", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "f0fbafddf553bdea61ac009ad080f1bc", - "text": "i", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "2b3d55b9ce69bcd15d67071cf0d11814", - "text": "30", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "9673d82062115826d94732418d566ba2", - "text": "20", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { 
- "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "b0304d4851460afe7c95d41feb260093", - "text": "10g 8g 6g 4g 2g Control", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "7f646e71d7bc0398e9917eec2c29b9ef", - "text": "10", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "12a72cb263173964cf41736e5d3707b2", - "text": "48", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "673fe20c15c1210d134b56828c5a8216", - "text": "96", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "c552ee9963f985fd6b3498e2cf2c6230", - "text": "144", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 2, - "filetype": 
"application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "16e471ece5a33bfb80b79b89aed6c731", - "text": "192", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "829e97853a2843ff6a8f1cfd3a6c74db", - "text": "Exposure Time (Hours)", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "b6f97c1cdf0e9f1abebac577d4cf4b2a", - "text": "Fig. 1. 
Weight loss versus exposure time for stainless steel presence of ES.", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "09a5818257d4c970dc57191f38e1c1b0", - "text": "immersed in 0.5 M H2SO4 solution in the absence and", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Header", - "element_id": "828e27fb21b2ca5e25ebdc5f0693ed7d", - "text": "O. Sanni, A.P.I. Popoola / Data in Brief 22 (2019) 451\u2013457", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "81cbf4e59dfe4444a94794a547e9063c", - "text": "2.7", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "f1b0da24500b1f98c9debd55a2482b7f", - "text": ") r a e y / m m", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 
3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "9efd31c777cb3a30d24545982e71644e", - "text": "( e t a r n o s o r r o C", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "a535b571914bff036ee8d7b941a9e14c", - "text": "i", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "6445348d57f8715d980bbf266f6cc4b3", - "text": "1.8", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "dff5188d0e9db124ca45b71e4123404f", - "text": "0.9", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": 
[ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "2e8665917db0a5ca56fee4e99f113c05", - "text": "10g 8g 6g 4g 2g Control", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "9b38508e1e3ddd8056482945216e1a28", - "text": "24", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "4638ab00ad25c2044ed18ba57b766d7d", - "text": "48", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "252b95fc79d992358f5e7e4423febe14", - "text": "72", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "963002fc37d4568e01e1361b0f053b53", - "text": "96", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": 
"application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "292f8084988c4f4000fcd5bd2205c36a", - "text": "120", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "5c317addf6947e11fba4c4f584f095c1", - "text": "144", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "95649afacb76442d050ed4534b80c4cc", - "text": "168", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "dad2b03f8f9d732efa19ab6a421e971d", - "text": "192", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 
- } - ] - } - } - }, - { - "type": "Title", - "element_id": "8f500e748d82811ccbb3b715e1932be6", - "text": "Exposure time", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "03f95f2413bbe205cdc6975b1b98ecbe", - "text": "Fig. 2. Corrosion rate versus exposure time for stainless steel immersed in 0.5 M H2SO4 solution in the absence and presence of ES.", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "3c32d78e905ba61d1ae55e0b2ebd5946", - "text": "100", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "78e1f4ff627e16f8159327279bdfcce0", - "text": "90", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "748c1e92cccf809f3776382792e93895", - 
"text": ")", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "feccbab23ec407ef6cc22348a78244d3", - "text": "%", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "03ac492dccd89cf13a9d40ada0e543e1", - "text": "(", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "2a02254b1d03abddd3537dc16c56a6fb", - "text": "y c n e c i f f", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "67504491ab6c6c3603a75d246c50f54d", - "text": "i", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": 
"/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "6a2c597e6f8cfa0954a022873f9dcf6f", - "text": "E n o i t i b h n I", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "f84aae3bf521f4166f63e87b5ef4f035", - "text": "i", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "b76e96beb931beaef6e3660f5d415c3d", - "text": "80", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "0309a67bcfd5df32328af8c537c708e6", - "text": "70", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": 
"33add4c83afdffa0745406aea3c75b49", - "text": "60", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "e180205da17abbe716978d5c4aa4dd03", - "text": "50", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "18f47de0e9dbec383a50a39027960bc6", - "text": "40", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "89ac5d03f7c6d4fa92bda587be577ab8", - "text": "30", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "93a1080514211ba59a1850d5600c261c", - "text": "2g 4g 6g 8g 10g", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": 
"/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "a66d7b20adfb12a1efd70da1d5b65375", - "text": "20", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "82bf75b4e447974f22e48c9a450c45d5", - "text": "10", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "d460a5ac4c345529812f84dabf681d9f", - "text": "0", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "a6282e95f41f8cb5061e0618a02dc09a", - "text": "20", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": 
"44e027245f6667d8282ec4728ad9c2dd", - "text": "40", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "935862a8bb1abed65afc07fc8d1da166", - "text": "60", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "fada482b9f03a3eda9be2ad92169bc9a", - "text": "80", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "3179f53a093e5bb8064b777a8125c88e", - "text": "100", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "2053a3a5b1e12481504583f7f72979ff", - "text": "120", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": 
"/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "b81dbb6336d2b992478316f8514e94b6", - "text": "140", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "d4eb5e157598e6fa21a6b5b4254e9b5e", - "text": "160", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "f082a93dce4872ddd5ecc97c3a9341fb", - "text": "180", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "4c19db10f909537bf29da9829ab6f81b", - "text": "Exposure Time (Hours)", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - 
"element_id": "c566a56fa9e9ad6b97408310e357b079", - "text": "Fig. 3. Inhibition ef\ufb01ciency versus exposure time for stainless steel immersed in 0.5 M H2SO4 solution in the presence of ES.", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "21233d8e249dd8180c7f2c99a468f337", - "text": "number of inhibitor adsorbed on the surface of stainless steel at higher concentration, in order for the active sites of the stainless steel to be protected with the inhibitor molecules. Cathodic and anodic polarized potential are measured in the presence and absence of ES. Fig. 4 shows the cathodic and anodic polarization curves for stainless steel in 0.5 M H2SO4 solution at different ES concentrations. The electrochemical variables such as polarization resistance (PR), corrosion potential (Ecorr), cor- rosion current (icorr), anodic Tafel constant (ba), cathodic Tafel constant (bc) and corrosion rate (mm/ year) values are presented in Table 1. From the polarization curves and electrochemical parameter, icorr value decreased with the addition of inhibitor in 0.5 M H2SO4. Conversely, the icorr further decrease with an increase in inhibitor concentration indicating that the inhibition effects increase with an increase in the egg shell concentration. The process of egg shell inhibition could be attributed to the formation of egg shell powder adsorbed on stainless steel surface protecting corrosion of stainless steel in H2SO4 medium. The likely mechanism is the egg shell adsorption on stainless steel surface through the heteroatoms electron pair and the conjugated systems in egg shell molecular structure as shown in Fig. 1. 
When the concentration of inhibitor was increased from 2 to 10 g, the corrosion rate values drastically decreased this result show that waste egg shell powder is an effective corrosion inhibitor for stainless steel in H2SO4 solution. The shift in corrosion potential of stainless steel from Tafel curves and electrochemical data indicate that the inhibitor is a mixed-type corrosion inhibitor.", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "443e25a2b54b8b2a43f8029e07f784b3", - "text": "453", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "33b112b0d8640ab4f13b22a2ee714086", - "text": "454", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Header", - "element_id": "e87ca7b3cd075aaa0de8030768aca87c", - "text": "O. Sanni, A.P.I. 
Popoola / Data in Brief 22 (2019) 451\u2013457", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "fd8a0feb5e755ece5d9abceb844649ff", - "text": "Fig. 4. Anodic and cathodic polarization curve of stainless steel in 0.5 M H2SO4 solution in the presence and absence of ES.", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "598ed0a58406fc921332297f345b177a", - "text": "Table 1 Potentiodynamic polarization data for stainless steel in the absence and presence of ES in 0.5 M H2SO4 solution.", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "9620a738189422654c5456fa16e507e7", - "text": "Inhibitor concentration (g)", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": 
"3acf3c88a28cad76984ac041a8f5984c", - "text": "bc (V/dec)", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "da72962f658cee29281fa0e11a548813", - "text": "ba (V/dec)", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "63a8b6b360c7a61ef88ad6c0b3d6581d", - "text": "Ecorr (V)", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "616ac8133f9b985812240add98badf5a", - "text": "icorr (A/cm2)", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "5ef6c0b5c5c72f20a694c6bce97ed131", - "text": "Polarization resistance (\u03a9)", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": 
"/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "6eff2d13b846a74ce08e348c7151dd1c", - "text": "Corrosion rate (mm/year)", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "4a00cd3d6d5f9b71b105586a17125069", - "text": "0 2 4 6 8 10", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "812204070320132126dcfec00abb07f7", - "text": "0.0335 1.9460 0.0163 0.3233 0.1240 0.0382", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "08c96eb52fe4877d6a26d862f8919d35", - "text": "0.0409 0.0596 0.2369 0.0540 0.0556 0.0086", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - 
"permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "a0aa9bf2a48ed1dff882a16cb320c616", - "text": "(cid:3)0.9393 (cid:3)0.8276 (cid:3)0.8825 (cid:3)0.8027 (cid:3)0.5896 (cid:3)0.5356", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "a725c31d8b684d978174d4dc11d29106", - "text": "0.0003 0.0002 0.0001 5.39E-05 5.46E-05 1.24E-05", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "f66516a9a89cb0ab07ccf9e15086f394", - "text": "24.0910 121.440 42.121 373.180 305.650 246.080", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "a6663f53eba15d4c5596b1f8ec4208fd", - "text": "2.8163 1.5054 0.9476 0.4318 0.3772 0.0919", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - 
"mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "f5db77e611b74b7298f1b48a82ffc7be", - "text": "The plot of inhibitor concentration over degree of surface coverage versus inhibitor concentration gives a straight line as shown in Fig. 5. The strong correlation reveals that egg shell adsorption on stainless surface in 0.5 M H2SO4 follow Langmuir adsorption isotherm. Figs. 6\u20138 show the SEM/EDX surface morphology analysis of stainless steel. Figs. 7 and 8 are the SEM/EDX images of the stainless steel specimens without and with inhibitor after weight loss experiment in sulphuric acid medium. The stainless steel surface corrosion product layer in the absence of inhibitor was porous and as a result gives no corrosion protection. With the presence of ES, corrosion damage was minimized, with an evidence of ES present on the metal surface as shown in Fig. 8.", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "e4e5f97ab5b56767ed489d7cd3ee04f7", - "text": "12", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "afc0a737ef1e5ffa9d6b72bb32fef683", - "text": "C/0", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": 
"/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "d9a38658d857c1141618ad9115dc48b4", - "text": "10", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "2d046240fd1a0ff3420926f0a54e0aaa", - "text": "8", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "4c136188f1e2e974ec1003968916824a", - "text": "0 / C", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "594366da1ff6e7a343ec1666c5852389", - "text": "6", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": 
"d84c13ba166bd29d042db10acba6d243", - "text": "4", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "d4210b5ce6f99e242d8c1aa586691286", - "text": "2", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "7afb08e1cc308afebdc038fc7e4595ed", - "text": "2", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "696d24804069bc593dc624bf7ba904e2", - "text": "4", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "ef054383c29789c2743d93a6189f7f47", - "text": "6", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": 
"/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "ae2f6fc244a6aa053403e38912fdc56a", - "text": "8", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "33c153482d9c925a35781bd5c9697648", - "text": "10", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "8f325f6eb1678922e83e32746b981b80", - "text": "Concentration (g)", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "9d46c2166a49c9e3a75ed98cb20ce13f", - "text": "Fig. 5. 
Langmuir adsorption isotherm of ES.", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Header", - "element_id": "9d639b03d26ec1872a4e91ac99031fdf", - "text": "O. Sanni, A.P.I. Popoola / Data in Brief 22 (2019) 451\u2013457", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 5, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "cfea47dcbf32f3d8597e777afa74d20e", - "text": "Fig. 6. SEM/EDX image of as-received stainless steel.", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 5, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "a1e6c9bab7935444a7491a47091be10c", - "text": "Fig. 7. SEM/EDX image of stainless steel immersed in 0.5 M H2SO4 solution without inhibitor.", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 5, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "49e093091da774c567151e5147c70027", - "text": "Fig. 8. 
SEM/EDX image of stainless steel immersed in 0.5 M H2SO4 solution with the presence of inhibitor.", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 5, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "8ac2e9f97dc89f9d9bac5baec281f7f2", - "text": "455", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 5, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "e303e27893be099ef5fd03235efee7fe", - "text": "456", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 6, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Header", - "element_id": "91c8bf5283b45a71164a103f496f93c1", - "text": "O. Sanni, A.P.I. Popoola / Data in Brief 22 (2019) 451\u2013457", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 6, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "ListItem", - "element_id": "bffefa92b06bc6009f81965d3dadc0ce", - "text": "2. 
Experimental design, materials and methods", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 6, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "484707d26d81d85df99f322c1bbb8ca3", - "text": "2.1. Material", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 6, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "79d10fe9600d8d3428b5df86faa7c099", - "text": "Austenitic stainless steel Type 316 was used in this study with chemical composition reported in [1,2]. The chemicals used were of annular grade. The inhibitor concentrations are in the range of 2, 4, 6, 8 and 10 g [3\u20135]. The structural formula of egg shell powder is shown in Fig. 9.", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 6, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "b6bd160c80816ff7b2d8a36ccfc67568", - "text": "Fig. 9. 
Chemical structure of egg shell powder.", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 6, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "aeafe864b565b167f053a348390b3eff", - "text": "2.2. Weight loss method", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 6, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "0e51f945cacb5ec184a3613487b6fefb", - "text": "This physical measurement was carried out in order to provide direct result on how the corrosive environment affects the test sample. The cleaned and weighed specimen was suspended in beakers with the aid of glass hooks and rods with the test solution of ES at different concentration (2, 4, 6, 8 and 10 g). The pre-weighed specimen was retrieved from the test solution after every 24 h, cleaned, dried and reweighed. 
The difference between the weight at a given time and the initial weight of the specimen was taken as the weight loss which was used to calculate corrosion rate and inhibition ef\ufb01ciency.", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 6, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "fed48b9de93d4324223aa5fbdfe2f359", - "text": "The corrosion rate (CR) was calculated using Eq. (1) [1\u20135]", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 6, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "2c4a913c3a4b8bccd9c7003f25ae25af", - "text": "(cid:1) \u00de \u00bc 87:6W DAT", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 6, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "902d0aabf523c467c200f5203957e606", - "text": "(cid:3)", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 6, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": 
"44d54b6fb44ac7afc9f40a0e7a5fcde3", - "text": "Corrosion rate CR\u00f0", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 6, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "7459b20ea68d65b7a967500f22223507", - "text": "where: W is weight loss in mg, A is specimen surface area, T is immersion period in hours and D is the specimen density. From the corrosion rate, the surface coverage (\u03b8) and inhibition ef\ufb01ciencies (IE %) were determined using Eqs. (2) and (3) respectively", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 6, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "543caecd15c161082076a174ea946782", - "text": "\u03b8 \u00bc CRo(cid:3)CR", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 6, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "b2cc1eda5ffbccf6416235c44181538c", - "text": "CRo", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 6, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - 
"type": "Title", - "element_id": "59a609931ac8f9c55855113bfae6655e", - "text": "IE \u00f0%\u00de \u00bc CRo(cid:3)CR", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 6, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "3bf244c1b2eb32875b292a28c130aba4", - "text": "CRo", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 6, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "2c6d5581a35c83236153f78c5b53cb60", - "text": "x", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 6, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "ca4aeca8c2a7e6b9df923db4a5902289", - "text": "100 1", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 6, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "a47048cff18528a9a4838728a55e526a", - "text": "where: CRo and CR are the corrosion rate in absence and presence of inhibitor respectively.", - "metadata": { - "languages": [ - "eng" - ], - 
"page_number": 6, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "6aabbfd8e92223470a6c9184a84857c0", - "text": "2.3. Potentiodynamic polarization method", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 6, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "c653c9cca5ebdd3089b705f279316500", - "text": "The potentiodynamic polarization method was performed on the prepared test samples immersed in 0.5 M H2SO4 solution in the presence and absence of different ES concentrations. A three electrode system was used; stainless steel Type 316 plate as working electrode with an exposed area of 1.0 cm2, platinum rod as counter electrode and silver chloride electrode as reference electrode. The electrode was polished, degreased in acetone and thoroughly rinsed with distilled water before the experiment. Current density against applied potential was plotted. 
The slope of the linear part in anodic and cathodic plots gives anodic and cathodic constants according to the Stern\u2013Geary equation, and the", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 6, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "b1cdefa47658616bf79766f8fc353f7c", - "text": "\u00f01\u00de", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 6, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "a1a035eeaa7c25a2b543757f4cc7d0fb", - "text": "\u00f02\u00de", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 6, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "74d17735c911d69b6d10e05d0c9d79d6", - "text": "\u00f03\u00de", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 6, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Header", - "element_id": "e40c3ee561b10ca5b7a76900c8d5b263", - "text": "O. Sanni, A.P.I. 
Popoola / Data in Brief 22 (2019) 451\u2013457", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 7, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "ac11629522e563b6a0a8f261ab4b94e0", - "text": "steps of the linear polarization plot are substituted to get corrosion current. Nova software was used with linear polarization resistance (LPR) and the current was set to 10 mA (maximum) and 10 nA (minimum). LSV staircase parameter start potential (cid:3)1.5 v, step potential 0.001 m/s and stop potential of \u00fe1.5 v set was used in this study.", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 7, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "2461424bae61c8cfad1cd33a949843f0", - "text": "Acknowledgements", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 7, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "2d8a74bbba4ad3bb13afc8a98daec91d", - "text": "This work was supported by the National Research Foundation of South Africa and the Tshwane", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 7, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": 
"/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "154e2a7bdebd1347eccb08f349284130", - "text": "University of Technology Pretoria South Africa.", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 7, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "41a46b0a6852a31b1e51cf65a4ecf87d", - "text": "Transparency document. Supporting information", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 7, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "c5635281e7e879dd338b99ae84f94056", - "text": "Transparency document associated with this article can be found in the online version at https://doi.", - "metadata": { - "languages": [ - "eng" - ], - "links": [ - { - "text": "https :// doi .", - "url": "https://doi.org/10.1016/j.dib.2018.11.134", - "start_index": 89 - } - ], - "page_number": 7, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "ee62928948d5d7b5e13edf65d917dc63", - "text": "org/10.1016/j.dib.2018.11.134.", - "metadata": { - "languages": [ - "eng" - ], - "links": [ - { - 
"text": "org / 10 . 1016 / j . dib . 2018 . 11 . 134", - "url": "https://doi.org/10.1016/j.dib.2018.11.134", - "start_index": 0 - } - ], - "page_number": 7, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "dbe83d8d2b6784a17d8faae3633b97f9", - "text": "References", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 7, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "d08513d888e4133fda75841dd05273d9", - "text": "[1] O. Sanni, A.P.I. Popoola, O.S.I. Fayomi, Enhanced corrosion resistance of stainless steel type 316 in sulphuric acid solution", - "metadata": { - "languages": [ - "eng" - ], - "links": [ - { - "text": "O . Sanni , A . P . I . Popoola , O . S . I . Fayomi , Enhanced corrosion resistance of stainless steel type 316 in sulphuric acid solution", - "url": "http://refhub.elsevier.com/S2352-3409(18)31527-0/sbref1", - "start_index": 4 - } - ], - "page_number": 7, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "29736d79aeb1e5fc195876dbf12f1c57", - "text": "using eco-friendly waste product, Results Phys. 9 (2018) 225\u2013230.", - "metadata": { - "languages": [ - "eng" - ], - "links": [ - { - "text": "using eco - friendly waste product , Results Phys . 
9 ( 2018 ) 225 \u2013", - "url": "http://refhub.elsevier.com/S2352-3409(18)31527-0/sbref1", - "start_index": 0 - }, - { - "text": "using eco - friendly waste product , Results Phys . 9 ( 2018 ) 225 \u2013 230 .", - "url": "http://refhub.elsevier.com/S2352-3409(18)31527-0/sbref1", - "start_index": 0 - } - ], - "page_number": 7, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "ca40f2c0d5a95e8cddab1c3b76f95e9e", - "text": "[2] O. Sanni, A.P.I. Popoola, A. Kolesnikov, Constitutive modeling for prediction of optimal process parameters in corrosion", - "metadata": { - "languages": [ - "eng" - ], - "links": [ - { - "text": "O . Sanni , A . P . I . Popoola , A . Kolesnikov , Constitutive modeling for prediction of optimal process parameters in corrosion", - "url": "http://refhub.elsevier.com/S2352-3409(18)31527-0/sbref2", - "start_index": 4 - } - ], - "page_number": 7, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "e42cb45853ffd3e2c81095a126918c6c", - "text": "inhibition of austenitic stainless steel (Type 316)/acidic medium, Mater. Res. Express. 5 (10) (2018) 1\u201315.", - "metadata": { - "languages": [ - "eng" - ], - "links": [ - { - "text": "inhibition of austenitic stainless steel ( Type 316 )/ acidic medium , Mater . Res . Express . 
5 ( 10 ) ( 2018 ) 1 \u2013", - "url": "http://refhub.elsevier.com/S2352-3409(18)31527-0/sbref2", - "start_index": 0 - }, - { - "text": "inhibition of austenitic stainless steel ( Type 316 )/ acidic medium , Mater . Res . Express . 5 ( 10 ) ( 2018 ) 1 \u2013 15 .", - "url": "http://refhub.elsevier.com/S2352-3409(18)31527-0/sbref2", - "start_index": 0 - } - ], - "page_number": 7, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "610ae41b07604b353631457b9a4ad632", - "text": "[3] O. Sanni, A.P.I. Popoola, O.S.I. Fayomi, The inhibitive study of egg shell powder on UNS N08904 austenitic stainless steel", - "metadata": { - "languages": [ - "eng" - ], - "links": [ - { - "text": "O . Sanni , A . P . I . Popoola , O . S . I . Fayomi , The inhibitive study of egg shell powder on UNS N08904 austenitic stainless steel", - "url": "http://refhub.elsevier.com/S2352-3409(18)31527-0/sbref3", - "start_index": 4 - } - ], - "page_number": 7, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "ae14702f67ee1c5d2e5316e8344a6971", - "text": "corrosion in chloride solution, Def. Technol. 14 (2018) 463\u2013468.", - "metadata": { - "languages": [ - "eng" - ], - "links": [ - { - "text": "corrosion in chloride solution , Def . Technol . 14 ( 2018 ) 463 \u2013", - "url": "http://refhub.elsevier.com/S2352-3409(18)31527-0/sbref3", - "start_index": 0 - }, - { - "text": "corrosion in chloride solution , Def . Technol . 
14 ( 2018 ) 463 \u2013 468 .", - "url": "http://refhub.elsevier.com/S2352-3409(18)31527-0/sbref3", - "start_index": 0 - } - ], - "page_number": 7, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "d1c8e3e15192f1bdcda9cf8e38a5573f", - "text": "[4] O. Sanni, A.P.I. Popoola, O.S.I. Fayomi, C.A. Loto, A comparative study of inhibitive effect of waste product on stainless steel corrosion in sodium chloride/sulfuric acid environments, Metallogr. Microstruct. Anal. (2018) 1\u201317. https://doi.org/10.1007/ s13632-018-0495-5.", - "metadata": { - "languages": [ - "eng" - ], - "links": [ - { - "text": "https", - "url": "https://doi.org/10.1007/s13632-018-0495-5", - "start_index": 233 - }, - { - "text": "https", - "url": "https://doi.org/10.1007/s13632-018-0495-5", - "start_index": 233 - }, - { - "text": "https :// doi . org / 10 . 1007 /", - "url": "https://doi.org/10.1007/s13632-018-0495-5", - "start_index": 233 - }, - { - "text": "s13632 - 018 - 0495 - 5", - "url": "https://doi.org/10.1007/s13632-018-0495-5", - "start_index": 258 - } - ], - "page_number": 7, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "3827d49ec98a215986f78d1df2ae2d33", - "text": "[5] O. Sanni, A.P.I. Popoola, O.S.I. Fayomi, Inhibition of engineering material in sulphuric acid solution using waste product, Contributed Papers from Materials Science and Technology (MS&T18), 2018. 
\u2329https://doi.org/10.7449/2018/MST_2018_254_261\u232a.", - "metadata": { - "languages": [ - "eng" - ], - "links": [ - { - "text": "https :// doi . org / 10 . 7449 / 2018 / MST _ 2018 _ 254 _ 261", - "url": "https://doi.org/10.7449/2018/MST_2018_254_261", - "start_index": 202 - } - ], - "page_number": 7, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "7fbcd3b873966a649efd837300e0c576", - "text": "457", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 7, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - } -] \ No newline at end of file diff --git a/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/biomed-api/75/29/main.PMC6312793.pdf.json b/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/biomed-api/75/29/main.PMC6312793.pdf.json deleted file mode 100644 index 908e9e125a..0000000000 --- a/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/biomed-api/75/29/main.PMC6312793.pdf.json +++ /dev/null @@ -1,2514 +0,0 @@ -[ - { - "type": "Header", - "element_id": "d25e5f46b5be5f4c8a6573d0688dae93", - "text": "Data in Brief 22 (2019) 484\u2013487", - "metadata": { - "languages": [ - "eng" - ], - "links": [ - { - "text": "Data in Brief 22 ( 2019 ) 484 \u2013 487", - "url": "https://doi.org/10.1016/j.dib.2018.12.055", - "start_index": 0 - } - ], - "page_number": 1, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": 
"/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "ffd4c08fe1f13ed4b1c1c523ead5510b", - "text": "Contents lists available at ScienceDirect", - "metadata": { - "languages": [ - "eng" - ], - "links": [ - { - "text": "ScienceDirect", - "url": "www.sciencedirect.com/science/journal/23523409", - "start_index": 28 - } - ], - "page_number": 1, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "ab45cdb29d177758321b79d0e5430958", - "text": "Data in Brief", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 1, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "b6ed6a9bb542e0891cebca3fa85e6bcd", - "text": "journal homepage: www.elsevier.com/locate/dib", - "metadata": { - "languages": [ - "eng" - ], - "links": [ - { - "text": "www . elsevier . 
com / locate / dib", - "url": "www.elsevier.com/locate/dib", - "start_index": 18 - } - ], - "page_number": 1, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "1acc2228e407a58c34b39c30aed641fe", - "text": "Data Article", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 1, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "798dd79fdd2f8266cf92f28200198e08", - "text": "A benchmark dataset for the multiple depot vehicle scheduling problem", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 1, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "8edd00e1188d7cb75051b1998ee494a9", - "text": "Sarang Kulkarni a,b,c,n, Mohan Krishnamoorthy d,e, Abhiram Ranade f, Andreas T. 
Ernst c, Rahul Patil b", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 1, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "7d3eb41c30b752ac6026851e8119f642", - "text": "a IITB-Monash Research Academy, IIT Bombay, Powai, Mumbai 400076, India b SJM School of Management, IIT Bombay, Powai, Mumbai 400076, India c School of Mathematical Sciences, Monash University, Clayton, VIC 3800, Australia d Department of Mechanical and Aerospace Engineering, Monash University, Clayton, VIC 3800, Australia e School of Information Technology and Electrical Engineering, The University of Queensland, QLD 4072, Australia f Department of Computer Science and Engineering, IIT Bombay, Powai, Mumbai 400076, India", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 1, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "3f086bae7b6270727b6fca8ba4563fd7", - "text": "a r t i c l e i n f o", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 1, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "a951e8fba28630797a561ae24142f1b9", - "text": "a b s t r a c t", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 1, - "filetype": "application/pdf", - 
"data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "90549df65b3824f67f0290bc96644155", - "text": "Article history: Received 21 November 2018 Received in revised form 13 December 2018 Accepted 15 December 2018 Available online 18 December 2018", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 1, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "3e158fd01d34697ac14890732b84a1fc", - "text": "This data article presents a description of a benchmark dataset for the multiple depot vehicle scheduling problem (MDVSP). The MDVSP is to assign vehicles from different depots to timetabled trips to minimize the total cost of empty travel and waiting. The dataset has been developed to evaluate the heuristics of the MDVSP that are presented in \u201cA new formulation and a column generation-based heuristic for the multiple depot vehicle sche- duling problem\u201d (Kulkarni et al., 2018). The dataset contains 60 problem instances of varying size. Researchers can use the dataset to evaluate the future algorithms for the MDVSP and compare the performance with the existing algorithms. 
The dataset includes a program that can be used to generate new problem instances of the MDVSP.", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 1, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "298de5d25d4db319d8cb1c4da4e14411", - "text": "& 2018 Published by Elsevier Inc. This is an open access article under the CC BY-NC-ND license (http://creativecommons.org/licenses/by-nc-nd/4.0/).", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 1, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "25ce21c9671271c1639f549d88644f16", - "text": "DOI of original article: https://doi.org/10.1016/j.trb.2018.11.007 n Corresponding author at: IITB-Monash Research Academy, IIT Bombay, Powai, Mumbai 400076, India.", - "metadata": { - "languages": [ - "eng" - ], - "links": [ - { - "text": "https :// doi . org / 10 . 1016 / j . trb . 2018 . 11 . 007 n Corresponding author at", - "url": "http://dx.doi.org/10.1016/j.trb.2018.11.007", - "start_index": 25 - }, - { - "text": "https", - "url": "http://dx.doi.org/10.1016/j.trb.2018.11.007", - "start_index": 25 - }, - { - "text": "https :// doi . org / 10 . 1016 / j . trb . 2018 . 11 . 
007", - "url": "http://dx.doi.org/10.1016/j.trb.2018.11.007", - "start_index": 25 - } - ], - "page_number": 1, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "b4b1b0bb1bf27aa4de6d404b9304fb02", - "text": "E-mail address: sarangkulkarni@iitb.ac.in (S. Kulkarni).", - "metadata": { - "languages": [ - "eng" - ], - "links": [ - { - "text": "sarangkulkarni @ iitb . ac . in", - "url": "mailto:sarangkulkarni@iitb.ac.in", - "start_index": 16 - } - ], - "page_number": 1, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "3bf8a8c86295c8d68682ff1c4594b485", - "text": "https://doi.org/10.1016/j.dib.2018.12.055 2352-3409/& 2018 Published by Elsevier Inc. This is an open access article under the CC BY-NC-ND license (http://creativecommons.org/licenses/by-nc-nd/4.0/).", - "metadata": { - "languages": [ - "eng" - ], - "links": [ - { - "text": "https", - "url": "https://doi.org/10.1016/j.dib.2018.12.055", - "start_index": 0 - }, - { - "text": "https", - "url": "https://doi.org/10.1016/j.dib.2018.12.055", - "start_index": 0 - }, - { - "text": "https :// doi . org / 10 . 1016 / j . dib . 2018 . 12 . 
055", - "url": "https://doi.org/10.1016/j.dib.2018.12.055", - "start_index": 0 - } - ], - "page_number": 1, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Header", - "element_id": "690f7bab68c635029827f497e6c2b218", - "text": "S. Kulkarni et al. / Data in Brief 22 (2019) 484\u2013487", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "e93f43b23b30a616389e12f193fdf212", - "text": "485", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "8b5f19753e010793be1dd03a4efe1876", - "text": "Speci\ufb01cations table", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "b592fc872f2d852ad0242b2353e61673", - "text": "Subject area Operations research More speci\ufb01c subject area Vehicle scheduling Type of data How data were acquired", - "metadata": { - "languages": 
[ - "eng" - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "d2073c6354217f9b2d4d5c654d77f232", - "text": "Tables, text \ufb01les Arti\ufb01cially generated by a C\u00fe \u00fe program on Intels Xeons CPU E5\u2013 2670 v2 with Linux operating system. Raw Sixty randomly generated instances of the MDVSP with the number of depots in (8,12,16) and the number of trips in (1500, 2000, 2500, 3000) Randomly generated instances IITB-Monash Research Academy, IIT Bombay, Powai, Mumbai, India. Data can be downloaded from https://orlib.uqcloud.net/ Kulkarni, S., Krishnamoorthy, M., Ranade, A., Ernst, A.T. and Patil, R., 2018. A new formulation and a column generation-based heuristic for the multiple depot vehicle scheduling problem. Transportation Research Part B: Methodological, 118, pp. 457\u2013487 [3].", - "metadata": { - "languages": [ - "eng" - ], - "links": [ - { - "text": "https :// orlib . uqcloud . 
net /", - "url": "https://orlib.uqcloud.net/", - "start_index": 383 - } - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "156810b54dfdfa06606b2ab9c20e5936", - "text": "Data format Experimental factors", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "f10143ddfaeadcb83593edbd06f6dae5", - "text": "Experimental features Data source location Data accessibility Related research article", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "61e613d4cdb2f24fcb40060db45431c0", - "text": "Value of the data", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "d0dfba5954b055b335476e9249b9a73c", - "text": "(cid:2) The dataset contains 60 different problem instances of the MDVSP that can be used to evaluate the", - "metadata": { 
- "languages": [ - "eng" - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "2956461e611848aeaccd16b99fc03400", - "text": "performance of the algorithms for the MDVSP.", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "2f732a3a72336ba52b0b0de6d0008640", - "text": "(cid:2) The data provide all the information that is required to model the MDVSP by using the existing", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "5bd31208ba63e7a44aeea1fd4d721d54", - "text": "mathematical formulations.", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "038f53e4bdc8c6ea7b1c63f1b9a73e2f", - "text": "(cid:2) All the problem instances are available for use without any restrictions. 
(cid:2) The benchmark solutions and solution time for the problem instances are presented in [3] and can", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "15906f62459fa76ddadb7a7ef1ce556b", - "text": "be used for the comparison.", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "4a39c62bb4f7476ec42fd81325ea6f19", - "text": "(cid:2) The dataset includes a program that can generate similar problem instances of different sizes.", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "ListItem", - "element_id": "414bd3131cd65d5c68e1c7a140297506", - "text": "1. 
Data", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "52c2b4b09c228b90a487fa4fd42a1590", - "text": "The dataset contains 60 different problem instances of the multiple depot vehicle scheduling pro- blem (MDVSP). Each problem instance is provided in a separate \ufb01le. Each \ufb01le is named as \u2018RN-m-n-k.dat\u2019, where \u2018m\u2019, \u2018n\u2019, and \u2018k\u2019 denote the number of depots, the number of trips, and the instance number \u2018RN-8\u20131500-01.dat\u2019, for is the \ufb01rst problem instance with 8 depots and 1500 trips. For the number of depots, m, we used three values, 8,12, and 16. The four values for the number of trips, n, are 1500, 2000, 2500, and 3000. For each size, \u00f0m;n\u00de, \ufb01ve instances are provided. The dataset can be downloaded from https://orlib.uqcloud.net.", - "metadata": { - "languages": [ - "eng" - ], - "links": [ - { - "text": "https :// orlib . uqcloud . 
net", - "url": "https://orlib.uqcloud.net", - "start_index": 609 - } - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "a442f6b8548f2b2be7eb0b0c488eaf3f", - "text": "\u2018\u00f0m;n\u00de\u2019,", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "a1d0fff4ecc99ed0b3792f63af7ac732", - "text": "the size,", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "18ddc61212b977693c3ab4a9e2a98213", - "text": "respectively. 
For example,", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "f5af2f4ccedef8e9c9222943207ddce1", - "text": "the problem instance,", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "20a5ace34ab61e08b1ab35c222c6554f", - "text": "For each problem instance, the following information is provided: The number of depots m\u00f0 The number of trips \u00f0n\u00de, The number of locations \u00f0l\u00de, The number of vehicles at each depot, For each trip iA1;2;\u2026;n, a start time, ts", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "f1d7de16fe466b5c9f0396600da6d3ef", - "text": "\u00de,", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": 
"d07db900a92fbc399e2eac5e0fc704ee", - "text": "i , a start location, ls", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "812eeb4f274baf14170f2447204a4a55", - "text": "i, an end time, te", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "4b917219b5939da4a52a907db733f551", - "text": "i, and an end location, le i ,", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "84e91ae08f7e4ae8996bb4cdbbfb9e32", - "text": "and", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "b1bb94d45fba27ddeefd146fbde1dcc4", - "text": "(cid:2) The travel time, \u03b4ij, between any two locations i;jA1;\u2026;l.", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 2, - 
"filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "5e73cd663ab2449350114f86e23f6bbb", - "text": "All times are in minutes and integers. The planning duration is from 5 a.m. to around midnight. Each instance has two classes of trips, short trips and long trips, with 40% short trips and 60% long trips. The duration of a short trip is less than a total of 45 min and the travel time between the start", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "87149858e00c98f10a2b08be1b8d584a", - "text": "486", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Header", - "element_id": "5fc26c03275c46c5eb2ae66c0c288d2b", - "text": "S. Kulkarni et al. 
/ Data in Brief 22 (2019) 484\u2013487", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "eeba8dd874b520a36aa718db99dbfd38", - "text": "and end location of the trip. A long trip is about 3\u20135 h in duration and has the same start and end location. For all instances, mrl and the locations 1;\u2026;m correspond to depots, while the remaining locations only appear as trip start and end locations.", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "36bb62577b390f929d88ed7d004c1e3e", - "text": "i \u00fe\u03b4", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "c4a028a7e5a91a69b88a778ed1d4c4c1", - "text": ". 
If le i ls le i j, otherwise, the vehicle may require waiting at le i for the duration of \u00f0ts", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "3351f34f87afe9cffe4fd31320b9ccc8", - "text": "Zte", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "7a378649c353830c59db2e86df7f7368", - "text": "als", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "5066fe5d8ca5d5f91f7312ec35a9a7e8", - "text": "A trip j can be covered after trip i by the same vehicle, if ts j", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "f7296ef349382c5db6f8a271d8f3fe03", - "text": "j, the vehicle must travel empty from le j (cid:3)te i \u00de. 
A schedule is given by the sequence in which a vehicle can cover the trips. The MDVSP is to determine the minimum number of schedules to cover all trips that minimizes total time in waiting and empty travel. The following requirements must be satis\ufb01ed:", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "871530d7bbaa529bbc177fc2a041720e", - "text": "j", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "bfd40d52e047822b7bc341a4741f1f73", - "text": "i to ls", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "ListItem", - "element_id": "a8f50afa154ed8c4545362eeb8ca5799", - "text": "1. Each schedule should start and end at the same depot. 2. Each trip should be covered by only one vehicle. 3. 
The number of schedules that start from a depot should not exceed the number of vehicles at the depot.", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "3dbb489d8594d6744d2fce9cdcde691c", - "text": "A suf\ufb01cient number of vehicles are provided to maintain the feasibility of an instance. For each instance size \u00f0m;n\u00de, Table 1 provides the average of the number of locations, the number of times, the number of vehicles, and the number of possible empty travels, over \ufb01ve instances. The number of locations includes m distinct locations for depots and the number of locations at which various trips start or end. The number of times includes the start and the end time of the planning horizon and the start/end times for the trips. The number of vehicles is the total number of vehicles from all the depots. The number of possible empty travels is the number of possible connections between trips that require a vehicle travelling empty between two consecutive trips in a schedule.", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "7490a379155c95007ad9649ec7689e35", - "text": "The description of the \ufb01le for each problem instance is presented in Table 2. 
The \ufb01rst line in the \ufb01le provides the number of depots \u00f0m\u00de, the number of trips, \u00f0n\u00de, and the number of locations \u00f0l\u00de, in the problem instance. The next n lines present the information for n trips. Each line corresponds to a trip, iA 1;\u2026;n g, and provides the start location, the start time, the end location, and the end time of trip i. The next l lines present the travel times between any two locations, i;jA 1;\u2026;l", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "924fc12bebb375f9c74313489cf16217", - "text": "f", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "028c5c64e9591944e620e8308f516b5a", - "text": "(cid:1)", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "ce73daceb6d992f6af62cceb4a3d424f", - "text": "(cid:3)", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": 
"/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "4c3e98e95e0007df7a9e116f5df403c8", - "text": ".", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "0b37e732b73efa9dbd994f164dac8d5c", - "text": "The dataset also includes a program \u2018GenerateInstance.cpp\u2019 that can be used to generate new instances. The program takes three inputs, the number of depots \u00f0m\u00de, the number of trips \u00f0n\u00de, and the number of instances for each size \u00f0m;n\u00de.", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "155c4752aa12e6b82164f5ac49103a19", - "text": "Table 1 Average number of locations, times, vehicles and empty travels for each instance size.", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "6d92abd137f1e1a6f7d9ecfa1edb0cf4", - "text": "Instance size (m, n)", - "metadata": { - 
"languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "bcd163c5719297fd86b9eebacf8a9c24", - "text": "Average number of", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "204a9747099a8efd4aa0b05c9e5c38d2", - "text": "Locations", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "327cb3d0fb60857fee3d8f0c2c78d613", - "text": "Times", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "6592bb72dcd3912aa6fabc3df525aeda", - "text": "Vehicles", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - 
"permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "80ce4476651a7ac735c554343aeb749f", - "text": "Possible empty travels", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "71a7492ba9c12eef52065aabaebc3a7c", - "text": "(8, 1500) (8, 2000) (8, 2500) (8, 3000) (12, 1500) (12, 2000) (12, 2500) (12, 3000) (16, 1500) (16, 2000) (16, 2500) (16, 3000)", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "7701857f59bdba5844b24edc32749d05", - "text": "568.40 672.80 923.40 977.00 566.00 732.60 875.00 1119.60 581.80 778.00 879.00 1087.20", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "2bf95679e315fbbd9f0ceb0ce36d9197", - "text": "975.20 1048.00 1078.00 1113.20 994.00 1040.60 1081.00 1107.40 985.40 1040.60 1083.20 1101.60", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": 
"/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "da4ae500af3e46e7446a28cddd32679c", - "text": "652.20 857.20 1082.40 1272.80 642.00 861.20 1096.00 1286.20 667.80 872.40 1076.40 1284.60", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "e21d6005188c8a7bfcb95e42868b986c", - "text": "668,279.40 1,195,844.80 1,866,175.20 2,705,617.00 674,191.00 1,199,659.80 1,878,745.20 2,711,180.40 673,585.80 1,200,560.80 1,879,387.00 2,684,983.60", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Header", - "element_id": "fa23407a7c3c99ae3b6fb79034698807", - "text": "S. Kulkarni et al. 
/ Data in Brief 22 (2019) 484\u2013487", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "0a4152d3ee312a3d28cc2b63d6f59a6e", - "text": "Table 2 Description of \ufb01le format for each problem instance.", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "d66486bdc6e5b4d6e2018f7da6d0b0d0", - "text": "Number of lines", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "6c56043a98b068693db3cd6ded0bc020", - "text": "Number of columns in each line", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "2fc6800b1896d3d2779ee6e98794bdb1", - "text": "Description", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - 
"record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "a5efd069cfcb8d3c983dfab2b9336b0e", - "text": "1 1 n", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "1d96bbba9ffa9a12e81da0426f80a9fc", - "text": "l", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "25f80b4c6652f9af1a6883a6e4b8c0bb", - "text": "3 m 4", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "516ec572955aa07f031d27cc89008615", - "text": "l", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - 
"element_id": "c981c256386d57e68a2c947147f30229", - "text": "The number of depots, the number of trips, and the number of locations. The number of vehicles rd at each depot d. One line for each trip, i \u00bc 1;2;\u2026;n. Each line provides the start location ls time ts i and the end time te i for the corresponding trip. Each element, \u03b4ij; where i;jA1;2;\u2026;l, refers to the travel time between location i and location j.", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "e6e8997790263be5ca103754ee56e234", - "text": "i, the start", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "49f536ed0f91f7e6d8ad1d70d71991b0", - "text": "i, the end location le", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "ListItem", - "element_id": "0f605e650a81abc6b5a30423d60d0975", - "text": "2. 
Experimental design, materials, and methods", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "37200c447b8f7e1443b707c1e76e66b0", - "text": "The procedure presented by Carpaneto et al. in [1] is used to generate the problem instances. The same procedure has been used by Pepin et al. in [4] to generate the benchmark dataset of the MDVSP. A detailed description of the procedure is presented in [3].", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "92e466c917445c0d473eea592acc3b72", - "text": "Our dataset provides start/end location and time of trips as well as the travel time between any two locations. The location and time information is required to model the MDVSP on a time-space network. The feasible connections and the cost of connections between the trips can be obtained as discussed in [3]. Thus, the dataset has all the information that is required to model the MDVSP on the time-space network (see [2]) as well as the connection-network (see [5]). 
The benchmark solutions for all the problem instances are presented in [3].", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "d89dfb5247b731abfe90aedc46c09806", - "text": "Transparency document. Supporting information", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "9a157bb2a3ee3ac55ecf743df0020ce9", - "text": "Transparency document associated with this article can be found in the online version at https://doi.", - "metadata": { - "languages": [ - "eng" - ], - "links": [ - { - "text": "https :// doi .", - "url": "https://doi.org/10.1016/j.dib.2018.12.055", - "start_index": 89 - } - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "fb1ccb68103598fae7cc8128c97711d9", - "text": "org/10.1016/j.dib.2018.12.055.", - "metadata": { - "languages": [ - "eng" - ], - "links": [ - { - "text": "org / 10 . 1016 / j . dib . 2018 . 12 . 
055", - "url": "https://doi.org/10.1016/j.dib.2018.12.055", - "start_index": 0 - } - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "a63064fd9987765c33c9d20047dc2f15", - "text": "References", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "909007a841d32eb20886f7fc2d923911", - "text": "[1] G. Carpaneto, M. Dell'Amico, M. Fischetti, P. Toth, A branch and bound algorithm for the multiple depot vehicle scheduling", - "metadata": { - "languages": [ - "eng" - ], - "links": [ - { - "text": "G . Carpaneto , M . Dell ' Amico , M . Fischetti , P . 
Toth , A branch and bound algorithm for the multiple depot vehicle scheduling", - "url": "http://refhub.elsevier.com/S2352-3409(18)31594-4/sbref1", - "start_index": 4 - } - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "b1902a32b19337484e93efd9509a07c1", - "text": "problem, Networks 19 (5) (1989) 531\u2013548.", - "metadata": { - "languages": [ - "eng" - ], - "links": [ - { - "text": "problem , Networks 19 ( 5 ) ( 1989 ) 531 \u2013", - "url": "http://refhub.elsevier.com/S2352-3409(18)31594-4/sbref1", - "start_index": 0 - }, - { - "text": "problem , Networks 19 ( 5 ) ( 1989 ) 531 \u2013 548 .", - "url": "http://refhub.elsevier.com/S2352-3409(18)31594-4/sbref1", - "start_index": 0 - } - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "5a7cc4a5afb4c97c546a3b64cb4f593f", - "text": "[2] N. Kliewer, T. Mellouli, L. Suhl, A time\u2013space network based exact optimization model for multi-depot bus scheduling, Eur.", - "metadata": { - "languages": [ - "eng" - ], - "links": [ - { - "text": "N . Kliewer , T . Mellouli , L . 
Suhl , A time \u2013 space network based exact optimization model for multi - depot bus scheduling , Eur .", - "url": "http://refhub.elsevier.com/S2352-3409(18)31594-4/sbref2", - "start_index": 4 - } - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "6a1cb7145ede91c5d2e6bb53b4d59f65", - "text": "J. Oper. Res. 175 (3) (2006) 1616\u20131627.", - "metadata": { - "languages": [ - "eng" - ], - "links": [ - { - "text": "J . Oper . Res . 175 ( 3 ) ( 2006 ) 1616 \u2013", - "url": "http://refhub.elsevier.com/S2352-3409(18)31594-4/sbref2", - "start_index": 0 - }, - { - "text": "J . Oper . Res . 175 ( 3 ) ( 2006 ) 1616 \u2013 1627 .", - "url": "http://refhub.elsevier.com/S2352-3409(18)31594-4/sbref2", - "start_index": 0 - } - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "439a02aad982d445100cc246cd066b53", - "text": "[3] S. Kulkarni, M. Krishnamoorthy, A. Ranade, A.T. Ernst, R. Patil, A new formulation and a column generation-based heuristic", - "metadata": { - "languages": [ - "eng" - ], - "links": [ - { - "text": "S . Kulkarni , M . Krishnamoorthy , A . Ranade , A . T . Ernst , R . 
Patil , A new formulation and a column generation - based heuristic", - "url": "http://refhub.elsevier.com/S2352-3409(18)31594-4/sbref3", - "start_index": 4 - } - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "46a8bd54aa6c1bd32118f4a681faaec9", - "text": "for the multiple depot vehicle scheduling problem, Transp. Res. Part B Methodol. 118 (2018) 457\u2013487.", - "metadata": { - "languages": [ - "eng" - ], - "links": [ - { - "text": "for the multiple depot vehicle scheduling problem , Transp . Res . Part B Methodol . 118 ( 2018 ) 457 \u2013", - "url": "http://refhub.elsevier.com/S2352-3409(18)31594-4/sbref3", - "start_index": 0 - }, - { - "text": "for the multiple depot vehicle scheduling problem , Transp . Res . Part B Methodol . 118 ( 2018 ) 457 \u2013 487 .", - "url": "http://refhub.elsevier.com/S2352-3409(18)31594-4/sbref3", - "start_index": 0 - } - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "f60e59177f5f0e53e3f285fa68a8e3ef", - "text": "[4] A.S. Pepin, G. Desaulniers, A. Hertz, D. Huisman, A comparison of \ufb01ve heuristics for the multiple depot vehicle scheduling", - "metadata": { - "languages": [ - "eng" - ], - "links": [ - { - "text": "A . S . Pepin , G . Desaulniers , A . Hertz , D . 
Huisman , A comparison of \ufb01ve heuristics for the multiple depot vehicle scheduling", - "url": "http://refhub.elsevier.com/S2352-3409(18)31594-4/sbref4", - "start_index": 4 - } - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "0f8229a10050ec65ae5b6f9f66c6ca47", - "text": "problem, J. Sched. 12 (1) (2009) 17.", - "metadata": { - "languages": [ - "eng" - ], - "links": [ - { - "text": "problem , J . Sched . 12 ( 1 ) ( 2009 ) 17 .", - "url": "http://refhub.elsevier.com/S2352-3409(18)31594-4/sbref4", - "start_index": 0 - } - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "9f411677c0a8ddb06047e600b348e282", - "text": "[5] C.C. Ribeiro, F. Soumis, A column generation approach to the multiple-depot vehicle scheduling problem, Oper. Res. 42 (1)", - "metadata": { - "languages": [ - "eng" - ], - "links": [ - { - "text": "C . C . Ribeiro , F . Soumis , A column generation approach to the multiple - depot vehicle scheduling problem , Oper . Res . 
42 ( 1 )", - "url": "http://refhub.elsevier.com/S2352-3409(18)31594-4/sbref5", - "start_index": 4 - } - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "e37f78c7271830eb805f560368fec7cc", - "text": "(1994) 41\u201352.", - "metadata": { - "languages": [ - "eng" - ], - "links": [ - { - "text": "( 1994 ) 41 \u2013", - "url": "http://refhub.elsevier.com/S2352-3409(18)31594-4/sbref5", - "start_index": 0 - }, - { - "text": "( 1994 ) 41 \u2013 52 .", - "url": "http://refhub.elsevier.com/S2352-3409(18)31594-4/sbref5", - "start_index": 0 - } - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "94e316e08a4a19eed59d29d5d58703ce", - "text": "487", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - } -] \ No newline at end of file diff --git a/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/biomed-path/07/07/sbaa031.073.PMC7234218.pdf.json b/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/biomed-path/07/07/sbaa031.073.PMC7234218.pdf.json deleted file mode 100644 index ed4d55b17c..0000000000 --- 
a/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/biomed-path/07/07/sbaa031.073.PMC7234218.pdf.json +++ /dev/null @@ -1,310 +0,0 @@ -[ - { - "type": "Header", - "element_id": "13c2cd4a987063cb9fe6802f8d9d8bba", - "text": "S32", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 1, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-path/07/07/sbaa031.073.PMC7234218.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "6e95de55fbc805ac11d5e168881e41eb", - "text": "ns; 40 mg/day=3.6%, p<0.05; 80 mg/day=4.9%, p<0.01; 120 mg/day=9.3%, p<0.001, PM dosing group: 20 mg/day=-0.4%, ns; 40 mg/day=2.8%, p<0.05: 80 mg/day=0.2%, ns; 160 mg/day=5.8%, p<0.05). There was no clear dose-dependent trend associated with nausea and RD was similar between AM and PM dosing group (AM dosing group: 20 mg/ day=0.2% ns; 40 mg/day=3.8%, p<0.05; 80 mg/day=3.8%, ns; 120 mg/ day=6.6%, ns, PM dosing group: 20 mg/day=-1.6%, ns; 40 mg/day=-1.7%, ns; 80 mg/day=5.5%, p<0.01; 160 mg/day=2.8%, ns). Discussion: The risk of adverse events in the treatment of schizophrenia with lurasidone can vary depending on the timing of administration. In particular, for akathisia and somnolence, the incidence risks were reduced when lurasidone was administered in PM. Unlike with AM administration, the dose-dependence in the risks of these adverse events were not observed in lurasidone PM administration. 
The timing of lurasidone administration could be considered in effort to minimize potential adverse events.", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 1, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-path/07/07/sbaa031.073.PMC7234218.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "c0ad446ac0e663713724aa5f42d20448", - "text": "S6. SLEEP ENDOPHENOTYPES OF SCHIZOPHRENIA: A HIGH-DENSITY EEG STUDY IN DRUG-NA\u00cfVE, FIRST EPISODE PSYCHOSIS PATIENTS", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 1, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-path/07/07/sbaa031.073.PMC7234218.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "21facf77763c3e990a3db1b8626c133a", - "text": "Anna Castelnovo1, Cecilia Casetta2, Francesco Donati3, Renata del Giudice3, Caroline Zangani3, Simone Sarasso3, Armando D\u2019Agostino*3 1Faculty of Biomedical Sciences, Universit\u00e0 della Svizzera Italiana, Switzerland; 2Institute of Psychiatry, Psychology and Neuroscience, King\u2019s College London, England; 3Universit\u00e0 degli Studi di Milano, Italy", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 1, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-path/07/07/sbaa031.073.PMC7234218.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "26b6989522e94c2c7ef5c2633e41cf72", - "text": "Background: Slow waves, the hallmark of the deep nonrapid eye move- ment 
sleep electroencephalogram (EEG), are critical for restorative sleep and brain plasticity. They arise from the synchronous depolarization and hyperpolarization of millions of cortical neurons and their proper gen- eration and propagation relies upon the integrity of widespread cortico- thalamic networks. Slow wave abnormalities have been reported in patient with Schizophrenia, although with partially contradictory results, probably related to antipsychotic and sedative medications. Recently, their presence and delineation, have been convincingly shown in first-episode psychosis patients (FEP). However, clear evidence of this biomarker at the onset of the disease, prior to any psychopharmacological intervention, remains limited. Moreover, no attempt has been made to elucidate the prognostic meaning of this finding. Methods: We collected whole night sleep high\u2013density electroencephalog- raphy recordings (64-channel BrainAmp, Brain Products GmbH, Gilching, Germany) in 20 drug-naive FEP patients and 20 healthy control subjects (HC). Several clinical psychometric scales as well as neurocognitive tests were administered to all subjects in order to better define psychopatholog- ical status and vulnerability. EEG slow wave activity (SWA, spectral power between 1 and 4 Hz) and several slow wave parameters were computed at each electrode location, including density and amplitude, at each electrode location. Along with a group analysis between FEP and HC, a subgroup analysis was also computed between patients who showed a progression of symptoms to full-blown Schizophrenia (SCZ, n = 10) over the next 12-month follow-up and those who did not (OTH, n = 10). Results: Sleep macro-architecture was globally preserved in FEP patients. SWA (1\u20134 Hz) was lower in FEP compared to HC but this difference didn\u2019t reach statistical significance. 
Slow wave density was decreased in FEP compared to HC, with a significance that survived multiple comparison correction over a large fronto-central cluster. Mean amplitude was pre- served. At the subgroup analysis, these results were largely driven by the subgroup of patients with a confirmed diagnosis of SCZ at a 12-month fol- low-up. Indeed, no difference could be found between OTH and HC, while a strong significance was still evident between SCZ and HC.", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 1, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-path/07/07/sbaa031.073.PMC7234218.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Footer", - "element_id": "b38798d4ed1cda1c49ed2db924d40039", - "text": "SIRS 2020 Abstracts", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 1, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-path/07/07/sbaa031.073.PMC7234218.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Header", - "element_id": "6681a3fc2e2bbc7efabbf221baaeec6b", - "text": "Poster Session I", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 1, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-path/07/07/sbaa031.073.PMC7234218.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "418368d1fe238e68fc6c8663f7485649", - "text": "Discussion: Our data confirm previous findings on reduced slow wave density in FEP, and expand them to acute subjects, before any treatment is prescribed. 
This is in line with available data on diffuse abnormalities of cortico-cortical and cortico-thalamic networks in these patients. Interestingly, our data also offer preliminary evidence that this deficit is specific for SCZ, as it appears to differentiate patients who developed SCZ from those with other diagnoses at follow-up. Given the traveling properties of slow waves, future research should establish their potential as markers of connectivity in SCZ.", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 1, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-path/07/07/sbaa031.073.PMC7234218.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "2693595cd6fc5be02dc752b089f85eea", - "text": "S7. INVESTIGATING THE LINK BETWEEN THE PERIPHERAL ENDOCANNABINOID SYSTEM AND CENTRAL GLUTAMATERGIC NEUROTRANSMISSION IN EARLY PSYCHOSIS: A 7T-MRS STUDY", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 1, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-path/07/07/sbaa031.073.PMC7234218.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "3f2d8de4445801a7562416267c06a877", - "text": "Amedeo Minichino*1, Beata Godlewska1, Philip Cowen1, Philip Burnet1, Belinda Lennox1 1University of Oxford", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 1, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-path/07/07/sbaa031.073.PMC7234218.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - 
"element_id": "741c946db28df5068fb60063dad37d27", - "text": "Background: Meta-analytic evidence showed increased levels of periph- eral endocannabinoid metabolites in psychotic illness. Alterations in the endocannabinoid system are believed to compromise glutamate and do- pamine transmission, which play a central role in pathophysiological models of psychosis. I will present preliminary data from an ongoing high-field proton magnetic resonance spectroscopy (MRS) study aimed at investigating the association between peripheral levels of endocannabinoid system metabolites and central glutamate metabolism in individuals at their first non-affective psychotic episode (NA-FEP) and healthy controls. Methods: We expect to recruit 17 NA-FEP and 20 healthy controls by January 2020. Currently, we recruited 12 NA-FEP and 18 healthy controls from two different research facilities (Imperial College London and University of Oxford) as part of a cross-sectional study. Participants un- derwent MRS scanning at 7-T with voxels placed in right dorsolateral prefrontal cortex (right-DLPFC), anterior cingulate cortex (ACC), and oc- cipital cortex. Neuro-metabolites will be calculated using the unsuppressed water signal as reference. Endocannabinoid metabolites were quantified from serum samples, collected during the same imaging session. Results: Analyses are ongoing. Based on previous evidence, expected findings are: (i) reduced glutamate levels in the ACC and right-DLPFC of NA-FEP compared to controls; (ii) increased peripheral endocannabinoid metabolites in NA-FEP compared to controls; and (iii) inverse association between peripheral endocannabinoid metabolites and glutamate levels in ACC and right-DLPFC in NA-FEP Discussion: This study will help clarifying the contribution of peripheral endocannabinoid system to central brain mechanisms of key relevance for psychotic illness. 
It will also add further evidence on the limited literature on high-resolution characterisation of brain metabolites in early psychosis. Strengths of the study include: (i) use of high-field MRS, which allows the estimation of glutamate-related compounds at higher precision than at lower field strength; (ii) reduced heterogeneity of the clinical sample (only male and NA-FEP). Limitations: small sample size and cross-sectional design.", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 1, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-path/07/07/sbaa031.073.PMC7234218.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "c1543aee0d7efb59052757f7b83a70a9", - "text": "S8. GRIN1 PROMOTER METHYLATION CHANGES IN BLOOD OF EARLY-ONSET PSYCHOTIC PATIENTS AND UNAFFECTED SIBLINGS WITH CHILDHOOD TRAUMA", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 1, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-path/07/07/sbaa031.073.PMC7234218.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "5afb27a02de3e7a95c0f2fa442e32526", - "text": "Camila Loureiro*1, Corsi-Zuelli Fabiana1, Fachim Helene Aparecida1, Shuhama Rosana1, Menezes Paulo Rossi1, Dalton Caroline F2,", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 1, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-path/07/07/sbaa031.073.PMC7234218.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": 
"0d80b62dd72121dd5263df8605849cf4", - "text": "AQ3", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 1, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-path/07/07/sbaa031.073.PMC7234218.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - } -] \ No newline at end of file diff --git a/test_unstructured_ingest/expected-structured-output/s3-minio/wiki_movie_plots_small.csv.json b/test_unstructured_ingest/expected-structured-output/s3-minio/wiki_movie_plots_small.csv.json index b2b5163a55..4ab84f68e4 100644 --- a/test_unstructured_ingest/expected-structured-output/s3-minio/wiki_movie_plots_small.csv.json +++ b/test_unstructured_ingest/expected-structured-output/s3-minio/wiki_movie_plots_small.csv.json @@ -2,9 +2,9 @@ { "type": "Table", "element_id": "dd7ef5654ad25579067c5f95d3515acf", - "text": "Release Year Title Origin/Ethnicity Director Cast Genre Wiki Page Plot 1901 Kansas Saloon Smashers American Unknown unknown https://en.wikipedia.org/wiki/Kansas_Saloon_Smashers A bartender is working at a saloon, serving drinks to customers. After he fills a stereotypically Irish man's bucket with beer, Carrie Nation and her followers burst inside. They assault the Irish man, pulling his hat over his eyes and then dumping the beer over his head. The group then begin wrecking the bar, smashing the fixtures, mirrors, and breaking the cash register. The bartender then sprays seltzer water in Nation's face before a group of policemen appear and order everybody to leave.[1] 1901 Love by the Light of the Moon American Unknown unknown https://en.wikipedia.org/wiki/Love_by_the_Light_of_the_Moon The moon, painted with a smiling face hangs over a park at night. A young couple walking past a fence learn on a railing and look up. The moon smiles. They embrace, and the moon's smile gets bigger. They then sit down on a bench by a tree. 
The moon's view is blocked, causing him to frown. In the last scene, the man fans the woman with his hat because the moon has left the sky and is perched over her shoulder to see everything better. 1901 The Martyred Presidents American Unknown unknown https://en.wikipedia.org/wiki/The_Martyred_Presidents The film, just over a minute long, is composed of two shots. In the first, a girl sits at the base of an altar or tomb, her face hidden from the camera. At the center of the altar, a viewing portal displays the portraits of three U.S. Presidents\u2014Abraham Lincoln, James A. Garfield, and William McKinley\u2014each victims of assassination.\\r\\nIn the second shot, which runs just over eight seconds long, an assassin kneels feet of Lady Justice. 1901 Terrible Teddy, the Grizzly King American Unknown unknown https://en.wikipedia.org/wiki/Terrible_Teddy,_the_Grizzly_King Lasting just 61 seconds and consisting of two shots, the first shot is set in a wood during winter. The actor representing then vice-president Theodore Roosevelt enthusiastically hurries down a hillside towards a tree in the foreground. He falls once, but rights himself and cocks his rifle. Two other men, bearing signs reading \"His Photographer\" and \"His Press Agent\" respectively, follow him into the shot; the photographer sets up his camera. \"Teddy\" aims his rifle upward at the tree and fells what appears to be a common house cat, which he then proceeds to stab. \"Teddy\" holds his prize aloft, and the press agent takes notes. The second shot is taken in a slightly different part of the wood, on a path. \"Teddy\" rides the path on his horse towards the camera and out to the left of the shot, followed closely by the press agent and photographer, still dutifully holding their signs. 1902 Jack and the Beanstalk American George S. Fleming, Edwin S. 
Porter unknown https://en.wikipedia.org/wiki/Jack_and_the_Beanstalk_(1902_film) The earliest known adaptation of the classic fairytale, this films shows Jack trading his cow for the beans, his mother forcing him to drop them in the front yard, and beig forced upstairs. As he sleeps, Jack is visited by a fairy who shows him glimpses of what will await him when he ascends the bean stalk. In this version, Jack is the son of a deposed king. When Jack wakes up, he finds the beanstalk has grown and he climbs to the top where he enters the giant's home. The giant finds Jack, who narrowly escapes. The giant chases Jack down the bean stalk, but Jack is able to cut it down before the giant can get to safety. He falls and is killed as Jack celebrates. The fairy then reveals that Jack may return home as a prince. 1903 Alice in Wonderland American Cecil Hepworth May Clark unknown https://en.wikipedia.org/wiki/Alice_in_Wonderland_(1903_film) Alice follows a large white rabbit down a \"Rabbit-hole\". She finds a tiny door. When she finds a bottle labeled \"Drink me\", she does, and shrinks, but not enough to pass through the door. She then eats something labeled \"Eat me\" and grows larger. She finds a fan when enables her to shrink enough to get into the \"Garden\" and try to get a \"Dog\" to play with her. She enters the \"White Rabbit's tiny House,\" but suddenly resumes her normal size. In order to get out, she has to use the \"magic fan.\"\\r\\nShe enters a kitchen, in which there is a cook and a woman holding a baby. She persuades the woman to give her the child and takes the infant outside after the cook starts throwing things around. The baby then turns into a pig and squirms out of her grip. 
\"The Duchess's Cheshire Cat\" appears and disappears a couple of times to Alice and directs her to the Mad Hatter's \"Mad Tea-Party.\" After a while, she leaves.\\r\\nThe Queen invites Alice to join the \"ROYAL PROCESSION\": a parade of marching playing cards and others headed by the White Rabbit. When Alice \"unintentionally offends the Queen\", the latter summons the \"Executioner\". Alice \"boxes the ears\", then flees when all the playing cards come for her. Then she wakes up and realizes it was all a dream. 1903 The Great Train Robbery American Edwin S. Porter western https://en.wikipedia.org/wiki/The_Great_Train_Robbery_(1903_film) The film opens with two bandits breaking into a railroad telegraph office, where they force the operator at gunpoint to have a train stopped and to transmit orders for the engineer to fill the locomotive's tender at the station's water tank. They then knock the operator out and tie him up. As the train stops it is boarded by the bandits\u200d\u2014\u200cnow four. Two bandits enter an express car, kill a messenger and open a box of valuables with dynamite; the others kill the fireman and force the engineer to halt the train and disconnect the locomotive. The bandits then force the passengers off the train and rifle them for their belongings. One passenger tries to escape but is instantly shot down. Carrying their loot, the bandits escape in the locomotive, later stopping in a valley where their horses had been left.\\r\\nMeanwhile, back in the telegraph office, the bound operator awakens, but he collapses again. His daughter arrives bringing him his meal and cuts him free, and restores him to consciousness by dousing him with water.\\r\\nThere is some comic relief at a dance hall, where an Eastern stranger is forced to dance while the locals fire at his feet. The door suddenly opens and the telegraph operator rushes in to tell them of the robbery. 
The men quickly form a posse, which overtakes the bandits, and in a final shootout kills them all and recovers the stolen mail. 1904 The Suburbanite American Wallace McCutcheon comedy https://en.wikipedia.org/wiki/The_Suburbanite The film is about a family who move to the suburbs, hoping for a quiet life. Things start to go wrong, and the wife gets violent and starts throwing crockery, leading to her arrest. 1905 The Little Train Robbery American Edwin Stanton Porter unknown https://en.wikipedia.org/wiki/The_Little_Train_Robbery The opening scene shows the interior of the robbers' den. The walls are decorated with the portraits of notorious criminals and pictures illustrating the exploits of famous bandits. Some of the gang are lounging about, while others are reading novels and illustrated papers. Although of youthful appearance, each is dressed like a typical Western desperado. The \"Bandit Queen,\" leading a blindfolded new recruit, now enters the room. He is led to the center of the room, raises his right hand and is solemnly sworn in. When the bandage is removed from his eyes he finds himself looking into the muzzles of a dozen or more 45's. The gang then congratulates the new member and heartily shake his hand. The \"Bandit Queen\" who is evidently the leader of the gang, now calls for volunteers to hold up a train. All respond, but she picks out seven for the job who immediately leave the cabin.\\r\\nThe next scene shows the gang breaking into a barn. They steal ponies and ride away. Upon reaching the place agreed upon they picket their ponies and leaving them in charge of a trusted member proceed to a wild mountain spot in a bend of the railroad, where the road runs over a steep embankment. The spot is an ideal one for holding up a train. Cross ties are now placed on the railroad track and the gang hide in some bushes close by and wait for the train. The train soon approaches and is brought to a stop. 
The engineer leaves his engine and proceeds to remove the obstruction on the track. While he is bending over one of the gang sneaks up behind them and hits him on the head with an axe, and knocks him senseless down the embankment, while the gang surround the train and hold up the passengers. After securing all the \"valuables,\" consisting principally of candy and dolls, the robbers uncouple the engine and one car and make their escape just in time to avoid a posse of police who appear on the scene. Further up the road they abandon the engine and car, take to the woods and soon reach their ponies.\\r\\nIn the meantime the police have learned the particulars of the hold-up from the frightened passengers and have started up the railroad tracks after the fleeing robbers. The robbers are next seen riding up the bed of a shallow stream and finally reach their den, where the remainder of the gang have been waiting for them. Believing they have successfully eluded their pursuers, they proceed to divide the \"plunder.\" The police, however, have struck the right trail and are in close pursuit. While the \"plunder\" is being divided a sentry gives the alarm and the entire gang, abandoning everything, rush from the cabin barely in time to escape capture. The police make a hurried search and again start in pursuit. The robbers are so hard pressed that they are unable to reach their ponies, and are obliged to take chances on foot. The police now get in sight of the fleeing robbers and a lively chase follows through tall weeds, over a bridge and up a steep hill. Reaching a pond the police are close on their heels. The foremost robbers jump in clothes and all and strike out for the opposite bank. Two hesitate and are captured. Boats are secured and after an exciting tussle the entire gang is rounded up. In the mix up one of the police is dragged overboard. 
The final scene shows the entire gang of bedraggled and crestfallen robbers tied together with a rope and being led away by the police. Two of the police are loaded down with revolvers, knives and cartridge belts, and resemble walking aresenals. As a fitting climax a confederate steals out of the woods, cuts the rope and gallantly rescues the \"Bandit Queen.\" 1905 The Night Before Christmas American Edwin Stanton Porter unknown https://en.wikipedia.org/wiki/The_Night_Before_Christmas_(1905_film) Scenes are introduced using lines of the poem.[2] Santa Claus, played by Harry Eytinge, is shown feeding real reindeer[4] and finishes his work in the workshop. Meanwhile, the children of a city household hang their stockings and go to bed, but unable to sleep they engage in a pillow fight. Santa Claus leaves his home on a sleigh with his reindeer. He enters the children's house through the chimney, and leaves the presents. The children come down the stairs and enjoy their presents. 1906 Dream of a Rarebit Fiend American Wallace McCutcheon and Edwin S. Porter short https://en.wikipedia.org/wiki/Dream_of_a_Rarebit_Fiend_(1906_film) The Rarebit Fiend gorges on Welsh rarebit at a restaurant. When he leaves, he begins to get dizzy as he starts to hallucinate. He desperately tries to hang onto a lamppost as the world spins all around him. A man helps him get home. He falls into bed and begins having more hallucinatory dreams. During a dream sequence, the furniture begins moving around the room. Imps emerge from a floating Welsh rarebit container and begin poking his head as he sleeps. His bed then begins dancing and spinning wildly around the room before flying out the window with the Fiend in it. The bed floats across the city as the Fiend floats up and off the bed. He hangs off the back and eventually gets caught on a weathervane atop a steeple. His bedclothes tear and he falls from the sky, crashing through his bedroom ceiling. 
The Fiend awakens from the dream after falling out of his bed. 1906 From Leadville to Aspen: A Hold-Up in the Rockies American Francis J. Marion and Wallace McCutcheon short action/crime western https://en.wikipedia.org/wiki/From_Leadville_to_Aspen:_A_Hold-Up_in_the_Rockies The film features a train traveling through the Rockies and a hold up created by two thugs placing logs on the line. They systematically rob the wealthy occupants at gunpoint and then make their getaway along the tracks and later by a hi-jacked horse and cart. 1906 Kathleen Mavourneen American Edwin S. Porter short film https://en.wikipedia.org/wiki/Kathleen_Mavourneen_(1906_film) Irish villager Kathleen is a tenant of Captain Clearfield, who controls local judges and criminals. Her father owes Clearfield a large debt. Terence O'More saves the village from Clearfield, causing a large celebration.\\r\\nFilm historian Charles Musser writes of Porter's adaptation, \"O'More not only rescues Kathleen from the villain but, through marriage, renews the family for another generation.\"[1] 1907 Daniel Boone American Wallace McCutcheon and Ediwin S. Porter William Craven, Florence Lawrence biographical https://en.wikipedia.org/wiki/Daniel_Boone_(1907_film) Boone's daughter befriends an Indian maiden as Boone and his companion start out on a hunting expedition. While he is away, Boone's cabin is attacked by the Indians, who set it on fire and abduct Boone's daughter. Boone returns, swears vengeance, then heads out on the trail to the Indian camp. His daughter escapes but is chased. The Indians encounter Boone, which sets off a huge fight on the edge of a cliff. A burning arrow gets shot into the Indian camp. Boone gets tied to the stake and tortured. The burning arrow sets the Indian camp on fire, causing panic. 
Boone is rescued by his horse, and Boone has a knife fight in which he kills the Indian chief.[2] 1907 How Brown Saw the Baseball Game American Unknown Unknown comedy https://en.wikipedia.org/wiki/How_Brown_Saw_the_Baseball_Game Before heading out to a baseball game at a nearby ballpark, sports fan Mr. Brown drinks several highball cocktails. He arrives at the ballpark to watch the game, but has become so inebriated that the game appears to him in reverse, with the players running the bases backwards and the baseball flying back into the pitcher's hand. After the game is over, Mr. Brown is escorted home by one of his friends. When they arrive at Brown's house, they encounter his wife who becomes furious with the friend and proceeds to physically assault him, believing he is responsible for her husband's severe intoxication.[1] 1907 Laughing Gas American Edwin Stanton Porter Bertha Regustus, Edward Boulden comedy https://en.wikipedia.org/wiki/Laughing_Gas_(film)#1907_Film The plot is that of a black woman going to the dentist for a toothache and being given laughing gas. On her way walking home, and in other situations, she can't stop laughing, and everyone she meets \"catches\" the laughter from her, including a vendor and police officers. 1908 The Adventures of Dollie American D. W. Griffith Arthur V. Johnson, Linda Arvidson drama https://en.wikipedia.org/wiki/The_Adventures_of_Dollie On a beautiful summer day a father and mother take their daughter Dollie on an outing to the river. The mother refuses to buy a gypsy's wares. The gypsy tries to rob the mother, but the father drives him off. The gypsy returns to the camp and devises a plan. They return and kidnap Dollie while her parents are distracted. A rescue crew is organized, but the gypsy takes Dollie to his camp. They gag Dollie and hide her in a barrel before the rescue party gets to the camp. Once they leave the gypsies and escapes in their wagon. 
As the wagon crosses the river, the barrel falls into the water. Still sealed in the barrel, Dollie is swept downstream in dangerous currents. A boy who is fishing in the river finds the barrel, and Dollie is reunited safely with her parents. 1908 The Black Viper American D. W. Griffith D. W. Griffith drama https://en.wikipedia.org/wiki/The_Black_Viper A thug accosts a girl as she leaves her workplace but a man rescues her. The thug vows revenge and, with the help of two friends, attacks the girl and her rescuer again as they're going for a walk. This time they succeed in kidnapping the rescuer. He is bound and gagged and taken away in a cart. The girl runs home and gets help from several neighbors. They track the ruffians down to a cabin in the mountains where the gang has trapped their victim and set the cabin on fire. A thug and Rescuer fight on the roof of the house. 1908 A Calamitous Elopement American D.W. Griffith Harry Solter, Linda Arvidson comedy https://en.wikipedia.org/wiki/A_Calamitous_Elopement A young couple decides to elope after being caught in the midst of a romantic moment by the woman's angry father. They make plans to leave, but a thief discovers their plans and hides in their trunk and waits for the right moment to steal their belongings. 1908 The Call of the Wild American D. W. Griffith Charles Inslee adventure https://en.wikipedia.org/wiki/The_Call_of_the_Wild_(1908_film) A white girl (Florence Lawrence) rejects a proposal from an Indian brave (Charles Inslee) in this early one-reel Western melodrama. Despite the rejection, the Indian still comes to the girl's defense when she is abducted by his warring tribe. In her first year in films, Florence Lawrence was already the most popular among the Biograph Company's anonymous stock company players. 
By 1909, she was known the world over as \"The Biograph Girl.\" 1908 A Christmas Carol American Unknown Tom Ricketts drama https://en.wikipedia.org/wiki/A_Christmas_Carol_(1908_film) No prints of the first American film adaptation of A Christmas Carol are known to exist,[1] but The Moving Picture World magazine provided a scene-by-scene description before the film's release.[2] Scrooge goes into his office and begins working. His nephew, along with three women who wish for Scrooge to donate enter. However, Scrooge dismisses them. On the night of Christmas Eve, his long-dead partner Jacob Marley comes as a ghost, warning him of a horrible fate if he does not change his ways. Scrooge meets three spirits that show Scrooge the real meaning of Christmas, along with his grave, the result of his parsimonious ways. The next morning, he wakes and realizes the error of his ways. Scrooge was then euphoric and generous for the rest of his life. 1908 The Fight for Freedom American D. W. Griffith Florence Auer, John G. Adolfi western https://en.wikipedia.org/wiki/The_Fight_for_Freedom The film opens in a town on the Mexican border. A poker game is going on in the local saloon. One of the players cheats and is shot dead by another of the players, a Mexican named Pedro. In the uproar that follows Pedro is wounded as he escapes from the saloon. The sheriff is called, who tracks Pedro to his home but Pedro kills the sherriff too. While Pedro hides, his wife Juanita, is arrested on suspicion of murdering the sheriff. Pedro rescues her from the town jail and the two head for the Mexican border. Caught by the posse before they reach the border, Juanita is killed and the film ends with Pedro being arrested and taken back to town.", + "text": "Release Year Title Origin/Ethnicity Director Cast Genre Wiki Page Plot 1901 Kansas Saloon Smashers American Unknown unknown https://en.wikipedia.org/wiki/Kansas_Saloon_Smashers A bartender is working at a saloon, serving drinks to customers. 
After he fills a stereotypically Irish man's bucket with beer, Carrie Nation and her followers burst inside. They assault the Irish man, pulling his hat over his eyes and then dumping the beer over his head. The group then begin wrecking the bar, smashing the fixtures, mirrors, and breaking the cash register. The bartender then sprays seltzer water in Nation's face before a group of policemen appear and order everybody to leave.[1] 1901 Love by the Light of the Moon American Unknown unknown https://en.wikipedia.org/wiki/Love_by_the_Light_of_the_Moon The moon, painted with a smiling face hangs over a park at night. A young couple walking past a fence learn on a railing and look up. The moon smiles. They embrace, and the moon's smile gets bigger. They then sit down on a bench by a tree. The moon's view is blocked, causing him to frown. In the last scene, the man fans the woman with his hat because the moon has left the sky and is perched over her shoulder to see everything better. 1901 The Martyred Presidents American Unknown unknown https://en.wikipedia.org/wiki/The_Martyred_Presidents The film, just over a minute long, is composed of two shots. In the first, a girl sits at the base of an altar or tomb, her face hidden from the camera. At the center of the altar, a viewing portal displays the portraits of three U.S. Presidents—Abraham Lincoln, James A. Garfield, and William McKinley—each victims of assassination.\\r\\nIn the second shot, which runs just over eight seconds long, an assassin kneels feet of Lady Justice. 1901 Terrible Teddy, the Grizzly King American Unknown unknown https://en.wikipedia.org/wiki/Terrible_Teddy,_the_Grizzly_King Lasting just 61 seconds and consisting of two shots, the first shot is set in a wood during winter. The actor representing then vice-president Theodore Roosevelt enthusiastically hurries down a hillside towards a tree in the foreground. He falls once, but rights himself and cocks his rifle. 
Two other men, bearing signs reading \"His Photographer\" and \"His Press Agent\" respectively, follow him into the shot; the photographer sets up his camera. \"Teddy\" aims his rifle upward at the tree and fells what appears to be a common house cat, which he then proceeds to stab. \"Teddy\" holds his prize aloft, and the press agent takes notes. The second shot is taken in a slightly different part of the wood, on a path. \"Teddy\" rides the path on his horse towards the camera and out to the left of the shot, followed closely by the press agent and photographer, still dutifully holding their signs. 1902 Jack and the Beanstalk American George S. Fleming, Edwin S. Porter unknown https://en.wikipedia.org/wiki/Jack_and_the_Beanstalk_(1902_film) The earliest known adaptation of the classic fairytale, this films shows Jack trading his cow for the beans, his mother forcing him to drop them in the front yard, and beig forced upstairs. As he sleeps, Jack is visited by a fairy who shows him glimpses of what will await him when he ascends the bean stalk. In this version, Jack is the son of a deposed king. When Jack wakes up, he finds the beanstalk has grown and he climbs to the top where he enters the giant's home. The giant finds Jack, who narrowly escapes. The giant chases Jack down the bean stalk, but Jack is able to cut it down before the giant can get to safety. He falls and is killed as Jack celebrates. The fairy then reveals that Jack may return home as a prince. 1903 Alice in Wonderland American Cecil Hepworth May Clark unknown https://en.wikipedia.org/wiki/Alice_in_Wonderland_(1903_film) Alice follows a large white rabbit down a \"Rabbit-hole\". She finds a tiny door. When she finds a bottle labeled \"Drink me\", she does, and shrinks, but not enough to pass through the door. She then eats something labeled \"Eat me\" and grows larger. She finds a fan when enables her to shrink enough to get into the \"Garden\" and try to get a \"Dog\" to play with her. 
She enters the \"White Rabbit's tiny House,\" but suddenly resumes her normal size. In order to get out, she has to use the \"magic fan.\"\\r\\nShe enters a kitchen, in which there is a cook and a woman holding a baby. She persuades the woman to give her the child and takes the infant outside after the cook starts throwing things around. The baby then turns into a pig and squirms out of her grip. \"The Duchess's Cheshire Cat\" appears and disappears a couple of times to Alice and directs her to the Mad Hatter's \"Mad Tea-Party.\" After a while, she leaves.\\r\\nThe Queen invites Alice to join the \"ROYAL PROCESSION\": a parade of marching playing cards and others headed by the White Rabbit. When Alice \"unintentionally offends the Queen\", the latter summons the \"Executioner\". Alice \"boxes the ears\", then flees when all the playing cards come for her. Then she wakes up and realizes it was all a dream. 1903 The Great Train Robbery American Edwin S. Porter western https://en.wikipedia.org/wiki/The_Great_Train_Robbery_(1903_film) The film opens with two bandits breaking into a railroad telegraph office, where they force the operator at gunpoint to have a train stopped and to transmit orders for the engineer to fill the locomotive's tender at the station's water tank. They then knock the operator out and tie him up. As the train stops it is boarded by the banditsâ€â€”‌now four. Two bandits enter an express car, kill a messenger and open a box of valuables with dynamite; the others kill the fireman and force the engineer to halt the train and disconnect the locomotive. The bandits then force the passengers off the train and rifle them for their belongings. One passenger tries to escape but is instantly shot down. Carrying their loot, the bandits escape in the locomotive, later stopping in a valley where their horses had been left.\\r\\nMeanwhile, back in the telegraph office, the bound operator awakens, but he collapses again. 
His daughter arrives bringing him his meal and cuts him free, and restores him to consciousness by dousing him with water.\\r\\nThere is some comic relief at a dance hall, where an Eastern stranger is forced to dance while the locals fire at his feet. The door suddenly opens and the telegraph operator rushes in to tell them of the robbery. The men quickly form a posse, which overtakes the bandits, and in a final shootout kills them all and recovers the stolen mail. 1904 The Suburbanite American Wallace McCutcheon comedy https://en.wikipedia.org/wiki/The_Suburbanite The film is about a family who move to the suburbs, hoping for a quiet life. Things start to go wrong, and the wife gets violent and starts throwing crockery, leading to her arrest. 1905 The Little Train Robbery American Edwin Stanton Porter unknown https://en.wikipedia.org/wiki/The_Little_Train_Robbery The opening scene shows the interior of the robbers' den. The walls are decorated with the portraits of notorious criminals and pictures illustrating the exploits of famous bandits. Some of the gang are lounging about, while others are reading novels and illustrated papers. Although of youthful appearance, each is dressed like a typical Western desperado. The \"Bandit Queen,\" leading a blindfolded new recruit, now enters the room. He is led to the center of the room, raises his right hand and is solemnly sworn in. When the bandage is removed from his eyes he finds himself looking into the muzzles of a dozen or more 45's. The gang then congratulates the new member and heartily shake his hand. The \"Bandit Queen\" who is evidently the leader of the gang, now calls for volunteers to hold up a train. All respond, but she picks out seven for the job who immediately leave the cabin.\\r\\nThe next scene shows the gang breaking into a barn. They steal ponies and ride away. 
Upon reaching the place agreed upon they picket their ponies and leaving them in charge of a trusted member proceed to a wild mountain spot in a bend of the railroad, where the road runs over a steep embankment. The spot is an ideal one for holding up a train. Cross ties are now placed on the railroad track and the gang hide in some bushes close by and wait for the train. The train soon approaches and is brought to a stop. The engineer leaves his engine and proceeds to remove the obstruction on the track. While he is bending over one of the gang sneaks up behind them and hits him on the head with an axe, and knocks him senseless down the embankment, while the gang surround the train and hold up the passengers. After securing all the \"valuables,\" consisting principally of candy and dolls, the robbers uncouple the engine and one car and make their escape just in time to avoid a posse of police who appear on the scene. Further up the road they abandon the engine and car, take to the woods and soon reach their ponies.\\r\\nIn the meantime the police have learned the particulars of the hold-up from the frightened passengers and have started up the railroad tracks after the fleeing robbers. The robbers are next seen riding up the bed of a shallow stream and finally reach their den, where the remainder of the gang have been waiting for them. Believing they have successfully eluded their pursuers, they proceed to divide the \"plunder.\" The police, however, have struck the right trail and are in close pursuit. While the \"plunder\" is being divided a sentry gives the alarm and the entire gang, abandoning everything, rush from the cabin barely in time to escape capture. The police make a hurried search and again start in pursuit. The robbers are so hard pressed that they are unable to reach their ponies, and are obliged to take chances on foot. 
The police now get in sight of the fleeing robbers and a lively chase follows through tall weeds, over a bridge and up a steep hill. Reaching a pond the police are close on their heels. The foremost robbers jump in clothes and all and strike out for the opposite bank. Two hesitate and are captured. Boats are secured and after an exciting tussle the entire gang is rounded up. In the mix up one of the police is dragged overboard. The final scene shows the entire gang of bedraggled and crestfallen robbers tied together with a rope and being led away by the police. Two of the police are loaded down with revolvers, knives and cartridge belts, and resemble walking aresenals. As a fitting climax a confederate steals out of the woods, cuts the rope and gallantly rescues the \"Bandit Queen.\" 1905 The Night Before Christmas American Edwin Stanton Porter unknown https://en.wikipedia.org/wiki/The_Night_Before_Christmas_(1905_film) Scenes are introduced using lines of the poem.[2] Santa Claus, played by Harry Eytinge, is shown feeding real reindeer[4] and finishes his work in the workshop. Meanwhile, the children of a city household hang their stockings and go to bed, but unable to sleep they engage in a pillow fight. Santa Claus leaves his home on a sleigh with his reindeer. He enters the children's house through the chimney, and leaves the presents. The children come down the stairs and enjoy their presents. 1906 Dream of a Rarebit Fiend American Wallace McCutcheon and Edwin S. Porter short https://en.wikipedia.org/wiki/Dream_of_a_Rarebit_Fiend_(1906_film) The Rarebit Fiend gorges on Welsh rarebit at a restaurant. When he leaves, he begins to get dizzy as he starts to hallucinate. He desperately tries to hang onto a lamppost as the world spins all around him. A man helps him get home. He falls into bed and begins having more hallucinatory dreams. During a dream sequence, the furniture begins moving around the room. 
Imps emerge from a floating Welsh rarebit container and begin poking his head as he sleeps. His bed then begins dancing and spinning wildly around the room before flying out the window with the Fiend in it. The bed floats across the city as the Fiend floats up and off the bed. He hangs off the back and eventually gets caught on a weathervane atop a steeple. His bedclothes tear and he falls from the sky, crashing through his bedroom ceiling. The Fiend awakens from the dream after falling out of his bed. 1906 From Leadville to Aspen: A Hold-Up in the Rockies American Francis J. Marion and Wallace McCutcheon short action/crime western https://en.wikipedia.org/wiki/From_Leadville_to_Aspen:_A_Hold-Up_in_the_Rockies The film features a train traveling through the Rockies and a hold up created by two thugs placing logs on the line. They systematically rob the wealthy occupants at gunpoint and then make their getaway along the tracks and later by a hi-jacked horse and cart. 1906 Kathleen Mavourneen American Edwin S. Porter short film https://en.wikipedia.org/wiki/Kathleen_Mavourneen_(1906_film) Irish villager Kathleen is a tenant of Captain Clearfield, who controls local judges and criminals. Her father owes Clearfield a large debt. Terence O'More saves the village from Clearfield, causing a large celebration.\\r\\nFilm historian Charles Musser writes of Porter's adaptation, \"O'More not only rescues Kathleen from the villain but, through marriage, renews the family for another generation.\"[1] 1907 Daniel Boone American Wallace McCutcheon and Ediwin S. Porter William Craven, Florence Lawrence biographical https://en.wikipedia.org/wiki/Daniel_Boone_(1907_film) Boone's daughter befriends an Indian maiden as Boone and his companion start out on a hunting expedition. While he is away, Boone's cabin is attacked by the Indians, who set it on fire and abduct Boone's daughter. Boone returns, swears vengeance, then heads out on the trail to the Indian camp. 
His daughter escapes but is chased. The Indians encounter Boone, which sets off a huge fight on the edge of a cliff. A burning arrow gets shot into the Indian camp. Boone gets tied to the stake and tortured. The burning arrow sets the Indian camp on fire, causing panic. Boone is rescued by his horse, and Boone has a knife fight in which he kills the Indian chief.[2] 1907 How Brown Saw the Baseball Game American Unknown Unknown comedy https://en.wikipedia.org/wiki/How_Brown_Saw_the_Baseball_Game Before heading out to a baseball game at a nearby ballpark, sports fan Mr. Brown drinks several highball cocktails. He arrives at the ballpark to watch the game, but has become so inebriated that the game appears to him in reverse, with the players running the bases backwards and the baseball flying back into the pitcher's hand. After the game is over, Mr. Brown is escorted home by one of his friends. When they arrive at Brown's house, they encounter his wife who becomes furious with the friend and proceeds to physically assault him, believing he is responsible for her husband's severe intoxication.[1] 1907 Laughing Gas American Edwin Stanton Porter Bertha Regustus, Edward Boulden comedy https://en.wikipedia.org/wiki/Laughing_Gas_(film)#1907_Film The plot is that of a black woman going to the dentist for a toothache and being given laughing gas. On her way walking home, and in other situations, she can't stop laughing, and everyone she meets \"catches\" the laughter from her, including a vendor and police officers. 1908 The Adventures of Dollie American D. W. Griffith Arthur V. Johnson, Linda Arvidson drama https://en.wikipedia.org/wiki/The_Adventures_of_Dollie On a beautiful summer day a father and mother take their daughter Dollie on an outing to the river. The mother refuses to buy a gypsy's wares. The gypsy tries to rob the mother, but the father drives him off. The gypsy returns to the camp and devises a plan. 
They return and kidnap Dollie while her parents are distracted. A rescue crew is organized, but the gypsy takes Dollie to his camp. They gag Dollie and hide her in a barrel before the rescue party gets to the camp. Once they leave the gypsies and escapes in their wagon. As the wagon crosses the river, the barrel falls into the water. Still sealed in the barrel, Dollie is swept downstream in dangerous currents. A boy who is fishing in the river finds the barrel, and Dollie is reunited safely with her parents. 1908 The Black Viper American D. W. Griffith D. W. Griffith drama https://en.wikipedia.org/wiki/The_Black_Viper A thug accosts a girl as she leaves her workplace but a man rescues her. The thug vows revenge and, with the help of two friends, attacks the girl and her rescuer again as they're going for a walk. This time they succeed in kidnapping the rescuer. He is bound and gagged and taken away in a cart. The girl runs home and gets help from several neighbors. They track the ruffians down to a cabin in the mountains where the gang has trapped their victim and set the cabin on fire. A thug and Rescuer fight on the roof of the house. 1908 A Calamitous Elopement American D.W. Griffith Harry Solter, Linda Arvidson comedy https://en.wikipedia.org/wiki/A_Calamitous_Elopement A young couple decides to elope after being caught in the midst of a romantic moment by the woman's angry father. They make plans to leave, but a thief discovers their plans and hides in their trunk and waits for the right moment to steal their belongings. 1908 The Call of the Wild American D. W. Griffith Charles Inslee adventure https://en.wikipedia.org/wiki/The_Call_of_the_Wild_(1908_film) A white girl (Florence Lawrence) rejects a proposal from an Indian brave (Charles Inslee) in this early one-reel Western melodrama. Despite the rejection, the Indian still comes to the girl's defense when she is abducted by his warring tribe. 
In her first year in films, Florence Lawrence was already the most popular among the Biograph Company's anonymous stock company players. By 1909, she was known the world over as \"The Biograph Girl.\" 1908 A Christmas Carol American Unknown Tom Ricketts drama https://en.wikipedia.org/wiki/A_Christmas_Carol_(1908_film) No prints of the first American film adaptation of A Christmas Carol are known to exist,[1] but The Moving Picture World magazine provided a scene-by-scene description before the film's release.[2] Scrooge goes into his office and begins working. His nephew, along with three women who wish for Scrooge to donate enter. However, Scrooge dismisses them. On the night of Christmas Eve, his long-dead partner Jacob Marley comes as a ghost, warning him of a horrible fate if he does not change his ways. Scrooge meets three spirits that show Scrooge the real meaning of Christmas, along with his grave, the result of his parsimonious ways. The next morning, he wakes and realizes the error of his ways. Scrooge was then euphoric and generous for the rest of his life. 1908 The Fight for Freedom American D. W. Griffith Florence Auer, John G. Adolfi western https://en.wikipedia.org/wiki/The_Fight_for_Freedom The film opens in a town on the Mexican border. A poker game is going on in the local saloon. One of the players cheats and is shot dead by another of the players, a Mexican named Pedro. In the uproar that follows Pedro is wounded as he escapes from the saloon. The sheriff is called, who tracks Pedro to his home but Pedro kills the sherriff too. While Pedro hides, his wife Juanita, is arrested on suspicion of murdering the sheriff. Pedro rescues her from the town jail and the two head for the Mexican border. Caught by the posse before they reach the border, Juanita is killed and the film ends with Pedro being arrested and taken back to town.", "metadata": { - "text_as_html": "
    Release YearTitleOrigin/EthnicityDirectorCastGenreWiki PagePlot
    1901Kansas Saloon SmashersAmericanUnknownunknownhttps://en.wikipedia.org/wiki/Kansas_Saloon_SmashersA bartender is working at a saloon, serving drinks to customers. After he fills a stereotypically Irish man's bucket with beer, Carrie Nation and her followers burst inside. They assault the Irish man, pulling his hat over his eyes and then dumping the beer over his head. The group then begin wrecking the bar, smashing the fixtures, mirrors, and breaking the cash register. The bartender then sprays seltzer water in Nation's face before a group of policemen appear and order everybody to leave.[1]
    1901Love by the Light of the MoonAmericanUnknownunknownhttps://en.wikipedia.org/wiki/Love_by_the_Light_of_the_MoonThe moon, painted with a smiling face hangs over a park at night. A young couple walking past a fence learn on a railing and look up. The moon smiles. They embrace, and the moon's smile gets bigger. They then sit down on a bench by a tree. The moon's view is blocked, causing him to frown. In the last scene, the man fans the woman with his hat because the moon has left the sky and is perched over her shoulder to see everything better.
    1901The Martyred PresidentsAmericanUnknownunknownhttps://en.wikipedia.org/wiki/The_Martyred_PresidentsThe film, just over a minute long, is composed of two shots. In the first, a girl sits at the base of an altar or tomb, her face hidden from the camera. At the center of the altar, a viewing portal displays the portraits of three U.S. Presidents\u2014Abraham Lincoln, James A. Garfield, and William McKinley\u2014each victims of assassination.\\r\\nIn the second shot, which runs just over eight seconds long, an assassin kneels feet of Lady Justice.
    1901Terrible Teddy, the Grizzly KingAmericanUnknownunknownhttps://en.wikipedia.org/wiki/Terrible_Teddy,_the_Grizzly_KingLasting just 61 seconds and consisting of two shots, the first shot is set in a wood during winter. The actor representing then vice-president Theodore Roosevelt enthusiastically hurries down a hillside towards a tree in the foreground. He falls once, but rights himself and cocks his rifle. Two other men, bearing signs reading \"His Photographer\" and \"His Press Agent\" respectively, follow him into the shot; the photographer sets up his camera. \"Teddy\" aims his rifle upward at the tree and fells what appears to be a common house cat, which he then proceeds to stab. \"Teddy\" holds his prize aloft, and the press agent takes notes. The second shot is taken in a slightly different part of the wood, on a path. \"Teddy\" rides the path on his horse towards the camera and out to the left of the shot, followed closely by the press agent and photographer, still dutifully holding their signs.
    1902Jack and the BeanstalkAmericanGeorge S. Fleming, Edwin S. Porterunknownhttps://en.wikipedia.org/wiki/Jack_and_the_Beanstalk_(1902_film)The earliest known adaptation of the classic fairytale, this films shows Jack trading his cow for the beans, his mother forcing him to drop them in the front yard, and beig forced upstairs. As he sleeps, Jack is visited by a fairy who shows him glimpses of what will await him when he ascends the bean stalk. In this version, Jack is the son of a deposed king. When Jack wakes up, he finds the beanstalk has grown and he climbs to the top where he enters the giant's home. The giant finds Jack, who narrowly escapes. The giant chases Jack down the bean stalk, but Jack is able to cut it down before the giant can get to safety. He falls and is killed as Jack celebrates. The fairy then reveals that Jack may return home as a prince.
    1903Alice in WonderlandAmericanCecil HepworthMay Clarkunknownhttps://en.wikipedia.org/wiki/Alice_in_Wonderland_(1903_film)Alice follows a large white rabbit down a \"Rabbit-hole\". She finds a tiny door. When she finds a bottle labeled \"Drink me\", she does, and shrinks, but not enough to pass through the door. She then eats something labeled \"Eat me\" and grows larger. She finds a fan when enables her to shrink enough to get into the \"Garden\" and try to get a \"Dog\" to play with her. She enters the \"White Rabbit's tiny House,\" but suddenly resumes her normal size. In order to get out, she has to use the \"magic fan.\"\\r\\nShe enters a kitchen, in which there is a cook and a woman holding a baby. She persuades the woman to give her the child and takes the infant outside after the cook starts throwing things around. The baby then turns into a pig and squirms out of her grip. \"The Duchess's Cheshire Cat\" appears and disappears a couple of times to Alice and directs her to the Mad Hatter's \"Mad Tea-Party.\" After a while, she leaves.\\r\\nThe Queen invites Alice to join the \"ROYAL PROCESSION\": a parade of marching playing cards and others headed by the White Rabbit. When Alice \"unintentionally offends the Queen\", the latter summons the \"Executioner\". Alice \"boxes the ears\", then flees when all the playing cards come for her. Then she wakes up and realizes it was all a dream.
    1903The Great Train RobberyAmericanEdwin S. Porterwesternhttps://en.wikipedia.org/wiki/The_Great_Train_Robbery_(1903_film)The film opens with two bandits breaking into a railroad telegraph office, where they force the operator at gunpoint to have a train stopped and to transmit orders for the engineer to fill the locomotive's tender at the station's water tank. They then knock the operator out and tie him up. As the train stops it is boarded by the bandits\u200d\u2014\u200cnow four. Two bandits enter an express car, kill a messenger and open a box of valuables with dynamite; the others kill the fireman and force the engineer to halt the train and disconnect the locomotive. The bandits then force the passengers off the train and rifle them for their belongings. One passenger tries to escape but is instantly shot down. Carrying their loot, the bandits escape in the locomotive, later stopping in a valley where their horses had been left.\\r\\nMeanwhile, back in the telegraph office, the bound operator awakens, but he collapses again. His daughter arrives bringing him his meal and cuts him free, and restores him to consciousness by dousing him with water.\\r\\nThere is some comic relief at a dance hall, where an Eastern stranger is forced to dance while the locals fire at his feet. The door suddenly opens and the telegraph operator rushes in to tell them of the robbery. The men quickly form a posse, which overtakes the bandits, and in a final shootout kills them all and recovers the stolen mail.
    1904The SuburbaniteAmericanWallace McCutcheoncomedyhttps://en.wikipedia.org/wiki/The_SuburbaniteThe film is about a family who move to the suburbs, hoping for a quiet life. Things start to go wrong, and the wife gets violent and starts throwing crockery, leading to her arrest.
    1905The Little Train RobberyAmericanEdwin Stanton Porterunknownhttps://en.wikipedia.org/wiki/The_Little_Train_RobberyThe opening scene shows the interior of the robbers' den. The walls are decorated with the portraits of notorious criminals and pictures illustrating the exploits of famous bandits. Some of the gang are lounging about, while others are reading novels and illustrated papers. Although of youthful appearance, each is dressed like a typical Western desperado. The \"Bandit Queen,\" leading a blindfolded new recruit, now enters the room. He is led to the center of the room, raises his right hand and is solemnly sworn in. When the bandage is removed from his eyes he finds himself looking into the muzzles of a dozen or more 45's. The gang then congratulates the new member and heartily shake his hand. The \"Bandit Queen\" who is evidently the leader of the gang, now calls for volunteers to hold up a train. All respond, but she picks out seven for the job who immediately leave the cabin.\\r\\nThe next scene shows the gang breaking into a barn. They steal ponies and ride away. Upon reaching the place agreed upon they picket their ponies and leaving them in charge of a trusted member proceed to a wild mountain spot in a bend of the railroad, where the road runs over a steep embankment. The spot is an ideal one for holding up a train. Cross ties are now placed on the railroad track and the gang hide in some bushes close by and wait for the train. The train soon approaches and is brought to a stop. The engineer leaves his engine and proceeds to remove the obstruction on the track. While he is bending over one of the gang sneaks up behind them and hits him on the head with an axe, and knocks him senseless down the embankment, while the gang surround the train and hold up the passengers. 
After securing all the \"valuables,\" consisting principally of candy and dolls, the robbers uncouple the engine and one car and make their escape just in time to avoid a posse of police who appear on the scene. Further up the road they abandon the engine and car, take to the woods and soon reach their ponies.\\r\\nIn the meantime the police have learned the particulars of the hold-up from the frightened passengers and have started up the railroad tracks after the fleeing robbers. The robbers are next seen riding up the bed of a shallow stream and finally reach their den, where the remainder of the gang have been waiting for them. Believing they have successfully eluded their pursuers, they proceed to divide the \"plunder.\" The police, however, have struck the right trail and are in close pursuit. While the \"plunder\" is being divided a sentry gives the alarm and the entire gang, abandoning everything, rush from the cabin barely in time to escape capture. The police make a hurried search and again start in pursuit. The robbers are so hard pressed that they are unable to reach their ponies, and are obliged to take chances on foot. The police now get in sight of the fleeing robbers and a lively chase follows through tall weeds, over a bridge and up a steep hill. Reaching a pond the police are close on their heels. The foremost robbers jump in clothes and all and strike out for the opposite bank. Two hesitate and are captured. Boats are secured and after an exciting tussle the entire gang is rounded up. In the mix up one of the police is dragged overboard. The final scene shows the entire gang of bedraggled and crestfallen robbers tied together with a rope and being led away by the police. Two of the police are loaded down with revolvers, knives and cartridge belts, and resemble walking aresenals. As a fitting climax a confederate steals out of the woods, cuts the rope and gallantly rescues the \"Bandit Queen.\"
    1905The Night Before ChristmasAmericanEdwin Stanton Porterunknownhttps://en.wikipedia.org/wiki/The_Night_Before_Christmas_(1905_film)Scenes are introduced using lines of the poem.[2] Santa Claus, played by Harry Eytinge, is shown feeding real reindeer[4] and finishes his work in the workshop. Meanwhile, the children of a city household hang their stockings and go to bed, but unable to sleep they engage in a pillow fight. Santa Claus leaves his home on a sleigh with his reindeer. He enters the children's house through the chimney, and leaves the presents. The children come down the stairs and enjoy their presents.
    1906Dream of a Rarebit FiendAmericanWallace McCutcheon and Edwin S. Portershorthttps://en.wikipedia.org/wiki/Dream_of_a_Rarebit_Fiend_(1906_film)The Rarebit Fiend gorges on Welsh rarebit at a restaurant. When he leaves, he begins to get dizzy as he starts to hallucinate. He desperately tries to hang onto a lamppost as the world spins all around him. A man helps him get home. He falls into bed and begins having more hallucinatory dreams. During a dream sequence, the furniture begins moving around the room. Imps emerge from a floating Welsh rarebit container and begin poking his head as he sleeps. His bed then begins dancing and spinning wildly around the room before flying out the window with the Fiend in it. The bed floats across the city as the Fiend floats up and off the bed. He hangs off the back and eventually gets caught on a weathervane atop a steeple. His bedclothes tear and he falls from the sky, crashing through his bedroom ceiling. The Fiend awakens from the dream after falling out of his bed.
    1906From Leadville to Aspen: A Hold-Up in the RockiesAmericanFrancis J. Marion and Wallace McCutcheonshort action/crime westernhttps://en.wikipedia.org/wiki/From_Leadville_to_Aspen:_A_Hold-Up_in_the_RockiesThe film features a train traveling through the Rockies and a hold up created by two thugs placing logs on the line. They systematically rob the wealthy occupants at gunpoint and then make their getaway along the tracks and later by a hi-jacked horse and cart.
    1906Kathleen MavourneenAmericanEdwin S. Portershort filmhttps://en.wikipedia.org/wiki/Kathleen_Mavourneen_(1906_film)Irish villager Kathleen is a tenant of Captain Clearfield, who controls local judges and criminals. Her father owes Clearfield a large debt. Terence O'More saves the village from Clearfield, causing a large celebration.\\r\\nFilm historian Charles Musser writes of Porter's adaptation, \"O'More not only rescues Kathleen from the villain but, through marriage, renews the family for another generation.\"[1]
    1907Daniel BooneAmericanWallace McCutcheon and Ediwin S. PorterWilliam Craven, Florence Lawrencebiographicalhttps://en.wikipedia.org/wiki/Daniel_Boone_(1907_film)Boone's daughter befriends an Indian maiden as Boone and his companion start out on a hunting expedition. While he is away, Boone's cabin is attacked by the Indians, who set it on fire and abduct Boone's daughter. Boone returns, swears vengeance, then heads out on the trail to the Indian camp. His daughter escapes but is chased. The Indians encounter Boone, which sets off a huge fight on the edge of a cliff. A burning arrow gets shot into the Indian camp. Boone gets tied to the stake and tortured. The burning arrow sets the Indian camp on fire, causing panic. Boone is rescued by his horse, and Boone has a knife fight in which he kills the Indian chief.[2]
    1907How Brown Saw the Baseball GameAmericanUnknownUnknowncomedyhttps://en.wikipedia.org/wiki/How_Brown_Saw_the_Baseball_GameBefore heading out to a baseball game at a nearby ballpark, sports fan Mr. Brown drinks several highball cocktails. He arrives at the ballpark to watch the game, but has become so inebriated that the game appears to him in reverse, with the players running the bases backwards and the baseball flying back into the pitcher's hand. After the game is over, Mr. Brown is escorted home by one of his friends. When they arrive at Brown's house, they encounter his wife who becomes furious with the friend and proceeds to physically assault him, believing he is responsible for her husband's severe intoxication.[1]
    1907Laughing GasAmericanEdwin Stanton PorterBertha Regustus, Edward Bouldencomedyhttps://en.wikipedia.org/wiki/Laughing_Gas_(film)#1907_FilmThe plot is that of a black woman going to the dentist for a toothache and being given laughing gas. On her way walking home, and in other situations, she can't stop laughing, and everyone she meets \"catches\" the laughter from her, including a vendor and police officers.
    1908The Adventures of DollieAmericanD. W. GriffithArthur V. Johnson, Linda Arvidsondramahttps://en.wikipedia.org/wiki/The_Adventures_of_DollieOn a beautiful summer day a father and mother take their daughter Dollie on an outing to the river. The mother refuses to buy a gypsy's wares. The gypsy tries to rob the mother, but the father drives him off. The gypsy returns to the camp and devises a plan. They return and kidnap Dollie while her parents are distracted. A rescue crew is organized, but the gypsy takes Dollie to his camp. They gag Dollie and hide her in a barrel before the rescue party gets to the camp. Once they leave the gypsies and escapes in their wagon. As the wagon crosses the river, the barrel falls into the water. Still sealed in the barrel, Dollie is swept downstream in dangerous currents. A boy who is fishing in the river finds the barrel, and Dollie is reunited safely with her parents.
    1908The Black ViperAmericanD. W. GriffithD. W. Griffithdramahttps://en.wikipedia.org/wiki/The_Black_ViperA thug accosts a girl as she leaves her workplace but a man rescues her. The thug vows revenge and, with the help of two friends, attacks the girl and her rescuer again as they're going for a walk. This time they succeed in kidnapping the rescuer. He is bound and gagged and taken away in a cart. The girl runs home and gets help from several neighbors. They track the ruffians down to a cabin in the mountains where the gang has trapped their victim and set the cabin on fire. A thug and Rescuer fight on the roof of the house.
    1908A Calamitous ElopementAmericanD.W. GriffithHarry Solter, Linda Arvidsoncomedyhttps://en.wikipedia.org/wiki/A_Calamitous_ElopementA young couple decides to elope after being caught in the midst of a romantic moment by the woman's angry father. They make plans to leave, but a thief discovers their plans and hides in their trunk and waits for the right moment to steal their belongings.
    1908The Call of the WildAmericanD. W. GriffithCharles Insleeadventurehttps://en.wikipedia.org/wiki/The_Call_of_the_Wild_(1908_film)A white girl (Florence Lawrence) rejects a proposal from an Indian brave (Charles Inslee) in this early one-reel Western melodrama. Despite the rejection, the Indian still comes to the girl's defense when she is abducted by his warring tribe. In her first year in films, Florence Lawrence was already the most popular among the Biograph Company's anonymous stock company players. By 1909, she was known the world over as \"The Biograph Girl.\"
    1908A Christmas CarolAmericanUnknownTom Rickettsdramahttps://en.wikipedia.org/wiki/A_Christmas_Carol_(1908_film)No prints of the first American film adaptation of A Christmas Carol are known to exist,[1] but The Moving Picture World magazine provided a scene-by-scene description before the film's release.[2] Scrooge goes into his office and begins working. His nephew, along with three women who wish for Scrooge to donate enter. However, Scrooge dismisses them. On the night of Christmas Eve, his long-dead partner Jacob Marley comes as a ghost, warning him of a horrible fate if he does not change his ways. Scrooge meets three spirits that show Scrooge the real meaning of Christmas, along with his grave, the result of his parsimonious ways. The next morning, he wakes and realizes the error of his ways. Scrooge was then euphoric and generous for the rest of his life.
    1908The Fight for FreedomAmericanD. W. GriffithFlorence Auer, John G. Adolfiwesternhttps://en.wikipedia.org/wiki/The_Fight_for_FreedomThe film opens in a town on the Mexican border. A poker game is going on in the local saloon. One of the players cheats and is shot dead by another of the players, a Mexican named Pedro. In the uproar that follows Pedro is wounded as he escapes from the saloon. The sheriff is called, who tracks Pedro to his home but Pedro kills the sherriff too. While Pedro hides, his wife Juanita, is arrested on suspicion of murdering the sheriff. Pedro rescues her from the town jail and the two head for the Mexican border. Caught by the posse before they reach the border, Juanita is killed and the film ends with Pedro being arrested and taken back to town.
    ", + "text_as_html": "
    Release YearTitleOrigin/EthnicityDirectorCastGenreWiki PagePlot
    1901Kansas Saloon SmashersAmericanUnknownunknownhttps://en.wikipedia.org/wiki/Kansas_Saloon_SmashersA bartender is working at a saloon, serving drinks to customers. After he fills a stereotypically Irish man's bucket with beer, Carrie Nation and her followers burst inside. They assault the Irish man, pulling his hat over his eyes and then dumping the beer over his head. The group then begin wrecking the bar, smashing the fixtures, mirrors, and breaking the cash register. The bartender then sprays seltzer water in Nation's face before a group of policemen appear and order everybody to leave.[1]
    1901Love by the Light of the MoonAmericanUnknownunknownhttps://en.wikipedia.org/wiki/Love_by_the_Light_of_the_MoonThe moon, painted with a smiling face hangs over a park at night. A young couple walking past a fence learn on a railing and look up. The moon smiles. They embrace, and the moon's smile gets bigger. They then sit down on a bench by a tree. The moon's view is blocked, causing him to frown. In the last scene, the man fans the woman with his hat because the moon has left the sky and is perched over her shoulder to see everything better.
    1901The Martyred PresidentsAmericanUnknownunknownhttps://en.wikipedia.org/wiki/The_Martyred_PresidentsThe film, just over a minute long, is composed of two shots. In the first, a girl sits at the base of an altar or tomb, her face hidden from the camera. At the center of the altar, a viewing portal displays the portraits of three U.S. Presidents—Abraham Lincoln, James A. Garfield, and William McKinley—each victims of assassination.\\r\\nIn the second shot, which runs just over eight seconds long, an assassin kneels feet of Lady Justice.
    1901Terrible Teddy, the Grizzly KingAmericanUnknownunknownhttps://en.wikipedia.org/wiki/Terrible_Teddy,_the_Grizzly_KingLasting just 61 seconds and consisting of two shots, the first shot is set in a wood during winter. The actor representing then vice-president Theodore Roosevelt enthusiastically hurries down a hillside towards a tree in the foreground. He falls once, but rights himself and cocks his rifle. Two other men, bearing signs reading \"His Photographer\" and \"His Press Agent\" respectively, follow him into the shot; the photographer sets up his camera. \"Teddy\" aims his rifle upward at the tree and fells what appears to be a common house cat, which he then proceeds to stab. \"Teddy\" holds his prize aloft, and the press agent takes notes. The second shot is taken in a slightly different part of the wood, on a path. \"Teddy\" rides the path on his horse towards the camera and out to the left of the shot, followed closely by the press agent and photographer, still dutifully holding their signs.
    1902Jack and the BeanstalkAmericanGeorge S. Fleming, Edwin S. Porterunknownhttps://en.wikipedia.org/wiki/Jack_and_the_Beanstalk_(1902_film)The earliest known adaptation of the classic fairytale, this films shows Jack trading his cow for the beans, his mother forcing him to drop them in the front yard, and beig forced upstairs. As he sleeps, Jack is visited by a fairy who shows him glimpses of what will await him when he ascends the bean stalk. In this version, Jack is the son of a deposed king. When Jack wakes up, he finds the beanstalk has grown and he climbs to the top where he enters the giant's home. The giant finds Jack, who narrowly escapes. The giant chases Jack down the bean stalk, but Jack is able to cut it down before the giant can get to safety. He falls and is killed as Jack celebrates. The fairy then reveals that Jack may return home as a prince.
    1903Alice in WonderlandAmericanCecil HepworthMay Clarkunknownhttps://en.wikipedia.org/wiki/Alice_in_Wonderland_(1903_film)Alice follows a large white rabbit down a \"Rabbit-hole\". She finds a tiny door. When she finds a bottle labeled \"Drink me\", she does, and shrinks, but not enough to pass through the door. She then eats something labeled \"Eat me\" and grows larger. She finds a fan when enables her to shrink enough to get into the \"Garden\" and try to get a \"Dog\" to play with her. She enters the \"White Rabbit's tiny House,\" but suddenly resumes her normal size. In order to get out, she has to use the \"magic fan.\"\\r\\nShe enters a kitchen, in which there is a cook and a woman holding a baby. She persuades the woman to give her the child and takes the infant outside after the cook starts throwing things around. The baby then turns into a pig and squirms out of her grip. \"The Duchess's Cheshire Cat\" appears and disappears a couple of times to Alice and directs her to the Mad Hatter's \"Mad Tea-Party.\" After a while, she leaves.\\r\\nThe Queen invites Alice to join the \"ROYAL PROCESSION\": a parade of marching playing cards and others headed by the White Rabbit. When Alice \"unintentionally offends the Queen\", the latter summons the \"Executioner\". Alice \"boxes the ears\", then flees when all the playing cards come for her. Then she wakes up and realizes it was all a dream.
    1903The Great Train RobberyAmericanEdwin S. Porterwesternhttps://en.wikipedia.org/wiki/The_Great_Train_Robbery_(1903_film)The film opens with two bandits breaking into a railroad telegraph office, where they force the operator at gunpoint to have a train stopped and to transmit orders for the engineer to fill the locomotive's tender at the station's water tank. They then knock the operator out and tie him up. As the train stops it is boarded by the banditsâ€â€”‌now four. Two bandits enter an express car, kill a messenger and open a box of valuables with dynamite; the others kill the fireman and force the engineer to halt the train and disconnect the locomotive. The bandits then force the passengers off the train and rifle them for their belongings. One passenger tries to escape but is instantly shot down. Carrying their loot, the bandits escape in the locomotive, later stopping in a valley where their horses had been left.\\r\\nMeanwhile, back in the telegraph office, the bound operator awakens, but he collapses again. His daughter arrives bringing him his meal and cuts him free, and restores him to consciousness by dousing him with water.\\r\\nThere is some comic relief at a dance hall, where an Eastern stranger is forced to dance while the locals fire at his feet. The door suddenly opens and the telegraph operator rushes in to tell them of the robbery. The men quickly form a posse, which overtakes the bandits, and in a final shootout kills them all and recovers the stolen mail.
    1904The SuburbaniteAmericanWallace McCutcheoncomedyhttps://en.wikipedia.org/wiki/The_SuburbaniteThe film is about a family who move to the suburbs, hoping for a quiet life. Things start to go wrong, and the wife gets violent and starts throwing crockery, leading to her arrest.
    1905The Little Train RobberyAmericanEdwin Stanton Porterunknownhttps://en.wikipedia.org/wiki/The_Little_Train_RobberyThe opening scene shows the interior of the robbers' den. The walls are decorated with the portraits of notorious criminals and pictures illustrating the exploits of famous bandits. Some of the gang are lounging about, while others are reading novels and illustrated papers. Although of youthful appearance, each is dressed like a typical Western desperado. The \"Bandit Queen,\" leading a blindfolded new recruit, now enters the room. He is led to the center of the room, raises his right hand and is solemnly sworn in. When the bandage is removed from his eyes he finds himself looking into the muzzles of a dozen or more 45's. The gang then congratulates the new member and heartily shake his hand. The \"Bandit Queen\" who is evidently the leader of the gang, now calls for volunteers to hold up a train. All respond, but she picks out seven for the job who immediately leave the cabin.\\r\\nThe next scene shows the gang breaking into a barn. They steal ponies and ride away. Upon reaching the place agreed upon they picket their ponies and leaving them in charge of a trusted member proceed to a wild mountain spot in a bend of the railroad, where the road runs over a steep embankment. The spot is an ideal one for holding up a train. Cross ties are now placed on the railroad track and the gang hide in some bushes close by and wait for the train. The train soon approaches and is brought to a stop. The engineer leaves his engine and proceeds to remove the obstruction on the track. While he is bending over one of the gang sneaks up behind them and hits him on the head with an axe, and knocks him senseless down the embankment, while the gang surround the train and hold up the passengers. 
After securing all the \"valuables,\" consisting principally of candy and dolls, the robbers uncouple the engine and one car and make their escape just in time to avoid a posse of police who appear on the scene. Further up the road they abandon the engine and car, take to the woods and soon reach their ponies.\\r\\nIn the meantime the police have learned the particulars of the hold-up from the frightened passengers and have started up the railroad tracks after the fleeing robbers. The robbers are next seen riding up the bed of a shallow stream and finally reach their den, where the remainder of the gang have been waiting for them. Believing they have successfully eluded their pursuers, they proceed to divide the \"plunder.\" The police, however, have struck the right trail and are in close pursuit. While the \"plunder\" is being divided a sentry gives the alarm and the entire gang, abandoning everything, rush from the cabin barely in time to escape capture. The police make a hurried search and again start in pursuit. The robbers are so hard pressed that they are unable to reach their ponies, and are obliged to take chances on foot. The police now get in sight of the fleeing robbers and a lively chase follows through tall weeds, over a bridge and up a steep hill. Reaching a pond the police are close on their heels. The foremost robbers jump in clothes and all and strike out for the opposite bank. Two hesitate and are captured. Boats are secured and after an exciting tussle the entire gang is rounded up. In the mix up one of the police is dragged overboard. The final scene shows the entire gang of bedraggled and crestfallen robbers tied together with a rope and being led away by the police. Two of the police are loaded down with revolvers, knives and cartridge belts, and resemble walking aresenals. As a fitting climax a confederate steals out of the woods, cuts the rope and gallantly rescues the \"Bandit Queen.\"
    1905The Night Before ChristmasAmericanEdwin Stanton Porterunknownhttps://en.wikipedia.org/wiki/The_Night_Before_Christmas_(1905_film)Scenes are introduced using lines of the poem.[2] Santa Claus, played by Harry Eytinge, is shown feeding real reindeer[4] and finishes his work in the workshop. Meanwhile, the children of a city household hang their stockings and go to bed, but unable to sleep they engage in a pillow fight. Santa Claus leaves his home on a sleigh with his reindeer. He enters the children's house through the chimney, and leaves the presents. The children come down the stairs and enjoy their presents.
    1906Dream of a Rarebit FiendAmericanWallace McCutcheon and Edwin S. Portershorthttps://en.wikipedia.org/wiki/Dream_of_a_Rarebit_Fiend_(1906_film)The Rarebit Fiend gorges on Welsh rarebit at a restaurant. When he leaves, he begins to get dizzy as he starts to hallucinate. He desperately tries to hang onto a lamppost as the world spins all around him. A man helps him get home. He falls into bed and begins having more hallucinatory dreams. During a dream sequence, the furniture begins moving around the room. Imps emerge from a floating Welsh rarebit container and begin poking his head as he sleeps. His bed then begins dancing and spinning wildly around the room before flying out the window with the Fiend in it. The bed floats across the city as the Fiend floats up and off the bed. He hangs off the back and eventually gets caught on a weathervane atop a steeple. His bedclothes tear and he falls from the sky, crashing through his bedroom ceiling. The Fiend awakens from the dream after falling out of his bed.
    1906From Leadville to Aspen: A Hold-Up in the RockiesAmericanFrancis J. Marion and Wallace McCutcheonshort action/crime westernhttps://en.wikipedia.org/wiki/From_Leadville_to_Aspen:_A_Hold-Up_in_the_RockiesThe film features a train traveling through the Rockies and a hold up created by two thugs placing logs on the line. They systematically rob the wealthy occupants at gunpoint and then make their getaway along the tracks and later by a hi-jacked horse and cart.
    1906Kathleen MavourneenAmericanEdwin S. Portershort filmhttps://en.wikipedia.org/wiki/Kathleen_Mavourneen_(1906_film)Irish villager Kathleen is a tenant of Captain Clearfield, who controls local judges and criminals. Her father owes Clearfield a large debt. Terence O'More saves the village from Clearfield, causing a large celebration.\\r\\nFilm historian Charles Musser writes of Porter's adaptation, \"O'More not only rescues Kathleen from the villain but, through marriage, renews the family for another generation.\"[1]
    1907Daniel BooneAmericanWallace McCutcheon and Ediwin S. PorterWilliam Craven, Florence Lawrencebiographicalhttps://en.wikipedia.org/wiki/Daniel_Boone_(1907_film)Boone's daughter befriends an Indian maiden as Boone and his companion start out on a hunting expedition. While he is away, Boone's cabin is attacked by the Indians, who set it on fire and abduct Boone's daughter. Boone returns, swears vengeance, then heads out on the trail to the Indian camp. His daughter escapes but is chased. The Indians encounter Boone, which sets off a huge fight on the edge of a cliff. A burning arrow gets shot into the Indian camp. Boone gets tied to the stake and tortured. The burning arrow sets the Indian camp on fire, causing panic. Boone is rescued by his horse, and Boone has a knife fight in which he kills the Indian chief.[2]
    1907How Brown Saw the Baseball GameAmericanUnknownUnknowncomedyhttps://en.wikipedia.org/wiki/How_Brown_Saw_the_Baseball_GameBefore heading out to a baseball game at a nearby ballpark, sports fan Mr. Brown drinks several highball cocktails. He arrives at the ballpark to watch the game, but has become so inebriated that the game appears to him in reverse, with the players running the bases backwards and the baseball flying back into the pitcher's hand. After the game is over, Mr. Brown is escorted home by one of his friends. When they arrive at Brown's house, they encounter his wife who becomes furious with the friend and proceeds to physically assault him, believing he is responsible for her husband's severe intoxication.[1]
    1907Laughing GasAmericanEdwin Stanton PorterBertha Regustus, Edward Bouldencomedyhttps://en.wikipedia.org/wiki/Laughing_Gas_(film)#1907_FilmThe plot is that of a black woman going to the dentist for a toothache and being given laughing gas. On her way walking home, and in other situations, she can't stop laughing, and everyone she meets \"catches\" the laughter from her, including a vendor and police officers.
    1908The Adventures of DollieAmericanD. W. GriffithArthur V. Johnson, Linda Arvidsondramahttps://en.wikipedia.org/wiki/The_Adventures_of_DollieOn a beautiful summer day a father and mother take their daughter Dollie on an outing to the river. The mother refuses to buy a gypsy's wares. The gypsy tries to rob the mother, but the father drives him off. The gypsy returns to the camp and devises a plan. They return and kidnap Dollie while her parents are distracted. A rescue crew is organized, but the gypsy takes Dollie to his camp. They gag Dollie and hide her in a barrel before the rescue party gets to the camp. Once they leave the gypsies and escapes in their wagon. As the wagon crosses the river, the barrel falls into the water. Still sealed in the barrel, Dollie is swept downstream in dangerous currents. A boy who is fishing in the river finds the barrel, and Dollie is reunited safely with her parents.
    1908The Black ViperAmericanD. W. GriffithD. W. Griffithdramahttps://en.wikipedia.org/wiki/The_Black_ViperA thug accosts a girl as she leaves her workplace but a man rescues her. The thug vows revenge and, with the help of two friends, attacks the girl and her rescuer again as they're going for a walk. This time they succeed in kidnapping the rescuer. He is bound and gagged and taken away in a cart. The girl runs home and gets help from several neighbors. They track the ruffians down to a cabin in the mountains where the gang has trapped their victim and set the cabin on fire. A thug and Rescuer fight on the roof of the house.
    1908A Calamitous ElopementAmericanD.W. GriffithHarry Solter, Linda Arvidsoncomedyhttps://en.wikipedia.org/wiki/A_Calamitous_ElopementA young couple decides to elope after being caught in the midst of a romantic moment by the woman's angry father. They make plans to leave, but a thief discovers their plans and hides in their trunk and waits for the right moment to steal their belongings.
    1908The Call of the WildAmericanD. W. GriffithCharles Insleeadventurehttps://en.wikipedia.org/wiki/The_Call_of_the_Wild_(1908_film)A white girl (Florence Lawrence) rejects a proposal from an Indian brave (Charles Inslee) in this early one-reel Western melodrama. Despite the rejection, the Indian still comes to the girl's defense when she is abducted by his warring tribe. In her first year in films, Florence Lawrence was already the most popular among the Biograph Company's anonymous stock company players. By 1909, she was known the world over as \"The Biograph Girl.\"
    1908A Christmas CarolAmericanUnknownTom Rickettsdramahttps://en.wikipedia.org/wiki/A_Christmas_Carol_(1908_film)No prints of the first American film adaptation of A Christmas Carol are known to exist,[1] but The Moving Picture World magazine provided a scene-by-scene description before the film's release.[2] Scrooge goes into his office and begins working. His nephew, along with three women who wish for Scrooge to donate enter. However, Scrooge dismisses them. On the night of Christmas Eve, his long-dead partner Jacob Marley comes as a ghost, warning him of a horrible fate if he does not change his ways. Scrooge meets three spirits that show Scrooge the real meaning of Christmas, along with his grave, the result of his parsimonious ways. The next morning, he wakes and realizes the error of his ways. Scrooge was then euphoric and generous for the rest of his life.
    1908The Fight for FreedomAmericanD. W. GriffithFlorence Auer, John G. Adolfiwesternhttps://en.wikipedia.org/wiki/The_Fight_for_FreedomThe film opens in a town on the Mexican border. A poker game is going on in the local saloon. One of the players cheats and is shot dead by another of the players, a Mexican named Pedro. In the uproar that follows Pedro is wounded as he escapes from the saloon. The sheriff is called, who tracks Pedro to his home but Pedro kills the sherriff too. While Pedro hides, his wife Juanita, is arrested on suspicion of murdering the sheriff. Pedro rescues her from the town jail and the two head for the Mexican border. Caught by the posse before they reach the border, Juanita is killed and the film ends with Pedro being arrested and taken back to town.
    ", "languages": [ "eng" ], diff --git a/test_unstructured_ingest/src/against-api.sh b/test_unstructured_ingest/src/against-api.sh index 7f2d6a9446..9667e01a1d 100755 --- a/test_unstructured_ingest/src/against-api.sh +++ b/test_unstructured_ingest/src/against-api.sh @@ -37,11 +37,12 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ --chunking-strategy by_page \ --chunk-max-characters 10000 \ --reprocess \ - --output-dir "$OUTPUT_DIR" \ --verbose \ --num-processes "$max_processes" \ --input-path "example-docs/pdf/$TEST_FILE_NAME" \ - --work-dir "$WORK_DIR" + --work-dir "$WORK_DIR" \ + local \ + --output-dir "$OUTPUT_DIR" RESULT_FILE_PATH="$OUTPUT_DIR/$TEST_FILE_NAME.json" # validate that there is at least one table with text_as_html in the results diff --git a/test_unstructured_ingest/src/airtable-diff.sh b/test_unstructured_ingest/src/airtable-diff.sh index 3cd81eff77..3fd3005c29 100755 --- a/test_unstructured_ingest/src/airtable-diff.sh +++ b/test_unstructured_ingest/src/airtable-diff.sh @@ -45,8 +45,9 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ --num-processes "$max_processes" \ --preserve-downloads \ --reprocess \ - --output-dir "$OUTPUT_DIR" \ --work-dir "$WORK_DIR" \ - --verbose + --verbose \ + local \ + --output-dir "$OUTPUT_DIR" "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/src/airtable-large.sh b/test_unstructured_ingest/src/airtable-large.sh index c0bf06fe4e..0e60199591 100755 --- a/test_unstructured_ingest/src/airtable-large.sh +++ b/test_unstructured_ingest/src/airtable-large.sh @@ -48,8 +48,9 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ --num-processes "$max_processes" \ --preserve-downloads \ --reprocess \ - --output-dir "$OUTPUT_DIR" \ - --work-dir "$WORK_DIR" + --work-dir "$WORK_DIR" \ + local \ + --output-dir "$OUTPUT_DIR" # We are expecting fifteen directories: fourteen bases and the parent directory "$SCRIPT_DIR"/check-num-dirs-output.sh 15 "$OUTPUT_FOLDER_NAME" diff --git 
a/test_unstructured_ingest/src/astradb.sh b/test_unstructured_ingest/src/astradb.sh index 1ea211a6bb..1b7843be49 100755 --- a/test_unstructured_ingest/src/astradb.sh +++ b/test_unstructured_ingest/src/astradb.sh @@ -34,8 +34,9 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ --strategy hi_res \ --preserve-downloads \ --reprocess \ - --output-dir "$OUTPUT_DIR" \ --verbose \ - --work-dir "$WORK_DIR" + --work-dir "$WORK_DIR" \ + local \ + --output-dir "$OUTPUT_DIR" "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/src/azure.sh b/test_unstructured_ingest/src/azure.sh index 6744805d6b..9c64353e9a 100755 --- a/test_unstructured_ingest/src/azure.sh +++ b/test_unstructured_ingest/src/azure.sh @@ -30,11 +30,12 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ --strategy hi_res \ --preserve-downloads \ --reprocess \ - --output-dir "$OUTPUT_DIR" \ --verbose \ --account-name azureunstructured1 \ --remote-url abfs://container1/ \ - --work-dir "$WORK_DIR" + --work-dir "$WORK_DIR" \ + local \ + --output-dir "$OUTPUT_DIR" set +e "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/src/biomed-api.sh b/test_unstructured_ingest/src/biomed-api.sh index 82b29f887a..d8a0c6001e 100755 --- a/test_unstructured_ingest/src/biomed-api.sh +++ b/test_unstructured_ingest/src/biomed-api.sh @@ -33,12 +33,13 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ --preserve-downloads \ --re-download \ --reprocess \ - --output-dir "$OUTPUT_DIR" \ --verbose \ --api-from "2019-01-02" \ --api-until "2019-01-02+00:03:10" \ --max-request-time 30 \ --max-retries 5 \ - --work-dir "$WORK_DIR" + --work-dir "$WORK_DIR" \ + local \ + --output-dir "$OUTPUT_DIR" "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/src/biomed-path.sh b/test_unstructured_ingest/src/biomed-path.sh index 12401ed8ab..b45106b506 100755 --- a/test_unstructured_ingest/src/biomed-path.sh +++ 
b/test_unstructured_ingest/src/biomed-path.sh @@ -32,11 +32,12 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ --strategy hi_res \ --preserve-downloads \ --reprocess \ - --output-dir "$OUTPUT_DIR" \ --verbose \ --max-request-time 30 \ --max-retries 5 \ --path "oa_pdf/07/07/sbaa031.073.PMC7234218.pdf" \ - --work-dir "$WORK_DIR" + --work-dir "$WORK_DIR" \ + local \ + --output-dir "$OUTPUT_DIR" "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/src/box.sh b/test_unstructured_ingest/src/box.sh index 3ab2f44b46..6ff5a3dc96 100755 --- a/test_unstructured_ingest/src/box.sh +++ b/test_unstructured_ingest/src/box.sh @@ -39,18 +39,21 @@ if [ -z "$BOX_APP_CONFIG_PATH" ]; then fi RUN_SCRIPT=${RUN_SCRIPT:-unstructured-ingest} + +# shellcheck disable=SC2046 PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ box \ --download-dir "$DOWNLOAD_DIR" \ - --box-app-config "$BOX_APP_CONFIG_PATH" \ + --box-app-config $(cat "$BOX_APP_CONFIG_PATH") \ --remote-url box://utic-test-ingest-fixtures \ --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.data_source.filesize_bytes,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ - --output-dir "$OUTPUT_DIR" \ --num-processes "$max_processes" \ --preserve-downloads \ --recursive \ --reprocess \ --verbose \ - --work-dir "$WORK_DIR" + --work-dir "$WORK_DIR" \ + local \ + --output-dir "$OUTPUT_DIR" "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/src/confluence-diff.sh b/test_unstructured_ingest/src/confluence-diff.sh index dc0f71cd12..f69f90f4e8 100755 --- a/test_unstructured_ingest/src/confluence-diff.sh +++ b/test_unstructured_ingest/src/confluence-diff.sh @@ -39,12 +39,13 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ --num-processes "$max_processes" \ --preserve-downloads \ --reprocess \ - --output-dir "$OUTPUT_DIR" \ --verbose \ --url 
https://unstructured-ingest-test.atlassian.net \ --user-email "$CONFLUENCE_USER_EMAIL" \ --api-token "$CONFLUENCE_API_TOKEN" \ --spaces testteamsp,MFS \ - --work-dir "$WORK_DIR" + --work-dir "$WORK_DIR" \ + local \ + --output-dir "$OUTPUT_DIR" "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/src/confluence-large.sh b/test_unstructured_ingest/src/confluence-large.sh index 790d675b9d..41ac1e3c46 100755 --- a/test_unstructured_ingest/src/confluence-large.sh +++ b/test_unstructured_ingest/src/confluence-large.sh @@ -45,7 +45,6 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ --num-processes "$max_processes" \ --preserve-downloads \ --reprocess \ - --output-dir "$OUTPUT_DIR" \ --verbose \ --url https://unstructured-ingest-test.atlassian.net \ --user-email "$CONFLUENCE_USER_EMAIL" \ @@ -53,7 +52,9 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ --max-num-of-spaces 10 \ --spaces testteamsp1 \ --max-num-of-docs-from-each-space 250 \ - --work-dir "$WORK_DIR" + --work-dir "$WORK_DIR" \ + local \ + --output-dir "$OUTPUT_DIR" OUTPUT_SUBFOLDER_NAME=testteamsp1 diff --git a/test_unstructured_ingest/src/delta-table.sh b/test_unstructured_ingest/src/delta-table.sh index d8ac971456..6fcf0c8cd0 100755 --- a/test_unstructured_ingest/src/delta-table.sh +++ b/test_unstructured_ingest/src/delta-table.sh @@ -38,10 +38,11 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.data_source.date_created,metadata.last_modified,metadata.date_created,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ --download-dir "$DOWNLOAD_DIR" \ --table-uri s3://utic-dev-tech-fixtures/sample-delta-lake-data/deltatable/ \ - --output-dir "$OUTPUT_DIR" \ --storage_options "{\"AWS_REGION\":\"us-east-2\",\"AWS_ACCESS_KEY_ID\":\"$AWS_ACCESS_KEY_ID\",\"AWS_SECRET_ACCESS_KEY\":\"$AWS_SECRET_ACCESS_KEY\"}" \ --preserve-downloads \ --verbose \ - 
--work-dir "$WORK_DIR" + --work-dir "$WORK_DIR" \ + local \ + --output-dir "$OUTPUT_DIR" "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/src/discord.sh b/test_unstructured_ingest/src/discord.sh index ca986e3b0a..e074a145d8 100755 --- a/test_unstructured_ingest/src/discord.sh +++ b/test_unstructured_ingest/src/discord.sh @@ -37,10 +37,11 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ --download-dir "$DOWNLOAD_DIR" \ --preserve-downloads \ --reprocess \ - --output-dir "$OUTPUT_DIR" \ --verbose \ --channels 1099442333440802930,1099601456321003600 \ --token "$DISCORD_TOKEN" \ - --work-dir "$WORK_DIR" + --work-dir "$WORK_DIR" \ + local \ + --output-dir "$OUTPUT_DIR" "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/src/dropbox.sh b/test_unstructured_ingest/src/dropbox.sh index ff2c82998f..5d53e11c57 100755 --- a/test_unstructured_ingest/src/dropbox.sh +++ b/test_unstructured_ingest/src/dropbox.sh @@ -42,11 +42,12 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.data_source.filesize_bytes,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ --preserve-downloads \ --reprocess \ - --output-dir "$OUTPUT_DIR" \ --verbose \ --token "$DROPBOX_ACCESS_TOKEN" \ --recursive \ --remote-url "dropbox://test-input/" \ - --work-dir "$WORK_DIR" + --work-dir "$WORK_DIR" \ + local \ + --output-dir "$OUTPUT_DIR" "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/src/elasticsearch.sh b/test_unstructured_ingest/src/elasticsearch.sh index 9141cde57f..2596eefabd 100755 --- a/test_unstructured_ingest/src/elasticsearch.sh +++ b/test_unstructured_ingest/src/elasticsearch.sh @@ -45,7 +45,6 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ --num-processes "$max_processes" \ --preserve-downloads \ 
--reprocess \ - --output-dir "$OUTPUT_DIR" \ --verbose \ --index-name movies \ --hosts http://localhost:9200 \ @@ -53,6 +52,8 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ --password "$ELASTIC_PASSWORD" \ --fields 'ethnicity,director,plot' \ --work-dir "$WORK_DIR" \ - --batch-size 2 + --batch-size 2 \ + local \ + --output-dir "$OUTPUT_DIR" "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/src/gcs.sh b/test_unstructured_ingest/src/gcs.sh index 5261c11697..a3dec286ca 100755 --- a/test_unstructured_ingest/src/gcs.sh +++ b/test_unstructured_ingest/src/gcs.sh @@ -42,11 +42,12 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.data_source.filesize_bytes,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ --preserve-downloads \ --reprocess \ - --output-dir "$OUTPUT_DIR" \ --verbose \ --service-account-key "$GCP_INGEST_SERVICE_KEY_FILE" \ --recursive \ --remote-url gs://utic-test-ingest-fixtures/ \ - --work-dir "$WORK_DIR" + --work-dir "$WORK_DIR" \ + local \ + --output-dir "$OUTPUT_DIR" "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/src/github.sh b/test_unstructured_ingest/src/github.sh index bea75f3590..87158195d3 100755 --- a/test_unstructured_ingest/src/github.sh +++ b/test_unstructured_ingest/src/github.sh @@ -29,7 +29,7 @@ GH_READ_ONLY_ACCESS_TOKEN=${GH_READ_ONLY_ACCESS_TOKEN:-none} ACCESS_TOKEN_FLAGS="" # to update test fixtures, "export OVERWRITE_FIXTURES=true" and rerun this script if [[ "$GH_READ_ONLY_ACCESS_TOKEN" != "none" ]]; then - ACCESS_TOKEN_FLAGS="--git-access-token $GH_READ_ONLY_ACCESS_TOKEN" + ACCESS_TOKEN_FLAGS="--access-token $GH_READ_ONLY_ACCESS_TOKEN" elif [[ "$CI" == "true" ]]; then echo "Warning: GH_READ_ONLY_ACCESS_TOKEN is not defined in the CI environment." 
echo "This can lead to intermittent failures in test-ingest-github.sh, as non-auth'ed" @@ -47,11 +47,12 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ --strategy hi_res \ --preserve-downloads \ --reprocess \ - --output-dir "$OUTPUT_DIR" \ --verbose \ --url dcneiner/Downloadify \ - --git-file-glob '*.html,*.txt' \ + --file-glob '*.html,*.txt' \ --work-dir "$WORK_DIR" \ - $ACCESS_TOKEN_FLAGS + $ACCESS_TOKEN_FLAGS \ + local \ + --output-dir "$OUTPUT_DIR" "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/src/gitlab.sh b/test_unstructured_ingest/src/gitlab.sh index 1bd01b4882..4bbed043fc 100755 --- a/test_unstructured_ingest/src/gitlab.sh +++ b/test_unstructured_ingest/src/gitlab.sh @@ -33,11 +33,12 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ --strategy hi_res \ --preserve-downloads \ --reprocess \ - --output-dir "$OUTPUT_DIR" \ --verbose \ --git-branch 'v0.0.7' \ --git-file-glob '*.md,*.txt' \ --url https://gitlab.com/gitlab-com/content-sites/docsy-gitlab \ - --work-dir "$WORK_DIR" + --work-dir "$WORK_DIR" \ + local \ + --output-dir "$OUTPUT_DIR" "$SCRIPT_DIR"/check-num-files-output.sh 2 $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/src/google-drive.sh b/test_unstructured_ingest/src/google-drive.sh index 7e580e8a19..a1bc46d3a4 100755 --- a/test_unstructured_ingest/src/google-drive.sh +++ b/test_unstructured_ingest/src/google-drive.sh @@ -44,13 +44,14 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ --strategy hi_res \ --preserve-downloads \ --reprocess \ - --output-dir "$OUTPUT_DIR" \ --verbose \ --drive-id 1OQZ66OHBE30rNsNa7dweGLfRmXvkT_jr \ --service-account-key-path "$GCP_INGEST_SERVICE_KEY_FILE" \ --recursive \ --extensions "pdf,docx" \ - --work-dir "$WORK_DIR" + --work-dir "$WORK_DIR" \ + local \ + --output-dir "$OUTPUT_DIR" set +e "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/src/hubspot.sh b/test_unstructured_ingest/src/hubspot.sh index 
d5b617569a..d4ed043c3c 100755 --- a/test_unstructured_ingest/src/hubspot.sh +++ b/test_unstructured_ingest/src/hubspot.sh @@ -45,12 +45,13 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ --metadata-exclude file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.date_created,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ --num-processes "$max_processes" \ --download-dir "$DOWNLOAD_DIR" \ - --output-dir "$OUTPUT_DIR" \ --api-token "$HUBSPOT_API_TOKEN" \ --object-types "calls,communications,emails,notes,products,tickets" \ --custom-properties '{"products":["my_custom_property"],"tickets":["another_custom_property"]}' \ --work-dir "$WORK_DIR" \ --preserve-downloads \ - --verbose + --verbose \ + local \ + --output-dir "$OUTPUT_DIR" "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/src/jira.sh b/test_unstructured_ingest/src/jira.sh index ce6b4e0494..8e11647b71 100755 --- a/test_unstructured_ingest/src/jira.sh +++ b/test_unstructured_ingest/src/jira.sh @@ -58,7 +58,6 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ --num-processes "$max_processes" \ --preserve-downloads \ --reprocess \ - --output-dir "$OUTPUT_DIR" \ --verbose \ --url https://unstructured-jira-connector-test.atlassian.net \ --user-email "$JIRA_INGEST_USER_EMAIL" \ @@ -66,6 +65,8 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ --projects "JCTP3" \ --boards "1" \ --issues "JCTP2-4,JCTP2-7,JCTP2-8,10012,JCTP2-11" \ - --work-dir "$WORK_DIR" + --work-dir "$WORK_DIR" \ + local \ + --output-dir "$OUTPUT_DIR" "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/src/kafka-local.sh b/test_unstructured_ingest/src/kafka-local.sh index 36b21754fa..9e78fba544 100755 --- a/test_unstructured_ingest/src/kafka-local.sh +++ b/test_unstructured_ingest/src/kafka-local.sh @@ -67,10 +67,11 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ --port 29092 \ --metadata-exclude 
coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ --reprocess \ - --output-dir "$OUTPUT_DIR" \ --verbose \ --work-dir "$WORK_DIR" \ - --confluent false + --confluent false \ + local \ + --output-dir "$OUTPUT_DIR" set +e "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/src/local-embed-bedrock.sh b/test_unstructured_ingest/src/local-embed-bedrock.sh index 285d15a56c..da4ee60c46 100755 --- a/test_unstructured_ingest/src/local-embed-bedrock.sh +++ b/test_unstructured_ingest/src/local-embed-bedrock.sh @@ -29,14 +29,15 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ local \ --num-processes "$max_processes" \ --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_created,metadata.data_source.date_modified,metadata.data_source.date_processed,metadata.data_source.filesize_bytes,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ - --output-dir "$OUTPUT_DIR" \ --verbose \ --reprocess \ --input-path example-docs/book-war-and-peace-1p.txt \ --work-dir "$WORK_DIR" \ --embedding-provider "aws-bedrock" \ --embedding-aws-access-key-id "$AWS_ACCESS_KEY_ID" \ - --embedding-aws-secret-access-key "$AWS_SECRET_ACCESS_KEY" + --embedding-aws-secret-access-key "$AWS_SECRET_ACCESS_KEY" \ + local \ + --output-dir "$OUTPUT_DIR" set +e diff --git a/test_unstructured_ingest/src/local-embed-mixedbreadai.sh b/test_unstructured_ingest/src/local-embed-mixedbreadai.sh index 99168d7ddc..91823d0e9a 100755 --- a/test_unstructured_ingest/src/local-embed-mixedbreadai.sh +++ b/test_unstructured_ingest/src/local-embed-mixedbreadai.sh @@ -28,14 +28,15 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ local \ --num-processes "$max_processes" \ --metadata-exclude 
coordinates,filename,file_directory,metadata.data_source.record_locator.path,metadata.data_source.date_created,metadata.data_source.date_modified,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ - --output-dir "$OUTPUT_DIR" \ --verbose \ --reprocess \ --input-path example-docs/book-war-and-peace-1p.txt \ --work-dir "$WORK_DIR" \ --embedding-provider "mixedbread-ai" \ --embedding-api-key "$MXBAI_API_KEY" \ - --embedding-model-name "mixedbread-ai/mxbai-embed-large-v1" + --embedding-model-name "mixedbread-ai/mxbai-embed-large-v1" \ + local \ + --output-dir "$OUTPUT_DIR" set +e diff --git a/test_unstructured_ingest/src/local-embed-octoai.sh b/test_unstructured_ingest/src/local-embed-octoai.sh index 54ff3e2a08..92291ae8db 100755 --- a/test_unstructured_ingest/src/local-embed-octoai.sh +++ b/test_unstructured_ingest/src/local-embed-octoai.sh @@ -30,13 +30,14 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ local \ --num-processes "$max_processes" \ --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_created,metadata.data_source.date_modified,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ - --output-dir "$OUTPUT_DIR" \ --verbose \ --reprocess \ --input-path example-docs/book-war-and-peace-1p.txt \ --work-dir "$WORK_DIR" \ --embedding-provider "octoai" \ - --embedding-api-key "$OCTOAI_API_KEY" + --embedding-api-key "$OCTOAI_API_KEY" \ + local \ + --output-dir "$OUTPUT_DIR" set +e diff --git a/test_unstructured_ingest/src/local-embed-vertexai.sh b/test_unstructured_ingest/src/local-embed-vertexai.sh index 4ef499bc5b..a83dd798f2 100755 --- a/test_unstructured_ingest/src/local-embed-vertexai.sh +++ b/test_unstructured_ingest/src/local-embed-vertexai.sh @@ -30,14 +30,15 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ local \ --num-processes "$max_processes" \ --metadata-exclude 
coordinates,filename,file_directory,metadata.data_source.date_created,metadata.data_source.date_modified,metadata.data_source.date_processed,metadata.data_source.filesize_bytes,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ - --output-dir "$OUTPUT_DIR" \ --verbose \ --reprocess \ --input-path example-docs/book-war-and-peace-1p.txt \ --work-dir "$WORK_DIR" \ --embedding-provider "vertexai" \ --embedding-api-key "$GCP_INGEST_SERVICE_KEY" \ - --embedding-model-name "textembedding-gecko@001" + --embedding-model-name "textembedding-gecko@001" \ + local \ + --output-dir "$OUTPUT_DIR" set +e diff --git a/test_unstructured_ingest/src/local-embed-voyageai.sh b/test_unstructured_ingest/src/local-embed-voyageai.sh index 83fe3586a4..7eea0c9e0e 100755 --- a/test_unstructured_ingest/src/local-embed-voyageai.sh +++ b/test_unstructured_ingest/src/local-embed-voyageai.sh @@ -30,14 +30,15 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ local \ --num-processes "$max_processes" \ --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_created,metadata.data_source.date_modified,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ - --output-dir "$OUTPUT_DIR" \ --verbose \ --reprocess \ --input-path example-docs/book-war-and-peace-1p.txt \ --work-dir "$WORK_DIR" \ --embedding-provider "voyageai" \ --embedding-api-key "$VOYAGE_API_KEY" \ - --embedding-model-name "voyage-3-large" + --embedding-model-name "voyage-3-large" \ + local \ + --output-dir "$OUTPUT_DIR" set +e diff --git a/test_unstructured_ingest/src/local-embed.sh b/test_unstructured_ingest/src/local-embed.sh index 210a7111c2..3d25844095 100755 --- a/test_unstructured_ingest/src/local-embed.sh +++ b/test_unstructured_ingest/src/local-embed.sh @@ -24,12 +24,13 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ local \ --num-processes "$max_processes" \ --metadata-exclude 
coordinates,filename,file_directory,metadata.data_source.date_created,metadata.data_source.date_modified,metadata.data_source.date_processed,metadata.data_source.filesize_bytes,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ - --output-dir "$OUTPUT_DIR" \ --verbose \ --reprocess \ --input-path example-docs/book-war-and-peace-1p.txt \ --work-dir "$WORK_DIR" \ - --embedding-provider "huggingface" + --embedding-provider "huggingface" \ + local \ + --output-dir "$OUTPUT_DIR" set +e diff --git a/test_unstructured_ingest/src/local-failed-partition.sh b/test_unstructured_ingest/src/local-failed-partition.sh index a230888b30..976693433b 100755 --- a/test_unstructured_ingest/src/local-failed-partition.sh +++ b/test_unstructured_ingest/src/local-failed-partition.sh @@ -45,9 +45,10 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ --strategy fast \ --reprocess \ - --output-dir "$OUTPUT_DIR" \ --verbose \ --input-path "$SCRIPT_DIR"/failed-partition-docs \ - --work-dir "$WORK_DIR" + --work-dir "$WORK_DIR" \ + local \ + --output-dir "$OUTPUT_DIR" check diff --git a/test_unstructured_ingest/src/local-single-file-basic-chunking.sh b/test_unstructured_ingest/src/local-single-file-basic-chunking.sh index 575bd876f8..12da9e1dde 100755 --- a/test_unstructured_ingest/src/local-single-file-basic-chunking.sh +++ b/test_unstructured_ingest/src/local-single-file-basic-chunking.sh @@ -31,10 +31,11 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ --input-path "$ABS_INPUT_PATH" \ --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_created,metadata.data_source.date_modified,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ --num-processes 
"$max_processes" \ - --output-dir "$OUTPUT_DIR" \ --reprocess \ --verbose \ - --work-dir "$WORK_DIR" + --work-dir "$WORK_DIR" \ + local \ + --output-dir "$OUTPUT_DIR" set +e "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/src/local-single-file-chunk-no-orig-elements.sh b/test_unstructured_ingest/src/local-single-file-chunk-no-orig-elements.sh index 051c5fba29..fc8b0a41df 100755 --- a/test_unstructured_ingest/src/local-single-file-chunk-no-orig-elements.sh +++ b/test_unstructured_ingest/src/local-single-file-chunk-no-orig-elements.sh @@ -43,10 +43,11 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ --input-path "$ABS_INPUT_PATH" \ --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_created,metadata.data_source.date_modified,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ --num-processes "$max_processes" \ - --output-dir "$OUTPUT_DIR" \ --reprocess \ --verbose \ - --work-dir "$WORK_DIR" + --work-dir "$WORK_DIR" \ + local \ + --output-dir "$OUTPUT_DIR" set +e "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/src/local-single-file-with-encoding.sh b/test_unstructured_ingest/src/local-single-file-with-encoding.sh index 3cf91223e5..9034abcfbd 100755 --- a/test_unstructured_ingest/src/local-single-file-with-encoding.sh +++ b/test_unstructured_ingest/src/local-single-file-with-encoding.sh @@ -25,12 +25,13 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ local \ --num-processes "$max_processes" \ --metadata-exclude filename,file_directory,metadata.data_source.date_created,metadata.data_source.date_modified,metadata.data_source.date_processed,metadata.data_source.filesize_bytes,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ - --output-dir "$OUTPUT_DIR" \ --encoding cp1252 \ --verbose \ --reprocess \ --input-path 
example-docs/fake-html-cp1252.html \ - --work-dir "$WORK_DIR" + --work-dir "$WORK_DIR" \ + local \ + --output-dir "$OUTPUT_DIR" set +e "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/src/local-single-file-with-pdf-infer-table-structure.sh b/test_unstructured_ingest/src/local-single-file-with-pdf-infer-table-structure.sh index 4c0ab5b36d..1597ffe83a 100755 --- a/test_unstructured_ingest/src/local-single-file-with-pdf-infer-table-structure.sh +++ b/test_unstructured_ingest/src/local-single-file-with-pdf-infer-table-structure.sh @@ -25,13 +25,14 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ local \ --num-processes "$max_processes" \ --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_created,metadata.data_source.date_modified,metadata.data_source.date_processed,metadata.data_source.filesize_bytes,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ - --output-dir "$OUTPUT_DIR" \ --skip-infer-table-types "xls,xlsx" \ --strategy hi_res \ --verbose \ --reprocess \ --input-path "$SCRIPT_DIR"/example-docs/ \ - --work-dir "$WORK_DIR" + --work-dir "$WORK_DIR" \ + local \ + --output-dir "$OUTPUT_DIR" set +e "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/src/local-single-file.sh b/test_unstructured_ingest/src/local-single-file.sh index 249746ed8a..d39cccc8c3 100755 --- a/test_unstructured_ingest/src/local-single-file.sh +++ b/test_unstructured_ingest/src/local-single-file.sh @@ -27,12 +27,13 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ local \ --num-processes "$max_processes" \ --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_created,metadata.data_source.date_modified,metadata.data_source.date_processed,metadata.data_source.filesize_bytes,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ - --output-dir "$OUTPUT_DIR" 
\ --additional-partition-args '{"strategy":"ocr_only", "languages":["ind", "est"]}' \ --verbose \ --reprocess \ --input-path "$ABS_INPUT_PATH" \ - --work-dir "$WORK_DIR" + --work-dir "$WORK_DIR" \ + local \ + --output-dir "$OUTPUT_DIR" set +e "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/src/local.sh b/test_unstructured_ingest/src/local.sh index 3c7139cebe..eb4eed4e6b 100755 --- a/test_unstructured_ingest/src/local.sh +++ b/test_unstructured_ingest/src/local.sh @@ -26,11 +26,12 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ --strategy hi_res \ --reprocess \ - --output-dir "$OUTPUT_DIR" \ --verbose \ --file-glob "*.html" \ --input-path example-docs \ - --work-dir "$WORK_DIR" + --work-dir "$WORK_DIR" \ + local \ + --output-dir "$OUTPUT_DIR" "$SCRIPT_DIR"/check-num-files-output.sh 15 $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/src/mongodb.sh b/test_unstructured_ingest/src/mongodb.sh index 8429d7e1fd..a2afdaee88 100755 --- a/test_unstructured_ingest/src/mongodb.sh +++ b/test_unstructured_ingest/src/mongodb.sh @@ -33,7 +33,6 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ --metadata-exclude file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.date_created,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ --num-processes "$max_processes" \ --download-dir "$DOWNLOAD_DIR" \ - --output-dir "$OUTPUT_DIR" \ --uri "$MONGODB_URI" \ --database "$MONGODB_DATABASE_NAME" \ --collection "$SOURCE_MONGO_COLLECTION" \ @@ -41,6 +40,8 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ --preserve-downloads \ --reprocess \ --batch-size 2 \ - --verbose + --verbose \ + local \ + --output-dir "$OUTPUT_DIR" "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git 
a/test_unstructured_ingest/src/notion.sh b/test_unstructured_ingest/src/notion.sh index e80a11bfad..91b790f74b 100755 --- a/test_unstructured_ingest/src/notion.sh +++ b/test_unstructured_ingest/src/notion.sh @@ -35,12 +35,13 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ --metadata-exclude coordinates,filename,file_directory,metadata.last_modified,metadata.data_source.date_processed,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ --download-dir "$DOWNLOAD_DIR" \ --notion-api-key "$NOTION_API_KEY" \ - --output-dir "$OUTPUT_DIR" \ --database-ids "122b2c22996b435b9de2ee0e9d2b04bc" \ --num-processes "$max_processes" \ --recursive \ --verbose \ --work-dir "$WORK_DIR" \ - --max-retry-time 30 + --max-retry-time 30 \ + local \ + --output-dir "$OUTPUT_DIR" "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/src/onedrive.sh b/test_unstructured_ingest/src/onedrive.sh index d38b7ab80c..fb4e8e7f51 100755 --- a/test_unstructured_ingest/src/onedrive.sh +++ b/test_unstructured_ingest/src/onedrive.sh @@ -38,7 +38,6 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ --strategy hi_res \ --preserve-downloads \ --reprocess \ - --output-dir "$OUTPUT_DIR" \ --verbose \ --client-cred "$MS_CLIENT_CRED" \ --client-id "$MS_CLIENT_ID" \ @@ -46,6 +45,8 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ --user-pname "$MS_USER_PNAME" \ --path '/utic-test-ingest-fixtures' \ --recursive \ - --work-dir "$WORK_DIR" + --work-dir "$WORK_DIR" \ + local \ + --output-dir "$OUTPUT_DIR" "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/src/opensearch.sh b/test_unstructured_ingest/src/opensearch.sh index f1d7c150ed..5d76a8ba2f 100755 --- a/test_unstructured_ingest/src/opensearch.sh +++ b/test_unstructured_ingest/src/opensearch.sh @@ -43,7 +43,6 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ --num-processes "$max_processes" \ --preserve-downloads \ --reprocess \ - --output-dir 
"$OUTPUT_DIR" \ --verbose \ --index-name movies \ --hosts http://localhost:9247 \ @@ -52,6 +51,8 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ --use-ssl \ --fields 'ethnicity,director,plot' \ --work-dir "$WORK_DIR" \ - --batch-size 2 + --batch-size 2 \ + local \ + --output-dir "$OUTPUT_DIR" "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/src/outlook.sh b/test_unstructured_ingest/src/outlook.sh index a1a5a48784..77bfeeb197 100755 --- a/test_unstructured_ingest/src/outlook.sh +++ b/test_unstructured_ingest/src/outlook.sh @@ -37,7 +37,6 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ --num-processes "$max_processes" \ --preserve-downloads \ --reprocess \ - --output-dir "$OUTPUT_DIR" \ --verbose \ --client-cred "$MS_CLIENT_CRED" \ --client-id "$MS_CLIENT_ID" \ @@ -45,6 +44,8 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ --user-email "$MS_USER_EMAIL" \ --outlook-folders IntegrationTest \ --recursive \ - --work-dir "$WORK_DIR" + --work-dir "$WORK_DIR" \ + local \ + --output-dir "$OUTPUT_DIR" "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/src/pdf-fast-reprocess.sh b/test_unstructured_ingest/src/pdf-fast-reprocess.sh index b27e32e8ef..1f22cab06c 100755 --- a/test_unstructured_ingest/src/pdf-fast-reprocess.sh +++ b/test_unstructured_ingest/src/pdf-fast-reprocess.sh @@ -35,11 +35,12 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ --num-processes "$max_processes" \ --strategy fast \ --reprocess \ - --output-dir "$OUTPUT_DIR" \ --verbose \ --file-glob "*.pdf" \ --input-path "$INPUT_PATH" \ --recursive \ - --work-dir "$WORK_DIR" + --work-dir "$WORK_DIR" \ + local \ + --output-dir "$OUTPUT_DIR" "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/src/s3-compression.sh b/test_unstructured_ingest/src/s3-compression.sh index 7ee066f3a3..aded270857 100755 --- a/test_unstructured_ingest/src/s3-compression.sh +++ 
b/test_unstructured_ingest/src/s3-compression.sh @@ -29,12 +29,13 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ --strategy fast \ --preserve-downloads \ --reprocess \ - --output-dir "$OUTPUT_DIR" \ --verbose \ --remote-url s3://utic-dev-tech-fixtures/small-pdf-set-w-compression/ \ --anonymous \ --work-dir "$WORK_DIR" \ - --uncompress + --uncompress \ + local \ + --output-dir "$OUTPUT_DIR" "$SCRIPT_DIR"/check-num-files-output.sh 12 $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/src/s3-minio.sh b/test_unstructured_ingest/src/s3-minio.sh index 85dd8f85d0..3a63def407 100755 --- a/test_unstructured_ingest/src/s3-minio.sh +++ b/test_unstructured_ingest/src/s3-minio.sh @@ -42,11 +42,12 @@ AWS_SECRET_ACCESS_KEY=$secret_key AWS_ACCESS_KEY_ID=$access_key \ --strategy hi_res \ --preserve-downloads \ --reprocess \ - --output-dir "$OUTPUT_DIR" \ --verbose \ --remote-url s3://utic-dev-tech-fixtures/ \ --endpoint-url http://localhost:9000 \ - --work-dir "$WORK_DIR" + --work-dir "$WORK_DIR" \ + local \ + --output-dir "$OUTPUT_DIR" set +e "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/src/s3.sh b/test_unstructured_ingest/src/s3.sh index bfdc72c1cb..228f2b9b25 100755 --- a/test_unstructured_ingest/src/s3.sh +++ b/test_unstructured_ingest/src/s3.sh @@ -32,11 +32,12 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ --strategy hi_res \ --preserve-downloads \ --reprocess \ - --output-dir "$OUTPUT_DIR" \ --verbose \ --remote-url s3://utic-dev-tech-fixtures/small-pdf-set/ \ --anonymous \ - --work-dir "$WORK_DIR" + --work-dir "$WORK_DIR" \ + local \ + --output-dir "$OUTPUT_DIR" set +e "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/src/salesforce.sh b/test_unstructured_ingest/src/salesforce.sh index 54ebd05558..d726b8e9c0 100755 --- a/test_unstructured_ingest/src/salesforce.sh +++ b/test_unstructured_ingest/src/salesforce.sh @@ -55,8 +55,9 @@ PYTHONPATH=${PYTHONPATH:-.} 
"$RUN_SCRIPT" \ --num-processes "$max_processes" \ --preserve-downloads \ --reprocess \ - --output-dir "$OUTPUT_DIR" \ --verbose \ - --work-dir "$WORK_DIR" + --work-dir "$WORK_DIR" \ + local \ + --output-dir "$OUTPUT_DIR" "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/src/sftp.sh b/test_unstructured_ingest/src/sftp.sh index e3312224df..50325902e4 100755 --- a/test_unstructured_ingest/src/sftp.sh +++ b/test_unstructured_ingest/src/sftp.sh @@ -41,12 +41,13 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ --metadata-exclude file_directory,metadata.data_source.date_processed,metadata.data_source.filesize_bytes,metadata.data_source.date_created,metadata.data_source.date_modified,metadata.last_modified,metadata.data_source.version \ --preserve-downloads \ --reprocess \ - --output-dir "$OUTPUT_DIR" \ --verbose \ --recursive \ --username foo \ --password bar \ --remote-url sftp://localhost:47474/upload/ \ - --work-dir "$WORK_DIR" + --work-dir "$WORK_DIR" \ + local \ + --output-dir "$OUTPUT_DIR" "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/src/sharepoint-with-permissions.sh b/test_unstructured_ingest/src/sharepoint-with-permissions.sh index cc16c1135c..766fcfd08a 100755 --- a/test_unstructured_ingest/src/sharepoint-with-permissions.sh +++ b/test_unstructured_ingest/src/sharepoint-with-permissions.sh @@ -48,7 +48,6 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ --strategy hi_res \ --preserve-downloads \ --reprocess \ - --output-dir "$OUTPUT_DIR" \ --verbose \ --client-cred "$SHAREPOINT_CRED" \ --client-id "$SHAREPOINT_CLIENT_ID" \ @@ -57,6 +56,8 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ --permissions-client-cred "$SHAREPOINT_PERMISSIONS_APP_CRED" \ --permissions-tenant "$SHAREPOINT_PERMISSIONS_TENANT" \ --recursive \ - --work-dir "$WORK_DIR" + --work-dir "$WORK_DIR" \ + local \ + --output-dir "$OUTPUT_DIR" "$SCRIPT_DIR"/check-diff-expected-output.sh 
$OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/src/sharepoint.sh b/test_unstructured_ingest/src/sharepoint.sh index ea07410d2f..9ac1444252 100755 --- a/test_unstructured_ingest/src/sharepoint.sh +++ b/test_unstructured_ingest/src/sharepoint.sh @@ -40,13 +40,14 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ --strategy hi_res \ --preserve-downloads \ --reprocess \ - --output-dir "$OUTPUT_DIR" \ --verbose \ --client-cred "$SHAREPOINT_CRED" \ --client-id "$SHAREPOINT_CLIENT_ID" \ --site "$SHAREPOINT_SITE" \ --recursive \ - --work-dir "$WORK_DIR" + --work-dir "$WORK_DIR" \ + local \ + --output-dir "$OUTPUT_DIR" set +e "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/src/slack.sh b/test_unstructured_ingest/src/slack.sh index 503e67240b..0fb4a710e8 100755 --- a/test_unstructured_ingest/src/slack.sh +++ b/test_unstructured_ingest/src/slack.sh @@ -38,12 +38,13 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ --strategy hi_res \ --preserve-downloads \ --reprocess \ - --output-dir "$OUTPUT_DIR" \ --verbose \ --channels C07ABKJ83C6 \ --token "${SLACK_TOKEN}" \ --start-date 2023-04-01 \ --end-date 2024-07-01T07:47:00-07:00 \ - --work-dir "$WORK_DIR" + --work-dir "$WORK_DIR" \ + local \ + --output-dir "$OUTPUT_DIR" "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/src/wikipedia.sh b/test_unstructured_ingest/src/wikipedia.sh index 21a55e5725..657853c9ab 100755 --- a/test_unstructured_ingest/src/wikipedia.sh +++ b/test_unstructured_ingest/src/wikipedia.sh @@ -32,9 +32,10 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ --num-processes "$max_processes" \ --strategy hi_res \ --preserve-downloads \ - --output-dir "$OUTPUT_DIR" \ --verbose \ --page-title "Open Source Software" \ - --work-dir "$WORK_DIR" + --work-dir "$WORK_DIR" \ + local \ + --output-dir "$OUTPUT_DIR" "$SCRIPT_DIR"/check-num-files-output.sh 3 $OUTPUT_FOLDER_NAME diff --git 
a/test_unstructured_ingest/test-ingest-src.sh b/test_unstructured_ingest/test-ingest-src.sh index 7fca5ede6c..3a0305b781 100755 --- a/test_unstructured_ingest/test-ingest-src.sh +++ b/test_unstructured_ingest/test-ingest-src.sh @@ -18,57 +18,17 @@ EVAL_OUTPUT_ROOT=${EVAL_OUTPUT_ROOT:-$SCRIPT_DIR} export OMP_THREAD_LIMIT=1 all_tests=( - # NOTE(scanny): This test is disabled because it routinely flakes on OCR differencs - # 's3.sh' 's3-minio.sh' 'astradb.sh' 'azure.sh' - 'biomed-api.sh' - 'biomed-path.sh' # NOTE(yuming): The pdf-fast-reprocess test should be put after any tests that save downloaded files 'pdf-fast-reprocess.sh' - 'salesforce.sh' - 'box.sh' - 'discord.sh' - 'dropbox.sh' - 'github.sh' - 'gitlab.sh' - 'google-drive.sh' - 'wikipedia.sh' 'local.sh' - # 'slack.sh' - 'against-api.sh' - 'gcs.sh' - 'kafka-local.sh' - #'onedrive.sh' - #'outlook.sh' - 'elasticsearch.sh' - 'confluence-diff.sh' - 'confluence-large.sh' - # NOTE(christine): This test is disabled because it is triggering 404 client errors to the API - # 'airtable-diff.sh' - # # NOTE(ryan): This test is disabled because it is triggering too many requests to the API - # 'airtable-large.sh' 'local-single-file.sh' 'local-single-file-basic-chunking.sh' 'local-single-file-chunk-no-orig-elements.sh' 'local-single-file-with-encoding.sh' 'local-single-file-with-pdf-infer-table-structure.sh' - 'notion.sh' - 'delta-table.sh' - 'jira.sh' - # 'sharepoint.sh' - # 'sharepoint-with-permissions.sh' - 'hubspot.sh' - 'local-embed.sh' - 'local-embed-bedrock.sh' - 'local-embed-octoai.sh' - 'local-embed-vertexai.sh' - 'local-embed-voyageai.sh' - 'local-embed-mixedbreadai.sh' - 'sftp.sh' - 'opensearch.sh' - 'mongodb.sh' ) full_python_matrix_tests=( @@ -79,8 +39,6 @@ full_python_matrix_tests=( 'local-single-file-with-pdf-infer-table-structure.sh' # NOTE(scanny): This test is disabled because it routinely flakes on OCR differences # 's3.sh' - 'google-drive.sh' - 'gcs.sh' 'azure.sh' ) diff --git a/unstructured/__version__.py 
b/unstructured/__version__.py index 1c6678160c..657c99ab3b 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.17.6-dev1" # pragma: no cover +__version__ = "0.17.6-dev2" # pragma: no cover From b585df15881219ff3b1dcb06208b4e5cd3987ae8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Po=C5=82om?= <124889668+mpolomdeepsense@users.noreply.github.com> Date: Tue, 29 Apr 2025 15:29:44 +0200 Subject: [PATCH 12/40] fix: Add missing diffstat command to test_json_to_html CI job (#3992) Removed some additional html fixtures. The original json fixtures from which html ones were generated, were removed some time ago. --- .github/workflows/ci.yml | 1 + Makefile | 1 + .../biomed-api/65/11/main.PMC6312790.pdf.html | 563 ------------------ .../biomed-api/75/29/main.PMC6312793.pdf.html | 329 ---------- .../07/07/sbaa031.073.PMC7234218.pdf.html | 53 -- 5 files changed, 2 insertions(+), 945 deletions(-) delete mode 100644 test_unstructured_ingest/expected-structured-output-html/pdf-fast-reprocess/biomed-api/65/11/main.PMC6312790.pdf.html delete mode 100644 test_unstructured_ingest/expected-structured-output-html/pdf-fast-reprocess/biomed-api/75/29/main.PMC6312793.pdf.html delete mode 100644 test_unstructured_ingest/expected-structured-output-html/pdf-fast-reprocess/biomed-path/07/07/sbaa031.073.PMC7234218.pdf.html diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 81a9f20cfb..94e2d08612 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -345,6 +345,7 @@ jobs: PYTHONPATH: ${{ github.workspace }} run: | source .venv/bin/activate + sudo apt-get install diffstat ./test_unstructured_ingest/check-diff-expected-output-html.sh test_unstructured_api_unit: diff --git a/Makefile b/Makefile index 80600a051a..fe1350d5f5 100644 --- a/Makefile +++ b/Makefile @@ -340,4 +340,5 @@ run-jupyter: .PHONY: html-fixtures-update html-fixtures-update: + rm -r test_unstructured_ingest/expected-structured-output-html && 
\ test_unstructured_ingest/structured-json-to-html.sh test_unstructured_ingest/expected-structured-output-html diff --git a/test_unstructured_ingest/expected-structured-output-html/pdf-fast-reprocess/biomed-api/65/11/main.PMC6312790.pdf.html b/test_unstructured_ingest/expected-structured-output-html/pdf-fast-reprocess/biomed-api/65/11/main.PMC6312790.pdf.html deleted file mode 100644 index 5dfa8c4b41..0000000000 --- a/test_unstructured_ingest/expected-structured-output-html/pdf-fast-reprocess/biomed-api/65/11/main.PMC6312790.pdf.html +++ /dev/null @@ -1,563 +0,0 @@ - - - - - - Codestin Search App - - -
    - Data in Brief 22 (2019) 451–457 -
    -

    - Contents lists available at ScienceDirect -

    -

    - Data in Brief -

    -

    - journal homepage: www.elsevier.com/locate/dib -

    -

    - Data Article -

    -

    - Data on environmental sustainable corrosion inhibitor for stainless steel in aggressive environment -

    -

    - Omotayo Sanni n, Abimbola Patricia I. Popoola -

    -

    - Department of Chemical, Metallurgical and Materials Engineering, Tshwane University of Technology, Pretoria, South Africa -

    -

    - a r t i c l e i n f o -

    -

    - a b s t r a c t -

    -

    - Article history: Received 31 August 2018 Received in revised form 17 November 2018 Accepted 27 November 2018 Available online 30 November 2018 -

    -

    - Keywords: Corrosion Stainless steel Inhibitor Sulphuric acid -

    -

    - This data article contains data related to the research article entitled “enhanced corrosion resistance of stainless steel Type 316 in sulphuric acid solution using eco-friendly waste product†(Sanni et al., 2018). In this data article, a comprehensive effect of waste product and optimized process parameter of the inhibitor in 0.5 M H2SO4 solution was presented using weight loss and potentiody- the inhibitor namic polarization techniques. The presence of (egg shell powder) influenced corrosion resistance of stainless steel. Inhibition efï¬ciency value of 94.74% was recorded as a result of inhibition of the steel by the ionized molecules of the inhibiting compound of the egg shell powder influencing the redox mechan- ism reactions responsible for corrosion and surface deterioration. -

    -

    - & 2018 Published by Elsevier Inc. This is an open access article under the CC BY-NC-ND license (http://creativecommons.org/licenses/by-nc-nd/4.0/). -

    -

    - Speciï¬cation table -

    -

    - Subject area More speciï¬c subject area Surface science and engineering Type of data -

    -

    - Materials engineering -

    -

    - Table and ï¬gure -

    -

    - n Corresponding author. tayo.sanni@yahoo.com; SanniO@tut.ac.za -

    -

    - E-mail address: tayo.sanni@yahoo.com (O. Sanni). -

    -

    - https://doi.org/10.1016/j.dib.2018.11.134 2352-3409/& 2018 Published by Elsevier Inc. This is an open access article under the CC BY-NC-ND license (http://creativecommons.org/licenses/by-nc-nd/4.0/). -

    -

    - 452 -

    -
    - O. Sanni, A.P.I. Popoola / Data in Brief 22 (2019) 451–457 -
    -

    - How data were acquired -

    -

    - Data format Experimental factors -

    -

    - Experimental features Data source location -

    -

    - Accessibility Related research article -

    -

    - The cleaned and weighed specimen was suspended in beakers con- taining 0.5 M H2SO4 solution of different concentrations of egg shell powder. The pre-weighed stainless steel samples were retrieved from the test solutions after every 24 h, cleaned appropriately, dried and reweighed. Raw, analyzed The difference between the weight at a given time and the initial weight of the specimen was taken as the weight loss, which was used to calculate the corrosion rate and inhibition efï¬ciency. Inhibitor concentration, exposure time Department of Chemical, Metallurgical and Materials Engineering, Tshwane University of Technology, Pretoria, South Africa Data are available within this article O. Sanni, A. P. I. Popoola, and O. S. I. Fayomi, Enhanced corrosion resistance of stainless steel type 316 in sulphuric acid solution using eco-friendly waste product, Results in Physics, 9 (2018) 225–230. -

    -

    - Value of the data -

    -

    - (cid:1) Data presented here provide optimum conditions of waste material as inhibitor for stainless steel Type 316 in 0.5 M H2SO4 medium. The given data describe the inhibitive performance of eco-friendly egg shell powder on austenitic stainless steel Type 316 corrosion in sulphuric acid environment. -

    -

    - (cid:1) The data obtained for the inhibition of waste product (egg shell powder) on stainless steel Type 316 can be used as basis in determining the inhibitive performance of the same inhibitor in other environments. -

    -

    - (cid:1) The data can be used to examine the relationship between the process variable as it affect the -

    -

    - nature of inhibition of metals. -

    -
  • - 1. Data -
  • -

    - The results of the experiment are presented in this session. The results obtained from weight loss method for stainless steel Type 316 immersed in 0.5 M H2SO4 solution in the absence and presence of different concentrations of egg shell powder (ES) are presented in Figs.1–3 respectively. It can be seen clearly from these Figures that the efï¬ciency of egg shell powder increase with the inhibitor con- centration, The increase in its efï¬ciency could be as a result of increase in the constituent molecule -

    -

    - ) g m -

    -

    - ( -

    -

    - s s o -

    -

    - l -

    -

    - t h g e W -

    -

    - i -

    -

    - 30 -

    -

    - 20 -

    -

    - 10g 8g 6g 4g 2g Control -

    -

    - 10 -

    -

    - 48 -

    -

    - 96 -

    -

    - 144 -

    -

    - 192 -

    -

    - Exposure Time (Hours) -

    -

    - Fig. 1. Weight loss versus exposure time for stainless steel presence of ES. -

    -

    - immersed in 0.5 M H2SO4 solution in the absence and -

    -
    - O. Sanni, A.P.I. Popoola / Data in Brief 22 (2019) 451–457 -
    -

    - 2.7 -

    -

    - ) r a e y / m m -

    -

    - ( e t a r n o s o r r o C -

    -

    - i -

    -

    - 1.8 -

    -

    - 0.9 -

    -

    - 10g 8g 6g 4g 2g Control -

    -

    - 24 -

    -

    - 48 -

    -

    - 72 -

    -

    - 96 -

    -

    - 120 -

    -

    - 144 -

    -

    - 168 -

    -

    - 192 -

    -

    - Exposure time -

    -

    - Fig. 2. Corrosion rate versus exposure time for stainless steel immersed in 0.5 M H2SO4 solution in the absence and presence of ES. -

    -

    - 100 -

    -

    - 90 -

    -

    - ) -

    -

    - % -

    -

    - ( -

    -

    - y c n e c i f f -

    -

    - i -

    -

    - E n o i t i b h n I -

    -

    - i -

    -

    - 80 -

    -

    - 70 -

    -

    - 60 -

    -

    - 50 -

    -

    - 40 -

    -

    - 30 -

    -

    - 2g 4g 6g 8g 10g -

    -

    - 20 -

    -

    - 10 -

    -

    - 0 -

    -

    - 20 -

    -

    - 40 -

    -

    - 60 -

    -

    - 80 -

    -

    - 100 -

    -

    - 120 -

    -

    - 140 -

    -

    - 160 -

    -

    - 180 -

    -

    - Exposure Time (Hours) -

    -

    - Fig. 3. Inhibition efï¬ciency versus exposure time for stainless steel immersed in 0.5 M H2SO4 solution in the presence of ES. -

    -

    - number of inhibitor adsorbed on the surface of stainless steel at higher concentration, in order for the active sites of the stainless steel to be protected with the inhibitor molecules. Cathodic and anodic polarized potential are measured in the presence and absence of ES. Fig. 4 shows the cathodic and anodic polarization curves for stainless steel in 0.5 M H2SO4 solution at different ES concentrations. The electrochemical variables such as polarization resistance (PR), corrosion potential (Ecorr), cor- rosion current (icorr), anodic Tafel constant (ba), cathodic Tafel constant (bc) and corrosion rate (mm/ year) values are presented in Table 1. From the polarization curves and electrochemical parameter, icorr value decreased with the addition of inhibitor in 0.5 M H2SO4. Conversely, the icorr further decrease with an increase in inhibitor concentration indicating that the inhibition effects increase with an increase in the egg shell concentration. The process of egg shell inhibition could be attributed to the formation of egg shell powder adsorbed on stainless steel surface protecting corrosion of stainless steel in H2SO4 medium. The likely mechanism is the egg shell adsorption on stainless steel surface through the heteroatoms electron pair and the conjugated systems in egg shell molecular structure as shown in Fig. 1. When the concentration of inhibitor was increased from 2 to 10 g, the corrosion rate values drastically decreased this result show that waste egg shell powder is an effective corrosion inhibitor for stainless steel in H2SO4 solution. The shift in corrosion potential of stainless steel from Tafel curves and electrochemical data indicate that the inhibitor is a mixed-type corrosion inhibitor. -

    -

    - 453 -

    -

    - 454 -

    -
    - O. Sanni, A.P.I. Popoola / Data in Brief 22 (2019) 451–457 -
    -

    - Fig. 4. Anodic and cathodic polarization curve of stainless steel in 0.5 M H2SO4 solution in the presence and absence of ES. -

    -

    - Table 1 Potentiodynamic polarization data for stainless steel in the absence and presence of ES in 0.5 M H2SO4 solution. -

    -

    - Inhibitor concentration (g) -

    -

    - bc (V/dec) -

    -

    - ba (V/dec) -

    -

    - Ecorr (V) -

    -

    - icorr (A/cm2) -

    -

    - Polarization resistance (Ω) -

    -

    - Corrosion rate (mm/year) -

    -

    - 0 2 4 6 8 10 -

    -

    - 0.0335 1.9460 0.0163 0.3233 0.1240 0.0382 -

    -

    - 0.0409 0.0596 0.2369 0.0540 0.0556 0.0086 -

    -

    - −0.9393 −0.8276 −0.8825 −0.8027 −0.5896 −0.5356 -

    -

    - 0.0003 0.0002 0.0001 5.39E-05 5.46E-05 1.24E-05 -

    -

    - 24.0910 121.440 42.121 373.180 305.650 246.080 -

    -

    - 2.8163 1.5054 0.9476 0.4318 0.3772 0.0919 -

    -

    - The plot of inhibitor concentration over degree of surface coverage versus inhibitor concentration gives a straight line as shown in Fig. 5. The strong correlation reveals that egg shell adsorption on stainless surface in 0.5 M H2SO4 follow Langmuir adsorption isotherm. Figs. 6–8 show the SEM/EDX surface morphology analysis of stainless steel. Figs. 7 and 8 are the SEM/EDX images of the stainless steel specimens without and with inhibitor after weight loss experiment in sulphuric acid medium. The stainless steel surface corrosion product layer in the absence of inhibitor was porous and as a result gives no corrosion protection. With the presence of ES, corrosion damage was minimized, with an evidence of ES present on the metal surface as shown in Fig. 8. -

    -

    - 12 -

    -

    - C/0 -

    -

    - 10 -

    -

    - 8 -

    -

    - 0 / C -

    -

    - 6 -

    -

    - 4 -

    -

    - 2 -

    -

    - 2 -

    -

    - 4 -

    -

    - 6 -

    -

    - 8 -

    -

    - 10 -

    -

    - Concentration (g) -

    -

    - Fig. 5. Langmuir adsorption isotherm of ES. -

    -
    - O. Sanni, A.P.I. Popoola / Data in Brief 22 (2019) 451–457 -
    -

    - Fig. 6. SEM/EDX image of as-received stainless steel. -

    -

    - Fig. 7. SEM/EDX image of stainless steel immersed in 0.5 M H2SO4 solution without inhibitor. -

    -

    - Fig. 8. SEM/EDX image of stainless steel immersed in 0.5 M H2SO4 solution with the presence of inhibitor. -

    -

    - 455 -

    -

    - 456 -

    -
    - O. Sanni, A.P.I. Popoola / Data in Brief 22 (2019) 451–457 -
    -
  • - 2. Experimental design, materials and methods -
  • -

    - 2.1. Material -

    -

    - Austenitic stainless steel Type 316 was used in this study with chemical composition reported in [1,2]. The chemicals used were of annular grade. The inhibitor concentrations are in the range of 2, 4, 6, 8 and 10 g [3–5]. The structural formula of egg shell powder is shown in Fig. 9. -

    -

    - Fig. 9. Chemical structure of egg shell powder. -

    -

    - 2.2. Weight loss method -

    -

    - This physical measurement was carried out in order to provide direct result on how the corrosive environment affects the test sample. The cleaned and weighed specimen was suspended in beakers with the aid of glass hooks and rods with the test solution of ES at different concentration (2, 4, 6, 8 and 10 g). The pre-weighed specimen was retrieved from the test solution after every 24 h, cleaned, dried and reweighed. The difference between the weight at a given time and the initial weight of the specimen was taken as the weight loss which was used to calculate corrosion rate and inhibition efficiency. -

    -

    - The corrosion rate (CR) was calculated using Eq. (1) [1–5] -

    -

    - Corrosion rate $(CR) = \left(\dfrac{87.6\,W}{D\,A\,T}\right)$ -

    -

    - where: W is weight loss in mg, A is specimen surface area, T is immersion period in hours and D is the specimen density. From the corrosion rate, the surface coverage (θ) and inhibition efficiencies (IE %) were determined using Eqs. (2) and (3) respectively -

    -

    - $\theta = \dfrac{CR_o - CR}{CR_o}$ -

    -

    - $IE\,(\%) = \dfrac{CR_o - CR}{CR_o} \times \dfrac{100}{1}$ -

    -

    - where: CRo and CR are the corrosion rate in absence and presence of inhibitor respectively. -

    -

    - 2.3. Potentiodynamic polarization method -

    -

    - The potentiodynamic polarization method was performed on the prepared test samples immersed in 0.5 M H2SO4 solution in the presence and absence of different ES concentrations. A three electrode system was used; stainless steel Type 316 plate as working electrode with an exposed area of 1.0 cm2, platinum rod as counter electrode and silver chloride electrode as reference electrode. The electrode was polished, degreased in acetone and thoroughly rinsed with distilled water before the experiment. Current density against applied potential was plotted. The slope of the linear part in anodic and cathodic plots gives anodic and cathodic constants according to the Stern–Geary equation, and the -

    -

    - (1) -

    -

    - (2) -

    -

    - (3) -

    -
    - O. Sanni, A.P.I. Popoola / Data in Brief 22 (2019) 451–457 -
    -

    - steps of the linear polarization plot are substituted to get corrosion current. Nova software was used with linear polarization resistance (LPR) and the current was set to 10 mA (maximum) and 10 nA (minimum). LSV staircase parameter start potential −1.5 v, step potential 0.001 m/s and stop potential of +1.5 v set was used in this study. -

    -

    - Acknowledgements -

    -

    - This work was supported by the National Research Foundation of South Africa and the Tshwane -

    -

    - University of Technology Pretoria South Africa. -

    -

    - Transparency document. Supporting information -

    -

    - Transparency document associated with this article can be found in the online version at https://doi. -

    -

    - org/10.1016/j.dib.2018.11.134. -

    -

    - References -

    -

    - [1] O. Sanni, A.P.I. Popoola, O.S.I. Fayomi, Enhanced corrosion resistance of stainless steel type 316 in sulphuric acid solution -

    -

    - using eco-friendly waste product, Results Phys. 9 (2018) 225–230. -

    -

    - [2] O. Sanni, A.P.I. Popoola, A. Kolesnikov, Constitutive modeling for prediction of optimal process parameters in corrosion -

    -

    - inhibition of austenitic stainless steel (Type 316)/acidic medium, Mater. Res. Express. 5 (10) (2018) 1–15. -

    -

    - [3] O. Sanni, A.P.I. Popoola, O.S.I. Fayomi, The inhibitive study of egg shell powder on UNS N08904 austenitic stainless steel -

    -

    - corrosion in chloride solution, Def. Technol. 14 (2018) 463–468. -

    -

    - [4] O. Sanni, A.P.I. Popoola, O.S.I. Fayomi, C.A. Loto, A comparative study of inhibitive effect of waste product on stainless steel corrosion in sodium chloride/sulfuric acid environments, Metallogr. Microstruct. Anal. (2018) 1–17. https://doi.org/10.1007/ s13632-018-0495-5. -

    -

    - [5] O. Sanni, A.P.I. Popoola, O.S.I. Fayomi, Inhibition of engineering material in sulphuric acid solution using waste product, Contributed Papers from Materials Science and Technology (MS&T18), 2018. 〈https://doi.org/10.7449/2018/MST_2018_254_261〉. -

    -

    - 457 -

    - - diff --git a/test_unstructured_ingest/expected-structured-output-html/pdf-fast-reprocess/biomed-api/75/29/main.PMC6312793.pdf.html b/test_unstructured_ingest/expected-structured-output-html/pdf-fast-reprocess/biomed-api/75/29/main.PMC6312793.pdf.html deleted file mode 100644 index 9c6a0058ce..0000000000 --- a/test_unstructured_ingest/expected-structured-output-html/pdf-fast-reprocess/biomed-api/75/29/main.PMC6312793.pdf.html +++ /dev/null @@ -1,329 +0,0 @@ - - - - - - Codestin Search App - - -
    - Data in Brief 22 (2019) 484–487 -
    -

    - Contents lists available at ScienceDirect -

    -

    - Data in Brief -

    -

    - journal homepage: www.elsevier.com/locate/dib -

    -

    - Data Article -

    -

    - A benchmark dataset for the multiple depot vehicle scheduling problem -

    -

    - Sarang Kulkarni a,b,c,n, Mohan Krishnamoorthy d,e, Abhiram Ranade f, Andreas T. Ernst c, Rahul Patil b -

    -

    - a IITB-Monash Research Academy, IIT Bombay, Powai, Mumbai 400076, India b SJM School of Management, IIT Bombay, Powai, Mumbai 400076, India c School of Mathematical Sciences, Monash University, Clayton, VIC 3800, Australia d Department of Mechanical and Aerospace Engineering, Monash University, Clayton, VIC 3800, Australia e School of Information Technology and Electrical Engineering, The University of Queensland, QLD 4072, Australia f Department of Computer Science and Engineering, IIT Bombay, Powai, Mumbai 400076, India -

    -

    - a r t i c l e i n f o -

    -

    - a b s t r a c t -

    -

    - Article history: Received 21 November 2018 Received in revised form 13 December 2018 Accepted 15 December 2018 Available online 18 December 2018 -

    -

    - This data article presents a description of a benchmark dataset for the multiple depot vehicle scheduling problem (MDVSP). The MDVSP is to assign vehicles from different depots to timetabled trips to minimize the total cost of empty travel and waiting. The dataset has been developed to evaluate the heuristics of the MDVSP that are presented in “A new formulation and a column generation-based heuristic for the multiple depot vehicle scheduling problem” (Kulkarni et al., 2018). The dataset contains 60 problem instances of varying size. Researchers can use the dataset to evaluate the future algorithms for the MDVSP and compare the performance with the existing algorithms. The dataset includes a program that can be used to generate new problem instances of the MDVSP. -

    -

    - & 2018 Published by Elsevier Inc. This is an open access article under the CC BY-NC-ND license (http://creativecommons.org/licenses/by-nc-nd/4.0/). -

    -

    - DOI of original article: https://doi.org/10.1016/j.trb.2018.11.007 n Corresponding author at: IITB-Monash Research Academy, IIT Bombay, Powai, Mumbai 400076, India. -

    -

    - E-mail address: sarangkulkarni@iitb.ac.in (S. Kulkarni). -

    -

    - https://doi.org/10.1016/j.dib.2018.12.055 2352-3409/& 2018 Published by Elsevier Inc. This is an open access article under the CC BY-NC-ND license (http://creativecommons.org/licenses/by-nc-nd/4.0/). -

    -
    - S. Kulkarni et al. / Data in Brief 22 (2019) 484–487 -
    -

    - 485 -

    -

    - Specifications table -

    -

    - Subject area Operations research More specific subject area Vehicle scheduling Type of data How data were acquired -

    -

    - Tables, text files Artificially generated by a C++ program on Intel® Xeon® CPU E5–2670 v2 with Linux operating system. Raw Sixty randomly generated instances of the MDVSP with the number of depots in (8,12,16) and the number of trips in (1500, 2000, 2500, 3000) Randomly generated instances IITB-Monash Research Academy, IIT Bombay, Powai, Mumbai, India. Data can be downloaded from https://orlib.uqcloud.net/ Kulkarni, S., Krishnamoorthy, M., Ranade, A., Ernst, A.T. and Patil, R., 2018. A new formulation and a column generation-based heuristic for the multiple depot vehicle scheduling problem. Transportation Research Part B: Methodological, 118, pp. 457–487 [3]. -

    -

    - Data format Experimental factors -

    -

    - Experimental features Data source location Data accessibility Related research article -

    -

    - Value of the data -

    -

    - • The dataset contains 60 different problem instances of the MDVSP that can be used to evaluate the -

    -

    - performance of the algorithms for the MDVSP. -

    -

    - • The data provide all the information that is required to model the MDVSP by using the existing -

    -

    - mathematical formulations. -

    -

    - • All the problem instances are available for use without any restrictions. • The benchmark solutions and solution time for the problem instances are presented in [3] and can -

    -

    - be used for the comparison. -

    -

    - • The dataset includes a program that can generate similar problem instances of different sizes. -

    -
  • - 1. Data -
  • -

    - The dataset contains 60 different problem instances of the multiple depot vehicle scheduling problem (MDVSP). Each problem instance is provided in a separate file. Each file is named as ‘RN-m-n-k.dat’, where ‘m’, ‘n’, and ‘k’ denote the number of depots, the number of trips, and the instance number for the size, ‘$(m, n)$’, respectively. For example, the problem instance, ‘RN-8-1500-01.dat’, is the first problem instance with 8 depots and 1500 trips. For the number of depots, m, we used three values, 8, 12, and 16. The four values for the number of trips, n, are 1500, 2000, 2500, and 3000. For each size, $(m, n)$, five instances are provided. The dataset can be downloaded from https://orlib.uqcloud.net. -

    -

    - For each problem instance, the following information is provided: The number of depots $(m)$, The number of trips $(n)$, The number of locations $(l)$, The number of vehicles at each depot, For each trip $i \in 1, 2, \ldots, n$, a start time, $t_i^s$, a start location, $l_i^s$, an end time, $t_i^e$, and an end location, $l_i^e$, and -

    -

    - • The travel time, $\delta_{ij}$, between any two locations $i, j \in 1, \ldots, l$. -

    -

    - All times are in minutes and integers. The planning duration is from 5 a.m. to around midnight. Each instance has two classes of trips, short trips and long trips, with 40% short trips and 60% long trips. The duration of a short trip is less than a total of 45 min and the travel time between the start -

    -

    - 486 -

    -
    - S. Kulkarni et al. / Data in Brief 22 (2019) 484–487 -
    -

    - and end location of the trip. A long trip is about 3–5 h in duration and has the same start and end location. For all instances, $m \le l$ and the locations $1, \ldots, m$ correspond to depots, while the remaining locations only appear as trip start and end locations. -

    -

    - A trip $j$ can be covered after trip $i$ by the same vehicle, if $t_j^s \ge t_i^e + \delta_{l_i^e l_j^s}$. If $l_i^e \ne l_j^s$, the vehicle must travel empty from $l_i^e$ to $l_j^s$, otherwise, the vehicle may require waiting at $l_i^e$ for the duration of $(t_j^s - t_i^e)$. A schedule is given by the sequence in which a vehicle can cover the trips. The MDVSP is to determine the minimum number of schedules to cover all trips that minimizes total time in waiting and empty travel. The following requirements must be satisfied: -

    -
  • - 1. Each schedule should start and end at the same depot. 2. Each trip should be covered by only one vehicle. 3. The number of schedules that start from a depot should not exceed the number of vehicles at the depot. -
  • -

    - A sufficient number of vehicles are provided to maintain the feasibility of an instance. For each instance size $(m, n)$, Table 1 provides the average of the number of locations, the number of times, the number of vehicles, and the number of possible empty travels, over five instances. The number of locations includes m distinct locations for depots and the number of locations at which various trips start or end. The number of times includes the start and the end time of the planning horizon and the start/end times for the trips. The number of vehicles is the total number of vehicles from all the depots. The number of possible empty travels is the number of possible connections between trips that require a vehicle travelling empty between two consecutive trips in a schedule. -

    -

    - The description of the file for each problem instance is presented in Table 2. The first line in the file provides the number of depots $(m)$, the number of trips, $(n)$, and the number of locations $(l)$, in the problem instance. The next $n$ lines present the information for $n$ trips. Each line corresponds to a trip, $i \in \{1, \ldots, n\}$, and provides the start location, the start time, the end location, and the end time of trip $i$. The next $l$ lines present the travel times between any two locations, $i, j \in \{1, \ldots, l\}$. -

    -

    - The dataset also includes a program ‘GenerateInstance.cpp’ that can be used to generate new instances. The program takes three inputs, the number of depots $(m)$, the number of trips $(n)$, and the number of instances for each size $(m, n)$. -

    -

    - Table 1 Average number of locations, times, vehicles and empty travels for each instance size. -

    -

    - Instance size (m, n) -

    -

    - Average number of -

    -

    - Locations -

    -

    - Times -

    -

    - Vehicles -

    -

    - Possible empty travels -

    -

    - (8, 1500) (8, 2000) (8, 2500) (8, 3000) (12, 1500) (12, 2000) (12, 2500) (12, 3000) (16, 1500) (16, 2000) (16, 2500) (16, 3000) -

    -

    - 568.40 672.80 923.40 977.00 566.00 732.60 875.00 1119.60 581.80 778.00 879.00 1087.20 -

    -

    - 975.20 1048.00 1078.00 1113.20 994.00 1040.60 1081.00 1107.40 985.40 1040.60 1083.20 1101.60 -

    -

    - 652.20 857.20 1082.40 1272.80 642.00 861.20 1096.00 1286.20 667.80 872.40 1076.40 1284.60 -

    -

    - 668,279.40 1,195,844.80 1,866,175.20 2,705,617.00 674,191.00 1,199,659.80 1,878,745.20 2,711,180.40 673,585.80 1,200,560.80 1,879,387.00 2,684,983.60 -

    -
    - S. Kulkarni et al. / Data in Brief 22 (2019) 484–487 -
    -

    - Table 2 Description of file format for each problem instance. -

    -

    - Number of lines -

    -

    - Number of columns in each line -

    -

    - Description -

    -

    - 1 1 n -

    -

    - l -

    -

    - 3 m 4 -

    -

    - l -

    -

    - The number of depots, the number of trips, and the number of locations. The number of vehicles $r_d$ at each depot $d$. One line for each trip, $i = 1, 2, \ldots, n$. Each line provides the start location $l_i^s$, the start time $t_i^s$, the end location $l_i^e$, and the end time $t_i^e$ for the corresponding trip. Each element, $\delta_{ij}$, where $i, j \in 1, 2, \ldots, l$, refers to the travel time between location $i$ and location $j$. -

    -
  • - 2. Experimental design, materials, and methods -
  • -

    - The procedure presented by Carpaneto et al. in [1] is used to generate the problem instances. The same procedure has been used by Pepin et al. in [4] to generate the benchmark dataset of the MDVSP. A detailed description of the procedure is presented in [3]. -

    -

    - Our dataset provides start/end location and time of trips as well as the travel time between any two locations. The location and time information is required to model the MDVSP on a time-space network. The feasible connections and the cost of connections between the trips can be obtained as discussed in [3]. Thus, the dataset has all the information that is required to model the MDVSP on the time-space network (see [2]) as well as the connection-network (see [5]). The benchmark solutions for all the problem instances are presented in [3]. -

    -

    - Transparency document. Supporting information -

    -

    - Transparency document associated with this article can be found in the online version at https://doi. -

    -

    - org/10.1016/j.dib.2018.12.055. -

    -

    - References -

    -

    - [1] G. Carpaneto, M. Dell'Amico, M. Fischetti, P. Toth, A branch and bound algorithm for the multiple depot vehicle scheduling -

    -

    - problem, Networks 19 (5) (1989) 531–548. -

    -

    - [2] N. Kliewer, T. Mellouli, L. Suhl, A time–space network based exact optimization model for multi-depot bus scheduling, Eur. -

    -

    - J. Oper. Res. 175 (3) (2006) 1616–1627. -

    -

    - [3] S. Kulkarni, M. Krishnamoorthy, A. Ranade, A.T. Ernst, R. Patil, A new formulation and a column generation-based heuristic -

    -

    - for the multiple depot vehicle scheduling problem, Transp. Res. Part B Methodol. 118 (2018) 457–487. -

    -

    - [4] A.S. Pepin, G. Desaulniers, A. Hertz, D. Huisman, A comparison of five heuristics for the multiple depot vehicle scheduling -

    -

    - problem, J. Sched. 12 (1) (2009) 17. -

    -

    - [5] C.C. Ribeiro, F. Soumis, A column generation approach to the multiple-depot vehicle scheduling problem, Oper. Res. 42 (1) -

    -

    - (1994) 41–52. -

    -

    - 487 -

    - - diff --git a/test_unstructured_ingest/expected-structured-output-html/pdf-fast-reprocess/biomed-path/07/07/sbaa031.073.PMC7234218.pdf.html b/test_unstructured_ingest/expected-structured-output-html/pdf-fast-reprocess/biomed-path/07/07/sbaa031.073.PMC7234218.pdf.html deleted file mode 100644 index fc5c096764..0000000000 --- a/test_unstructured_ingest/expected-structured-output-html/pdf-fast-reprocess/biomed-path/07/07/sbaa031.073.PMC7234218.pdf.html +++ /dev/null @@ -1,53 +0,0 @@ - - - - - - Codestin Search App - - -
    - S32 -
    -

    - ns; 40 mg/day=3.6%, p<0.05; 80 mg/day=4.9%, p<0.01; 120 mg/day=9.3%, p<0.001, PM dosing group: 20 mg/day=-0.4%, ns; 40 mg/day=2.8%, p<0.05: 80 mg/day=0.2%, ns; 160 mg/day=5.8%, p<0.05). There was no clear dose-dependent trend associated with nausea and RD was similar between AM and PM dosing group (AM dosing group: 20 mg/ day=0.2% ns; 40 mg/day=3.8%, p<0.05; 80 mg/day=3.8%, ns; 120 mg/ day=6.6%, ns, PM dosing group: 20 mg/day=-1.6%, ns; 40 mg/day=-1.7%, ns; 80 mg/day=5.5%, p<0.01; 160 mg/day=2.8%, ns). Discussion: The risk of adverse events in the treatment of schizophrenia with lurasidone can vary depending on the timing of administration. In particular, for akathisia and somnolence, the incidence risks were reduced when lurasidone was administered in PM. Unlike with AM administration, the dose-dependence in the risks of these adverse events were not observed in lurasidone PM administration. The timing of lurasidone administration could be considered in effort to minimize potential adverse events. -

    -

    - S6. SLEEP ENDOPHENOTYPES OF SCHIZOPHRENIA: A HIGH-DENSITY EEG STUDY IN DRUG-NAÏVE, FIRST EPISODE PSYCHOSIS PATIENTS -

    -

    - Anna Castelnovo1, Cecilia Casetta2, Francesco Donati3, Renata del Giudice3, Caroline Zangani3, Simone Sarasso3, Armando D’Agostino*3 1Faculty of Biomedical Sciences, Università della Svizzera Italiana, Switzerland; 2Institute of Psychiatry, Psychology and Neuroscience, King’s College London, England; 3Università degli Studi di Milano, Italy -

    -

    - Background: Slow waves, the hallmark of the deep nonrapid eye move- ment sleep electroencephalogram (EEG), are critical for restorative sleep and brain plasticity. They arise from the synchronous depolarization and hyperpolarization of millions of cortical neurons and their proper gen- eration and propagation relies upon the integrity of widespread cortico- thalamic networks. Slow wave abnormalities have been reported in patient with Schizophrenia, although with partially contradictory results, probably related to antipsychotic and sedative medications. Recently, their presence and delineation, have been convincingly shown in first-episode psychosis patients (FEP). However, clear evidence of this biomarker at the onset of the disease, prior to any psychopharmacological intervention, remains limited. Moreover, no attempt has been made to elucidate the prognostic meaning of this finding. Methods: We collected whole night sleep high–density electroencephalog- raphy recordings (64-channel BrainAmp, Brain Products GmbH, Gilching, Germany) in 20 drug-naive FEP patients and 20 healthy control subjects (HC). Several clinical psychometric scales as well as neurocognitive tests were administered to all subjects in order to better define psychopatholog- ical status and vulnerability. EEG slow wave activity (SWA, spectral power between 1 and 4 Hz) and several slow wave parameters were computed at each electrode location, including density and amplitude, at each electrode location. Along with a group analysis between FEP and HC, a subgroup analysis was also computed between patients who showed a progression of symptoms to full-blown Schizophrenia (SCZ, n = 10) over the next 12-month follow-up and those who did not (OTH, n = 10). Results: Sleep macro-architecture was globally preserved in FEP patients. SWA (1–4 Hz) was lower in FEP compared to HC but this difference didn’t reach statistical significance. 
Slow wave density was decreased in FEP compared to HC, with a significance that survived multiple comparison correction over a large fronto-central cluster. Mean amplitude was pre- served. At the subgroup analysis, these results were largely driven by the subgroup of patients with a confirmed diagnosis of SCZ at a 12-month fol- low-up. Indeed, no difference could be found between OTH and HC, while a strong significance was still evident between SCZ and HC. -

    - -
    - Poster Session I -
    -

    - Discussion: Our data confirm previous findings on reduced slow wave density in FEP, and expand them to acute subjects, before any treatment is prescribed. This is in line with available data on diffuse abnormalities of cortico-cortical and cortico-thalamic networks in these patients. Interestingly, our data also offer preliminary evidence that this deficit is specific for SCZ, as it appears to differentiate patients who developed SCZ from those with other diagnoses at follow-up. Given the traveling properties of slow waves, future research should establish their potential as markers of connectivity in SCZ. -

    -

    - S7. INVESTIGATING THE LINK BETWEEN THE PERIPHERAL ENDOCANNABINOID SYSTEM AND CENTRAL GLUTAMATERGIC NEUROTRANSMISSION IN EARLY PSYCHOSIS: A 7T-MRS STUDY -

    -

    - Amedeo Minichino*1, Beata Godlewska1, Philip Cowen1, Philip Burnet1, Belinda Lennox1 1University of Oxford -

    -

    - Background: Meta-analytic evidence showed increased levels of periph- eral endocannabinoid metabolites in psychotic illness. Alterations in the endocannabinoid system are believed to compromise glutamate and do- pamine transmission, which play a central role in pathophysiological models of psychosis. I will present preliminary data from an ongoing high-field proton magnetic resonance spectroscopy (MRS) study aimed at investigating the association between peripheral levels of endocannabinoid system metabolites and central glutamate metabolism in individuals at their first non-affective psychotic episode (NA-FEP) and healthy controls. Methods: We expect to recruit 17 NA-FEP and 20 healthy controls by January 2020. Currently, we recruited 12 NA-FEP and 18 healthy controls from two different research facilities (Imperial College London and University of Oxford) as part of a cross-sectional study. Participants un- derwent MRS scanning at 7-T with voxels placed in right dorsolateral prefrontal cortex (right-DLPFC), anterior cingulate cortex (ACC), and oc- cipital cortex. Neuro-metabolites will be calculated using the unsuppressed water signal as reference. Endocannabinoid metabolites were quantified from serum samples, collected during the same imaging session. Results: Analyses are ongoing. Based on previous evidence, expected findings are: (i) reduced glutamate levels in the ACC and right-DLPFC of NA-FEP compared to controls; (ii) increased peripheral endocannabinoid metabolites in NA-FEP compared to controls; and (iii) inverse association between peripheral endocannabinoid metabolites and glutamate levels in ACC and right-DLPFC in NA-FEP Discussion: This study will help clarifying the contribution of peripheral endocannabinoid system to central brain mechanisms of key relevance for psychotic illness. It will also add further evidence on the limited literature on high-resolution characterisation of brain metabolites in early psychosis. 
Strengths of the study include: (i) use of high-field MRS, which allows the estimation of glutamate-related compounds at higher precision than at lower field strength; (ii) reduced heterogeneity of the clinical sample (only male and NA-FEP). Limitations: small sample size and cross-sectional design. -

    -

    - S8. GRIN1 PROMOTER METHYLATION CHANGES IN BLOOD OF EARLY-ONSET PSYCHOTIC PATIENTS AND UNAFFECTED SIBLINGS WITH CHILDHOOD TRAUMA -

    -

    - Camila Loureiro*1, Corsi-Zuelli Fabiana1, Fachim Helene Aparecida1, Shuhama Rosana1, Menezes Paulo Rossi1, Dalton Caroline F2, -

    -

    - AQ3 -

    - - From 604c4a7c5e06381f6480c811bcd9caea949a1366 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Po=C5=82om?= <124889668+mpolomdeepsense@users.noreply.github.com> Date: Tue, 29 Apr 2025 15:29:58 +0200 Subject: [PATCH 13/40] fix: failing build (#3993) Successful build and test: https://github.com/Unstructured-IO/unstructured/actions/runs/14730300234/job/41342657532 Failing test_json_to_html CI job fix here: https://github.com/Unstructured-IO/unstructured/pull/3992 --- scripts/docker-smoke-test.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/docker-smoke-test.sh b/scripts/docker-smoke-test.sh index 1d0950e923..0e66e05ae4 100755 --- a/scripts/docker-smoke-test.sh +++ b/scripts/docker-smoke-test.sh @@ -41,7 +41,7 @@ await_container docker cp test_unstructured_ingest $CONTAINER_NAME:/app docker cp requirements/ingest $CONTAINER_NAME:/app/requirements/ingest docker exec -u root "$CONTAINER_NAME" /bin/bash -c "chown -R notebook-user:notebook-user /app/test_unstructured_ingest" -docker exec "$CONTAINER_NAME" /bin/bash -c "/app/test_unstructured_ingest/src/wikipedia.sh" +docker exec "$CONTAINER_NAME" /bin/bash -c "/app/test_unstructured_ingest/src/local.sh" result=$? exit $result From b814ece39f5a66e459f4ddcd21d7dafa882bb572 Mon Sep 17 00:00:00 2001 From: Yao You Date: Mon, 5 May 2025 13:08:11 -0500 Subject: [PATCH 14/40] fix: properly handle the case when an element's text is None (#3995) Some elements, like `Image`, can have `None` as its `text` attribute's value. In that case current chunking logic fails because it expects the field to always have a length or can be split. The fix is to update the logic as `element.text or ""` for checking length and add flow control to early exit to avoid calling split on `None`. 
--- CHANGELOG.md | 5 +++-- test_unstructured/chunking/test_base.py | 11 +++++++++++ unstructured/__version__.py | 2 +- unstructured/chunking/base.py | 4 +++- 4 files changed, 18 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ad5dea531f..a9b4c3ca53 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.17.6-dev2 +## 0.17.6 ### Enhancements @@ -10,6 +10,7 @@ Two executions of the same code, on the same file, produce different results. Th This makes it impossible to write stable unit tests, for example, or to obtain reproducible results. - **Do not use NLP to determine element types for extracted elements with hi_res.** This avoids extraneous Title elements in hi_res outputs. This only applies to *extracted* elements, meaning text objects that are found outside of Object Detection objects which get mapped to *inferred* elements. (*extracted* and *inferred* elements get merged together to form the list of `Element`s returned by `pdf_partition()`) - Resolve open CVEs +- Properly handle the case when an element's `text` attribute is None ## 0.17.5 @@ -48,7 +49,7 @@ This makes it impossible to write stable unit tests, for example, or to obtain r ### Features ### Fixes -- **Fixes wrong detection of office files** certain office files wrongly identified as .ZIP when office(.docx,.xlsx and .pptx) files containing files other than word/document.xml, xl/workbook.xml and ppt/presentation.xml respectively will now be identified correctly by looking for word/document\*.xml, xl/workbook\*.xml and ppt/presentation\*.xml +- **Fixes wrong detection of office files** certain office files wrongly identified as .ZIP when office(.docx,.xlsx and .pptx) files containing files other than word/document.xml, xl/workbook.xml and ppt/presentation.xml respectively will now be identified correctly by looking for word/document\*.xml, xl/workbook\*.xml and ppt/presentation\*.xml ## 0.17.2 diff --git a/test_unstructured/chunking/test_base.py 
b/test_unstructured/chunking/test_base.py index f63e738a7c..ffaa699cac 100644 --- a/test_unstructured/chunking/test_base.py +++ b/test_unstructured/chunking/test_base.py @@ -31,6 +31,7 @@ CompositeElement, Element, ElementMetadata, + Image, PageBreak, Table, TableChunk, @@ -234,6 +235,10 @@ def it_accumulates_elements_added_to_it(self): assert builder._text_length == 112 assert builder._remaining_space == 36 + def it_will_fit_when_element_has_none_as_text(self): + builder = PreChunkBuilder(opts=ChunkingOptions()) + assert builder.will_fit(Image(None)) + def it_will_fit_an_oversized_element_when_empty(self): builder = PreChunkBuilder(opts=ChunkingOptions()) assert builder.will_fit(Text("abcd " * 200)) @@ -405,6 +410,12 @@ def and_it_knows_it_is_NOT_equal_to_an_object_that_is_not_a_PreChunk(self): pre_chunk = PreChunk([], overlap_prefix="", opts=ChunkingOptions()) assert pre_chunk != 42 + def it_can_handle_element_with_none_as_text(self): + pre_chunk = PreChunk( + [Image(None), Text("hello")], overlap_prefix="", opts=ChunkingOptions() + ) + assert pre_chunk._text == "hello" + @pytest.mark.parametrize( ("max_characters", "combine_text_under_n_chars", "expected_value"), [ diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 657c99ab3b..29149d1540 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.17.6-dev2" # pragma: no cover +__version__ = "0.17.6" # pragma: no cover diff --git a/unstructured/chunking/base.py b/unstructured/chunking/base.py index 695393c55c..17ece85a47 100644 --- a/unstructured/chunking/base.py +++ b/unstructured/chunking/base.py @@ -387,7 +387,7 @@ def will_fit(self, element: Element) -> bool: if self._text_length > self._opts.soft_max: return False # -- don't add an element if it would increase total size beyond the hard-max -- - return not self._remaining_space < len(element.text) + return not self._remaining_space < len(element.text or "") @property def 
_remaining_space(self) -> int: @@ -503,6 +503,8 @@ def _iter_text_segments(self) -> Iterator[str]: if self._overlap_prefix: yield self._overlap_prefix for e in self._elements: + if e.text is None: + continue text = " ".join(e.text.strip().split()) if not text: continue From e3417d7e98b8ffba47ed75c65be6cff3fc465764 Mon Sep 17 00:00:00 2001 From: Austin Walker Date: Thu, 8 May 2025 17:57:05 -0400 Subject: [PATCH 15/40] fix: Fix for Pillow error when extracting PNG images (#3998) When I tried to partition a PNG file and extract images, I got an error from Pillow: ``` WARNING unstructured:pdf_image_utils.py:230 Image Extraction Error: Skipping the failed image Traceback (most recent call last): File "/Users/austin/.pyenv/versions/unstructured/lib/python3.10/site-packages/PIL/JpegImagePlugin.py", line 666, in _save rawmode = RAWMODE[im.mode] KeyError: 'RGBA' ``` The issue is that a PNG has an additional layer that cannot be saved off in jpeg format. We can fix this with a quick conversion. I added a png test case that is now passing with this fix. --- CHANGELOG.md | 9 +++++++++ .../partition/pdf_image/test_pdf_image_utils.py | 1 + unstructured/__version__.py | 2 +- unstructured/partition/pdf_image/pdf_image_utils.py | 5 +++++ 4 files changed, 16 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a9b4c3ca53..20a4bcaf71 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,12 @@ +## 0.17.7-dev0 + +### Enhancements + +### Features + +### Fixes +- **Fix image extraction for PNG files.** When `extract_image_block_to_payload` is True, and the image is a PNG, we get a Pillow error. We need to remove the PNG transparency layer before saving the image. 
+ ## 0.17.6 ### Enhancements diff --git a/test_unstructured/partition/pdf_image/test_pdf_image_utils.py b/test_unstructured/partition/pdf_image/test_pdf_image_utils.py index bfb09b762a..1be79e92a0 100644 --- a/test_unstructured/partition/pdf_image/test_pdf_image_utils.py +++ b/test_unstructured/partition/pdf_image/test_pdf_image_utils.py @@ -73,6 +73,7 @@ def test_convert_pdf_to_image_raises_error(filename=example_doc_path("embedded-i [ (example_doc_path("pdf/layout-parser-paper-fast.pdf"), False), (example_doc_path("img/layout-parser-paper-fast.jpg"), True), + (example_doc_path("img/english-and-korean.png"), True), ], ) @pytest.mark.parametrize("element_category_to_save", [ElementType.IMAGE, ElementType.TABLE]) diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 29149d1540..d53993104e 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.17.6" # pragma: no cover +__version__ = "0.17.7-dev0" # pragma: no cover diff --git a/unstructured/partition/pdf_image/pdf_image_utils.py b/unstructured/partition/pdf_image/pdf_image_utils.py index a7e98aa2fa..4365b8dba5 100644 --- a/unstructured/partition/pdf_image/pdf_image_utils.py +++ b/unstructured/partition/pdf_image/pdf_image_utils.py @@ -204,6 +204,11 @@ def save_elements( image_path = image_paths[page_index] image = Image.open(image_path) cropped_image = image.crop(padded_bbox) + + # PNG images with transparency need to be converted before saving + if cropped_image.mode == "RGBA": + cropped_image = cropped_image.convert("RGB") + if extract_image_block_to_payload: buffered = BytesIO() cropped_image.save(buffered, format="JPEG") From 570ee078a4b5a6b82905133aa6517b828d6325e3 Mon Sep 17 00:00:00 2001 From: jordan-homan <90481160+jordan-homan@users.noreply.github.com> Date: Mon, 19 May 2025 14:24:44 -0400 Subject: [PATCH 16/40] fix: throw validation error when json is passed with invalid unstructured json (#4002) ### Notes Adds validation if `json` / 
`ndjson` are not valid unstructured schema. ### Testing Manually tested serverless API with example json: ``` test_length = [] = 200 test_invalid = [{"invalid": "schema"}] = 422 test_invalid_ndjson ={"hi": "there"} = 422 test_chunk = [{"type":"Header","element_id":"a23fdadef9277f217563e217ebd074d5" ... = 200 ``` --- CHANGELOG.md | 1 + test_unstructured/partition/test_json.py | 11 +++++++++++ test_unstructured/partition/test_ndjson.py | 16 ++++++++++++++-- unstructured/partition/json.py | 5 +++++ unstructured/partition/ndjson.py | 5 +++++ 5 files changed, 36 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 20a4bcaf71..4169fa951c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,7 @@ ### Fixes - **Fix image extraction for PNG files.** When `extract_image_block_to_payload` is True, and the image is a PNG, we get a Pillow error. We need to remove the PNG transparency layer before saving the image. +- **Throw validation error when json is passed with invalid unstructured json ## 0.17.6 diff --git a/test_unstructured/partition/test_json.py b/test_unstructured/partition/test_json.py index 7a591953d3..388cbb23c1 100644 --- a/test_unstructured/partition/test_json.py +++ b/test_unstructured/partition/test_json.py @@ -187,6 +187,11 @@ def test_partition_json_works_with_empty_string(): assert partition_json(text="") == [] +def test_partition_json_fails_with_empty_item(): + with pytest.raises(ValueError): + partition_json(text="{}") + + def test_partition_json_works_with_empty_list(): assert partition_json(text="[]") == [] @@ -288,6 +293,12 @@ def test_partition_json_from_text_prefers_metadata_last_modified(): # ------------------------------------------------------------------------------------------------ +def test_partition_json_raises_with_unprocessable_json_array(): + text = '[{"invalid": "schema"}]' + with pytest.raises(ValueError): + partition_json(text=text) + + def test_partition_json_raises_with_unprocessable_json(): # NOTE(robinson) 
- This is unprocessable because it is not a list of dicts, # per the Unstructured ISD format diff --git a/test_unstructured/partition/test_ndjson.py b/test_unstructured/partition/test_ndjson.py index c86ce1c8e6..3ac5aca98f 100644 --- a/test_unstructured/partition/test_ndjson.py +++ b/test_unstructured/partition/test_ndjson.py @@ -189,8 +189,14 @@ def test_partition_ndjson_works_with_empty_string(): assert partition_ndjson(text="") == [] -def test_partition_ndjson_works_with_empty_list(): - assert partition_ndjson(text="{}") == [] +def test_partition_ndjson_fails_with_empty_item(): + with pytest.raises(ValueError): + partition_ndjson(text="{}") + + +def test_partition_ndjson_fails_with_empty_list(): + with pytest.raises(ValueError): + partition_ndjson(text="[]") def test_partition_ndjson_raises_with_too_many_specified(): @@ -293,6 +299,12 @@ def test_partition_ndjson_from_text_prefers_metadata_last_modified(): # ------------------------------------------------------------------------------------------------ +def test_partition_json_raises_with_unprocessable_json(): + text = '{"invalid": "schema"}' + with pytest.raises(ValueError): + partition_ndjson(text=text) + + def test_partition_json_raises_with_invalid_json(): text = '[{"hi": "there"}]]' with pytest.raises(ValueError): diff --git a/unstructured/partition/json.py b/unstructured/partition/json.py index 4a900de38a..40654487db 100644 --- a/unstructured/partition/json.py +++ b/unstructured/partition/json.py @@ -74,6 +74,11 @@ def partition_json( try: element_dicts = json.loads(file_text) elements = elements_from_dicts(element_dicts) + # if we found at least one json element, but no unstructured elements were found, throw 422 + if len(element_dicts) > 0 and len(elements) == 0: + raise ValueError( + "JSON cannot be partitioned. 
Schema does not match the Unstructured schema.", + ) except json.JSONDecodeError: raise ValueError("Not a valid json") diff --git a/unstructured/partition/ndjson.py b/unstructured/partition/ndjson.py index 925e71e950..2f4d22343c 100644 --- a/unstructured/partition/ndjson.py +++ b/unstructured/partition/ndjson.py @@ -75,6 +75,11 @@ def partition_ndjson( try: element_dicts = ndjson_loads(file_text) elements = elements_from_dicts(element_dicts) + # if we found at least one json element, but no unstructured elements were found, throw 422 + if len(element_dicts) > 0 and len(elements) == 0: + raise ValueError( + "JSON cannot be partitioned. Schema does not match the Unstructured schema.", + ) except json.JSONDecodeError: raise ValueError("Not a valid ndjson") From 8be7108829a76e40f9f94cf69930542c21def869 Mon Sep 17 00:00:00 2001 From: Ronny H <138828701+ron-unstructured@users.noreply.github.com> Date: Tue, 20 May 2025 09:54:53 -0700 Subject: [PATCH 17/40] Replace Serverless API to Platform announcement on README page (#4003) Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> --- README.md | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 8fe773cd85..881ae55226 100644 --- a/README.md +++ b/README.md @@ -39,14 +39,9 @@ The `unstructured` library provides open-source components for ingesting and pre-processing images and text documents, such as PDFs, HTML, Word docs, and [many more](https://docs.unstructured.io/open-source/core-functionality/partitioning). The use cases of `unstructured` revolve around streamlining and optimizing the data processing workflow for LLMs. `unstructured` modular functions and connectors form a cohesive system that simplifies data ingestion and pre-processing, making it adaptable to different platforms and efficient in transforming unstructured data into structured outputs. 
-## Try the Unstructured Serverless API! - -Looking for better pre-processing performance and less setup? -Check out our new [Serverless API](https://unstructured.io/api-key-hosted)! -The Unstructured Serverless API is our most performant API yet, delivering a more responsive, -production-grade solution to better support your business and LLM needs. -Head to our [signup page](https://app.unstructured.io/) page to get started for -free. +## Try the Unstructured Platform Product + +Ready to move your data processing pipeline to production, and take advantage of advanced features? Check out [Unstructured Platform](https://unstructured.io/enterprise). In addition to better processing performance, take advantage of chunking, embedding, and image and table enrichment generation, all from a low code UI or an API. [Request a demo](https://unstructured.io/contact) from our sales team to learn more about how to get started. ## :eight_pointed_black_star: Quick Start From e42884a566e520e2378e172669cd0faf4f4b2225 Mon Sep 17 00:00:00 2001 From: Emmanuel Ferdman Date: Thu, 22 May 2025 20:53:42 +0300 Subject: [PATCH 18/40] fix: resolve warnings of logger library (#3999) # PR Summary This PR resolves the deprecation warnings of the `logger` library: ```python DeprecationWarning: The 'warn' method is deprecated, use 'warning' instead ``` --------- Signed-off-by: Emmanuel Ferdman Co-authored-by: cragwolfe --- CHANGELOG.md | 1 + unstructured/partition/api.py | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4169fa951c..94c2d5e2d1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,7 @@ ### Fixes - **Fix image extraction for PNG files.** When `extract_image_block_to_payload` is True, and the image is a PNG, we get a Pillow error. We need to remove the PNG transparency layer before saving the image. 
+- **Fix logger deprecation warning**: Replaced usage of `logger.warn` with `logger.warning` to comply with the Python logging standards. - **Throw validation error when json is passed with invalid unstructured json ## 0.17.6 diff --git a/unstructured/partition/api.py b/unstructured/partition/api.py index ee5ad4da8a..b5463f6364 100644 --- a/unstructured/partition/api.py +++ b/unstructured/partition/api.py @@ -86,7 +86,7 @@ def partition_via_api( if file_filename is not None: metadata_filename = file_filename - logger.warn( + logger.warning( "The file_filename kwarg will be deprecated in a future version of unstructured. " "Please use metadata_filename instead.", ) @@ -277,7 +277,7 @@ def partition_multiple_via_api( if file_filenames is not None: metadata_filenames = file_filenames - logger.warn( + logger.warning( "The file_filenames kwarg will be deprecated in a future version of unstructured. " "Please use metadata_filenames instead.", ) From 3a048a5a02e0e374dc5ef317f5037f72618f6d62 Mon Sep 17 00:00:00 2001 From: cragwolfe Date: Mon, 2 Jun 2025 15:21:17 -0700 Subject: [PATCH 19/40] chore: script to verify unstructured image outbound connectivity (#4008) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sample output. The key thing here is that the `offline` mode (meaning `HF_HUB_OFFLINE=1` and `DO_NOT_TRACK=true` are set) results in no outbound connections. This is also true if the locally cached models are removed (the last scenario, `offline-and-missing-models`). ``` $ ./test-all-outbound-connectivity-scenarios.sh >>> Removing leftover sut_* containers… Container: 543ac4b14370a18d790a2035e206e8c445754b825ec8b2887f4246f7404299c7 (scenario baseline) tcpdump running on interface eth0... >>> Running Python workload (capturing stdout/stderr)… [INFO] partitioning /app/example-docs/ideas-page.html Python finished.
Log saved to /r/unstructured/scripts/image/python-output/offline-and-missing-models.log pcap saved to /r/unstructured/scripts/image/pcaps/offline-and-missing-models.pcap ================================================================== ======================================== Begin Scenario: baseline ------------------------------------------- tshark output for baseline ------------------------------------------- IPv4 Conversations Filter: | <- | | -> | | Total | Relative | Duration | | Frames Bytes | | Frames Bytes | | Frames Bytes | Start | | 172.18.0.2 <-> 108.138.246.79 20 12 kB 20 4,176 bytes 40 16 kB 2.531247000 69.0419 172.18.0.2 <-> 3.214.154.119 11 5,777 bytes 12 2,656 bytes 23 8,433 bytes 0.029451000 0.4118 172.18.0.2 <-> 192.168.65.5 2 656 bytes 2 158 bytes 4 814 bytes 0.000000000 2.5310 ------------------------------------------ python log output for baseline ------------------------------------------ [INFO] partitioning /app/example-docs/ideas-page.html [INFO] partitioning /app/example-docs/category-level.docx [INFO] partitioning /app/example-docs/fake_table.docx [INFO] partitioning /app/example-docs/img/english-and-korean.png 2025-06-02 22:05:02,265 - matplotlib.font_manager - INFO - generated new fontManager 2025-06-02 22:05:02,356 - urllib3.connectionpool - DEBUG - Starting new HTTPS connection (1): huggingface.co:443 2025-06-02 22:05:02,497 - urllib3.connectionpool - DEBUG - https://huggingface.co:443 "HEAD /unstructuredio/yolo_x_layout/resolve/main/yolox_l0.05.onnx HTTP/1.1" 302 0 2025-06-02 22:05:02,613 - unstructured_inference - INFO - Reading image file: /app/example-docs/img/english-and-korean.png ... 2025-06-02 22:05:04,792 - unstructured_inference - INFO - Loading the Table agent ... 2025-06-02 22:05:04,792 - unstructured_inference - INFO - Loading the table structure model ... 
2025-06-02 22:05:04,877 - urllib3.connectionpool - DEBUG - https://huggingface.co:443 "HEAD /microsoft/table-transformer-structure-recognition/resolve/main/config.json HTTP/1.1" 200 0 2025-06-02 22:05:04,960 - urllib3.connectionpool - DEBUG - https://huggingface.co:443 "HEAD /microsoft/table-transformer-structure-recognition/resolve/main/config.json HTTP/1.1" 200 0 2025-06-02 22:05:04,970 - timm.models._builder - INFO - Loading pretrained weights from Hugging Face hub (timm/resnet18.a1_in1k) 2025-06-02 22:05:05,062 - urllib3.connectionpool - DEBUG - https://huggingface.co:443 "HEAD /timm/resnet18.a1_in1k/resolve/main/model.safetensors HTTP/1.1" 302 0 2025-06-02 22:05:05,065 - timm.models._hub - INFO - [timm/resnet18.a1_in1k] Safe alternative available for 'pytorch_model.bin' (as 'model.safetensors'). Loading weights using safetensors. 2025-06-02 22:05:05,071 - timm.models._builder - INFO - Missing keys (fc.weight, fc.bias) discovered while loading pretrained weights. This is expected if model is being adapted. [INFO] partitioning /app/example-docs/img/embedded-images-tables.jpg 2025-06-02 22:05:05,152 - unstructured_inference - INFO - Reading image file: /app/example-docs/img/embedded-images-tables.jpg ... [INFO] partitioning /app/example-docs/img/layout-parser-paper-with-table.jpg 2025-06-02 22:05:07,693 - unstructured_inference - INFO - Reading image file: /app/example-docs/img/layout-parser-paper-with-table.jpg ... [INFO] partitioning /app/example-docs/pdf/embedded-images-tables.pdf 2025-06-02 22:05:12,706 - pikepdf._core - INFO - pikepdf C++ to Python logger bridge initialized 2025-06-02 22:05:12,733 - unstructured_inference - INFO - Reading PDF for file: /app/example-docs/pdf/embedded-images-tables.pdf ... [INFO] partitioning /app/example-docs/pdf/all-number-table.pdf 2025-06-02 22:05:15,251 - unstructured_inference - INFO - Reading PDF for file: /app/example-docs/pdf/all-number-table.pdf ... 
[INFO] partitioning /app/example-docs/fake-power-point.pptx [INFO] partitioning /app/example-docs/stanley-cups.xlsx [INFO] partitioning /app/example-docs/fake-email-multiple-attachments.msg 2025-06-02 22:05:16,936 - unstructured_inference - INFO - Reading image file: /tmp/tmplkanlou1/unstructured_logo.png ... 2025-06-02 22:05:18,749 - unstructured_inference - INFO - Reading PDF for file: /tmp/tmpxdzdouhb/dense_doc.pdf ... ================================================================== ======================================== Begin Scenario: missing-models ------------------------------------------- tshark output for missing-models ------------------------------------------- IPv4 Conversations Filter: | <- | | -> | | Total | Relative | Duration | | Frames Bytes | | Frames Bytes | | Frames Bytes | Start | | 172.18.0.2 <-> 18.155.192.23 181834 273 MB 33502 1,813 kB 215336 275 MB 2.704106000 75.2880 172.18.0.2 <-> 3.168.86.41 79696 119 MB 15234 825 kB 94930 120 MB 9.066044000 68.9276 172.18.0.2 <-> 108.138.246.85 29 21 kB 25 5,760 bytes 54 27 kB 2.431857000 75.5633 172.18.0.2 <-> 3.214.154.119 12 5,831 bytes 12 2,656 bytes 24 8,487 bytes 0.016604000 0.3590 172.18.0.2 <-> 192.168.65.5 4 1,084 bytes 4 314 bytes 8 1,398 bytes 0.000000000 9.0651 ------------------------------------------ python log output for missing-models ------------------------------------------ [INFO] partitioning /app/example-docs/ideas-page.html [INFO] partitioning /app/example-docs/category-level.docx [INFO] partitioning /app/example-docs/fake_table.docx [INFO] partitioning /app/example-docs/img/english-and-korean.png 2025-06-02 22:06:30,961 - matplotlib.font_manager - INFO - generated new fontManager 2025-06-02 22:06:31,046 - urllib3.connectionpool - DEBUG - Starting new HTTPS connection (1): huggingface.co:443 2025-06-02 22:06:31,300 - urllib3.connectionpool - DEBUG - https://huggingface.co:443 "HEAD /unstructuredio/yolo_x_layout/resolve/main/yolox_l0.05.onnx HTTP/1.1" 302 0 2025-06-02 
22:06:31,310 - urllib3.connectionpool - DEBUG - Starting new HTTPS connection (1): cdn-lfs.hf.co:443 2025-06-02 22:06:31,439 - urllib3.connectionpool - DEBUG - https://cdn-lfs.hf.co:443 "GET /repos/d9/51/d951593388d0af1cb4a029c311ba19f9b05090d9acc4606c2b82588297ea4397/134301ca94fb0df8027be9a6dad1908fe6218af8ffa4d34f0819c7c2226195f3?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27yolox_l0.05.onnx%3B+filename%3D%22yolox_l0.05.onnx%22%3B&Expires=1748904676&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTc0ODkwNDY3Nn19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5oZi5jby9yZXBvcy9kOS81MS9kOTUxNTkzMzg4ZDBhZjFjYjRhMDI5YzMxMWJhMTlmOWIwNTA5MGQ5YWNjNDYwNmMyYjgyNTg4Mjk3ZWE0Mzk3LzEzNDMwMWNhOTRmYjBkZjgwMjdiZTlhNmRhZDE5MDhmZTYyMThhZjhmZmE0ZDM0ZjA4MTljN2MyMjI2MTk1ZjM~cmVzcG9uc2UtY29udGVudC1kaXNwb3NpdGlvbj0qIn1dfQ__&Signature=hxvwTzJynEvyE~UuirlH~L4c5Gc6rGksDp~Uw94ooayDrzshE2sDdHmvqgoQyzqxHHhZLjfiJlAGUtVO7nVAHSoqt8mH7H9yN51Zj5UGqI-odXtW1dmWCD3i7nwwNlrEEjlXlERkIScpIjpkJDnjwhzeE94l1s7gysIm8c6J8JTcDlsdMver5wAVrBtLSVUrDN8PC84xgOGerHVhX7-eZcUVG2OAIJHoB3s2gLPkW9aVM5fvCmmoXMPI9oCvgLUp-zhXv3cWHh~yURuY1ufoI4CFG5ogW8nV~V45qLlbRw9PrvfFoLS-wxBGDOhT3SRWVOJzRRmACByABGWYMXRFuw__&Key-Pair-Id=K3RPWS32NSSJCE HTTP/1.1" 200 216625723 2025-06-02 22:06:35,019 - unstructured_inference - INFO - Reading image file: /app/example-docs/img/english-and-korean.png ... 2025-06-02 22:06:37,188 - unstructured_inference - INFO - Loading the Table agent ... 2025-06-02 22:06:37,188 - unstructured_inference - INFO - Loading the table structure model ... 
2025-06-02 22:06:37,290 - urllib3.connectionpool - DEBUG - https://huggingface.co:443 "HEAD /microsoft/table-transformer-structure-recognition/resolve/main/config.json HTTP/1.1" 200 0 2025-06-02 22:06:37,375 - urllib3.connectionpool - DEBUG - https://huggingface.co:443 "GET /microsoft/table-transformer-structure-recognition/resolve/main/config.json HTTP/1.1" 200 1469 2025-06-02 22:06:37,484 - urllib3.connectionpool - DEBUG - https://huggingface.co:443 "HEAD /microsoft/table-transformer-structure-recognition/resolve/main/config.json HTTP/1.1" 200 0 2025-06-02 22:06:37,581 - urllib3.connectionpool - DEBUG - https://huggingface.co:443 "HEAD /microsoft/table-transformer-structure-recognition/resolve/main/model.safetensors HTTP/1.1" 302 0 Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet` 2025-06-02 22:06:37,586 - huggingface_hub.file_download - WARNING - Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. 
For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet` 2025-06-02 22:06:37,681 - urllib3.connectionpool - DEBUG - https://huggingface.co:443 "GET /microsoft/table-transformer-structure-recognition/resolve/main/model.safetensors HTTP/1.1" 302 1319 2025-06-02 22:06:37,685 - urllib3.connectionpool - DEBUG - Starting new HTTPS connection (1): cas-bridge.xethub.hf.co:443 2025-06-02 22:06:37,778 - urllib3.connectionpool - DEBUG - https://cas-bridge.xethub.hf.co:443 "GET /xet-bridge-us/634929bd8146350b3a4cadaf/e78778928a1863786d5bb22a109a7ff1dbac47a29eae6f223a1fc2689172c347?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Content-Sha256=UNSIGNED-PAYLOAD&X-Amz-Credential=cas%2F20250602%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20250602T220637Z&X-Amz-Expires=3600&X-Amz-Signature=c0a361e8982b1b05ee443054646b438e5a68d6767ef6df03dad6c5db20d0bdc5&X-Amz-SignedHeaders=host&X-Xet-Cas-Uid=public&response-content-disposition=inline%3B+filename*%3DUTF-8%27%27model.safetensors%3B+filename%3D%22model.safetensors%22%3B&x-id=GetObject&Expires=1748905597&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTc0ODkwNTU5N319LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2FzLWJyaWRnZS54ZXRodWIuaGYuY28veGV0LWJyaWRnZS11cy82MzQ5MjliZDgxNDYzNTBiM2E0Y2FkYWYvZTc4Nzc4OTI4YTE4NjM3ODZkNWJiMjJhMTA5YTdmZjFkYmFjNDdhMjllYWU2ZjIyM2ExZmMyNjg5MTcyYzM0NyoifV19&Signature=cRjZe56uJ8vxmmgRhPmp7XZX69PHKoXO9XN1bfq5n~84Vxz~HvCmg6MqtuUAFIiOWAHFhOuVzJpoiWTYT1JdZrtMeQTdywnZM-lIIn5Q45kzr8q8C58yvLz7vmKKrD9pOnGjJPaVavYYxEDdlAXbWf6xo433kKF4TfmQ9z7UIKt~M-XV9EdPUUBNhByucLVcTZ3sec5DqI4FmzK28fdJ1BMD4NyDjWW6hi~Lp2V3bW0FLCpI6qKGuikJ3E-OVcJDdDvZAqSN0-GoQyHIP9kp4RTqPBb7jekpZ3Uj91UWEmGx6YNuNlorAMGi61hrL6mAUUmW13OGua2vcJyk9LxZQg__&Key-Pair-Id=K2L8F4GPSG1IFC HTTP/1.1" 200 115434268 2025-06-02 22:06:39,612 - timm.models._builder - INFO - Loading pretrained weights from Hugging Face hub (timm/resnet18.a1_in1k) 2025-06-02 22:06:39,696 - urllib3.connectionpool - DEBUG - 
https://huggingface.co:443 "HEAD /timm/resnet18.a1_in1k/resolve/main/model.safetensors HTTP/1.1" 302 0 2025-06-02 22:06:39,714 - urllib3.connectionpool - DEBUG - https://cdn-lfs.hf.co:443 "GET /repos/42/d5/42d585781e0b74854ae52a1bc2a63d09896f1d70f86bff969f4c053508d6c2d6/80c49dee3da4822c009c5a7fe591e9223c5a2cfcf95a4067ca4dfb5a7b89c612?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27model.safetensors%3B+filename%3D%22model.safetensors%22%3B&Expires=1748904665&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTc0ODkwNDY2NX19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5oZi5jby9yZXBvcy80Mi9kNS80MmQ1ODU3ODFlMGI3NDg1NGFlNTJhMWJjMmE2M2QwOTg5NmYxZDcwZjg2YmZmOTY5ZjRjMDUzNTA4ZDZjMmQ2LzgwYzQ5ZGVlM2RhNDgyMmMwMDljNWE3ZmU1OTFlOTIyM2M1YTJjZmNmOTVhNDA2N2NhNGRmYjVhN2I4OWM2MTI~cmVzcG9uc2UtY29udGVudC1kaXNwb3NpdGlvbj0qIn1dfQ__&Signature=GL15CLiGsmHno-DP25kfcuObjbrjd~ir5C5xapGqb9lda~5Wjy-3axBPftr1xWUnKh24Ay0mS49U8ZOcEdQxmzxQ97HiSX0-8s0-H187hV6mId6uxsULOGkNtjpkMKhfxe0qIfAmfi9gxl9JdiVfG5367HfPDVST8NvGPqMuKYoywSNWA-Uby-L9qb~EjtxbH9v1H2g6C0i9t2mn8ghD8BtTWEn4LY9c4O5bI~EQatNToNjsQTKa18LzXEowZnODLSLkyE7beLzfEpuTX9vlDzcAwKCPp-1M3xMZI4tzR-yfzyGhW19wqc6BVncUw53WSK7oOCv56HmFTYHhzOE-eQ__&Key-Pair-Id=K3RPWS32NSSJCE HTTP/1.1" 200 46807446 2025-06-02 22:06:40,394 - timm.models._hub - INFO - [timm/resnet18.a1_in1k] Safe alternative available for 'pytorch_model.bin' (as 'model.safetensors'). Loading weights using safetensors. 2025-06-02 22:06:40,396 - timm.models._builder - INFO - Missing keys (fc.weight, fc.bias) discovered while loading pretrained weights. This is expected if model is being adapted. [INFO] partitioning /app/example-docs/img/embedded-images-tables.jpg 2025-06-02 22:06:40,460 - unstructured_inference - INFO - Reading image file: /app/example-docs/img/embedded-images-tables.jpg ... 
[INFO] partitioning /app/example-docs/img/layout-parser-paper-with-table.jpg 2025-06-02 22:06:42,985 - unstructured_inference - INFO - Reading image file: /app/example-docs/img/layout-parser-paper-with-table.jpg ... [INFO] partitioning /app/example-docs/pdf/embedded-images-tables.pdf 2025-06-02 22:06:48,019 - pikepdf._core - INFO - pikepdf C++ to Python logger bridge initialized 2025-06-02 22:06:48,045 - unstructured_inference - INFO - Reading PDF for file: /app/example-docs/pdf/embedded-images-tables.pdf ... [INFO] partitioning /app/example-docs/pdf/all-number-table.pdf 2025-06-02 22:06:50,557 - unstructured_inference - INFO - Reading PDF for file: /app/example-docs/pdf/all-number-table.pdf ... [INFO] partitioning /app/example-docs/fake-power-point.pptx [INFO] partitioning /app/example-docs/stanley-cups.xlsx [INFO] partitioning /app/example-docs/fake-email-multiple-attachments.msg 2025-06-02 22:06:52,358 - unstructured_inference - INFO - Reading image file: /tmp/tmpsha4r586/unstructured_logo.png ... 2025-06-02 22:06:54,199 - unstructured_inference - INFO - Reading PDF for file: /tmp/tmpg_5lk06v/dense_doc.pdf ... 
================================================================== ======================================== Begin Scenario: analytics-online-only ------------------------------------------- tshark output for analytics-online-only ------------------------------------------- IPv4 Conversations Filter: | <- | | -> | | Total | Relative | Duration | | Frames Bytes | | Frames Bytes | | Frames Bytes | Start | | 172.18.0.2 <-> 54.236.224.89 12 5,831 bytes 12 2,656 bytes 24 8,487 bytes 0.032536000 0.3535 172.18.0.2 <-> 192.168.65.5 1 462 bytes 1 84 bytes 2 546 bytes 0.000000000 0.0322 ------------------------------------------ python log output for analytics-online-only ------------------------------------------ [INFO] partitioning /app/example-docs/ideas-page.html [INFO] partitioning /app/example-docs/category-level.docx [INFO] partitioning /app/example-docs/fake_table.docx [INFO] partitioning /app/example-docs/img/english-and-korean.png 2025-06-02 22:08:10,114 - matplotlib.font_manager - INFO - generated new fontManager 2025-06-02 22:08:10,320 - unstructured_inference - INFO - Reading image file: /app/example-docs/img/english-and-korean.png ... 2025-06-02 22:08:12,470 - unstructured_inference - INFO - Loading the Table agent ... 2025-06-02 22:08:12,470 - unstructured_inference - INFO - Loading the table structure model ... 2025-06-02 22:08:12,475 - timm.models._builder - INFO - Loading pretrained weights from Hugging Face hub (timm/resnet18.a1_in1k) 2025-06-02 22:08:12,476 - timm.models._hub - INFO - [timm/resnet18.a1_in1k] Safe alternative available for 'pytorch_model.bin' (as 'model.safetensors'). Loading weights using safetensors. 2025-06-02 22:08:12,478 - timm.models._builder - INFO - Missing keys (fc.weight, fc.bias) discovered while loading pretrained weights. This is expected if model is being adapted. 
[INFO] partitioning /app/example-docs/img/embedded-images-tables.jpg 2025-06-02 22:08:12,548 - unstructured_inference - INFO - Reading image file: /app/example-docs/img/embedded-images-tables.jpg ... [INFO] partitioning /app/example-docs/img/layout-parser-paper-with-table.jpg 2025-06-02 22:08:15,102 - unstructured_inference - INFO - Reading image file: /app/example-docs/img/layout-parser-paper-with-table.jpg ... [INFO] partitioning /app/example-docs/pdf/embedded-images-tables.pdf 2025-06-02 22:08:20,163 - pikepdf._core - INFO - pikepdf C++ to Python logger bridge initialized 2025-06-02 22:08:20,189 - unstructured_inference - INFO - Reading PDF for file: /app/example-docs/pdf/embedded-images-tables.pdf ... [INFO] partitioning /app/example-docs/pdf/all-number-table.pdf 2025-06-02 22:08:22,732 - unstructured_inference - INFO - Reading PDF for file: /app/example-docs/pdf/all-number-table.pdf ... [INFO] partitioning /app/example-docs/fake-power-point.pptx [INFO] partitioning /app/example-docs/stanley-cups.xlsx [INFO] partitioning /app/example-docs/fake-email-multiple-attachments.msg 2025-06-02 22:08:24,468 - unstructured_inference - INFO - Reading image file: /tmp/tmp4oud0ctq/unstructured_logo.png ... 2025-06-02 22:08:26,297 - unstructured_inference - INFO - Reading PDF for file: /tmp/tmpv24idrvu/dense_doc.pdf ... 
================================================================== ======================================== Begin Scenario: offline ------------------------------------------- tshark output for offline ------------------------------------------- IPv4 Conversations Filter: | <- | | -> | | Total | Relative | Duration | | Frames Bytes | | Frames Bytes | | Frames Bytes | Start | | ------------------------------------------ python log output for offline ------------------------------------------ [INFO] partitioning /app/example-docs/ideas-page.html [INFO] partitioning /app/example-docs/category-level.docx [INFO] partitioning /app/example-docs/fake_table.docx [INFO] partitioning /app/example-docs/img/english-and-korean.png 2025-06-02 22:09:37,826 - matplotlib.font_manager - INFO - generated new fontManager 2025-06-02 22:09:38,028 - unstructured_inference - INFO - Reading image file: /app/example-docs/img/english-and-korean.png ... 2025-06-02 22:09:40,188 - unstructured_inference - INFO - Loading the Table agent ... 2025-06-02 22:09:40,188 - unstructured_inference - INFO - Loading the table structure model ... 2025-06-02 22:09:40,193 - timm.models._builder - INFO - Loading pretrained weights from Hugging Face hub (timm/resnet18.a1_in1k) 2025-06-02 22:09:40,193 - timm.models._hub - INFO - [timm/resnet18.a1_in1k] Safe alternative available for 'pytorch_model.bin' (as 'model.safetensors'). Loading weights using safetensors. 2025-06-02 22:09:40,195 - timm.models._builder - INFO - Missing keys (fc.weight, fc.bias) discovered while loading pretrained weights. This is expected if model is being adapted. [INFO] partitioning /app/example-docs/img/embedded-images-tables.jpg 2025-06-02 22:09:40,260 - unstructured_inference - INFO - Reading image file: /app/example-docs/img/embedded-images-tables.jpg ... 
[INFO] partitioning /app/example-docs/img/layout-parser-paper-with-table.jpg 2025-06-02 22:09:42,810 - unstructured_inference - INFO - Reading image file: /app/example-docs/img/layout-parser-paper-with-table.jpg ... [INFO] partitioning /app/example-docs/pdf/embedded-images-tables.pdf 2025-06-02 22:09:47,851 - pikepdf._core - INFO - pikepdf C++ to Python logger bridge initialized 2025-06-02 22:09:47,877 - unstructured_inference - INFO - Reading PDF for file: /app/example-docs/pdf/embedded-images-tables.pdf ... [INFO] partitioning /app/example-docs/pdf/all-number-table.pdf 2025-06-02 22:09:50,475 - unstructured_inference - INFO - Reading PDF for file: /app/example-docs/pdf/all-number-table.pdf ... [INFO] partitioning /app/example-docs/fake-power-point.pptx [INFO] partitioning /app/example-docs/stanley-cups.xlsx [INFO] partitioning /app/example-docs/fake-email-multiple-attachments.msg 2025-06-02 22:09:52,181 - unstructured_inference - INFO - Reading image file: /tmp/tmpn3rraz6o/unstructured_logo.png ... 2025-06-02 22:09:54,032 - unstructured_inference - INFO - Reading PDF for file: /tmp/tmpvbqk645u/dense_doc.pdf ... 
================================================================== ======================================== Begin Scenario: offline-and-missing-models ------------------------------------------- tshark output for offline-and-missing-models ------------------------------------------- IPv4 Conversations Filter: | <- | | -> | | Total | Relative | Duration | | Frames Bytes | | Frames Bytes | | Frames Bytes | Start | | ------------------------------------------ python log output for offline-and-missing-models ------------------------------------------ [INFO] partitioning /app/example-docs/ideas-page.html [INFO] partitioning /app/example-docs/category-level.docx [INFO] partitioning /app/example-docs/fake_table.docx [INFO] partitioning /app/example-docs/img/english-and-korean.png 2025-06-02 22:11:05,743 - matplotlib.font_manager - INFO - generated new fontManager Traceback (most recent call last): File "/home/notebook-user/.local/lib/python3.11/site-packages/huggingface_hub/file_download.py", line 1484, in _get_metadata_or_catch_error metadata = get_hf_file_metadata( ^^^^^^^^^^^^^^^^^^^^^ File "/home/notebook-user/.local/lib/python3.11/site-packages/huggingface_hub/utils/_validators.py", line 114, in _inner_fn return fn(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^ File "/home/notebook-user/.local/lib/python3.11/site-packages/huggingface_hub/file_download.py", line 1401, in get_hf_file_metadata r = _request_wrapper( ^^^^^^^^^^^^^^^^^ File "/home/notebook-user/.local/lib/python3.11/site-packages/huggingface_hub/file_download.py", line 285, in _request_wrapper response = _request_wrapper( ^^^^^^^^^^^^^^^^^ File "/home/notebook-user/.local/lib/python3.11/site-packages/huggingface_hub/file_download.py", line 308, in _request_wrapper response = get_session().request(method=method, url=url, **params) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/notebook-user/.local/lib/python3.11/site-packages/requests/sessions.py", line 589, in request resp = self.send(prep, 
**send_kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/notebook-user/.local/lib/python3.11/site-packages/requests/sessions.py", line 703, in send r = adapter.send(request, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/notebook-user/.local/lib/python3.11/site-packages/huggingface_hub/utils/_http.py", line 107, in send raise OfflineModeIsEnabled( huggingface_hub.errors.OfflineModeIsEnabled: Cannot reach https://huggingface.co/unstructuredio/yolo_x_layout/resolve/main/yolox_l0.05.onnx: offline mode is enabled. To disable it, please unset the `HF_HUB_OFFLINE` environment variable. The above exception was the direct cause of the following exception: Traceback (most recent call last): File "", line 35, in File "/app/unstructured/partition/auto.py", line 231, in partition elements = partition_image( ^^^^^^^^^^^^^^^^ File "/app/unstructured/documents/elements.py", line 585, in wrapper elements = func(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^ File "/app/unstructured/file_utils/filetype.py", line 774, in wrapper elements = func(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^ File "/app/unstructured/chunking/dispatch.py", line 74, in wrapper elements = func(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^ File "/app/unstructured/partition/image.py", line 102, in partition_image return partition_pdf_or_image( ^^^^^^^^^^^^^^^^^^^^^^^ File "/app/unstructured/partition/pdf.py", line 341, in partition_pdf_or_image elements = _partition_pdf_or_image_local( ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/app/unstructured/utils.py", line 216, in wrapper return func(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^ File "/app/unstructured/partition/pdf.py", line 649, in _partition_pdf_or_image_local inferred_document_layout = process_file_with_model( ^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/notebook-user/.local/lib/python3.11/site-packages/unstructured_inference/inference/layout.py", line 371, in process_file_with_model model = get_model(model_name, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File 
"/home/notebook-user/.local/lib/python3.11/site-packages/unstructured_inference/models/base.py", line 74, in get_model model.initialize(**initialize_params) File "/home/notebook-user/.local/lib/python3.11/site-packages/unstructured_inference/utils.py", line 40, in __getitem__ value = evaluate(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/notebook-user/.local/lib/python3.11/site-packages/unstructured_inference/utils.py", line 115, in download_if_needed_and_get_local_path return hf_hub_download(path_or_repo, filename, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/notebook-user/.local/lib/python3.11/site-packages/huggingface_hub/utils/_validators.py", line 114, in _inner_fn return fn(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^ File "/home/notebook-user/.local/lib/python3.11/site-packages/huggingface_hub/file_download.py", line 961, in hf_hub_download return _hf_hub_download_to_cache_dir( ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/notebook-user/.local/lib/python3.11/site-packages/huggingface_hub/file_download.py", line 1068, in _hf_hub_download_to_cache_dir _raise_on_head_call_error(head_call_error, force_download, local_files_only) File "/home/notebook-user/.local/lib/python3.11/site-packages/huggingface_hub/file_download.py", line 1599, in _raise_on_head_call_error raise LocalEntryNotFoundError( huggingface_hub.errors.LocalEntryNotFoundError: An error happened while trying to locate the file on the Hub and we cannot find the requested files in the local cache. Please check your connection and try again or make sure your Internet connection is on. 
``` --- .gitignore | 2 + ...est-all-outbound-connectivity-scenarios.sh | 61 ++++++ scripts/image/test-outbound-connectivity.sh | 183 ++++++++++++++++++ 3 files changed, 246 insertions(+) create mode 100755 scripts/image/test-all-outbound-connectivity-scenarios.sh create mode 100755 scripts/image/test-outbound-connectivity.sh diff --git a/.gitignore b/.gitignore index 87f4fc72bd..c938151357 100644 --- a/.gitignore +++ b/.gitignore @@ -210,3 +210,5 @@ metricsdiff.txt # analysis annotated/ .aider* +pcaps +python-output diff --git a/scripts/image/test-all-outbound-connectivity-scenarios.sh b/scripts/image/test-all-outbound-connectivity-scenarios.sh new file mode 100755 index 0000000000..fb5208b5a5 --- /dev/null +++ b/scripts/image/test-all-outbound-connectivity-scenarios.sh @@ -0,0 +1,61 @@ +#!/usr/bin/env bash + +# Note: +# +# The scenarios baseline, missing-models, and analytics-online-only +# are expected to have conversations reported by tshark +# +# The scenarios offline and offline-and-missing-models +# are *NOT* expected to have any conversations (or attempted conversations) reported by tshark + +set -euo pipefail + +# shellcheck disable=SC2015 +((BASH_VERSINFO[0] >= 5)) || { + echo "Requires bash >= 5" >&2 + exit 1 +} + +mkdir -p python-output +mkdir -p pcaps + +start_timestamp_seconds=$(date +%s) + +./test-outbound-connectivity.sh --cleanup baseline +./test-outbound-connectivity.sh --cleanup missing-models +./test-outbound-connectivity.sh --cleanup analytics-online-only +./test-outbound-connectivity.sh --cleanup offline +./test-outbound-connectivity.sh --cleanup offline-and-missing-models + +set +e +found_pcap_files=$(find "pcaps" -maxdepth 1 -name "*.pcap" -type f -newermt "@$start_timestamp_seconds" 2>/dev/null | wc -l | tr -d ' ') +found_log_files=$(find "python-output" -maxdepth 1 -name "*.log" -type f -newermt "@$start_timestamp_seconds" 2>/dev/null | wc -l | tr -d ' ') +set -e +if [ "$found_pcap_files" -ne "5" ]; then + echo "Expected to find 4 fresh 
pcap/ files from this test but found $found_pcap_files instead" + exit 1 +fi +if [ "$found_log_files" -ne "5" ]; then + echo "Expected to find 4 fresh python-output .log files from this test but found $found_log_files instead" + exit 1 +fi + +for scenario in baseline missing-models analytics-online-only offline offline-and-missing-models; do + echo + + echo "==================================================================" + echo "======================================== Begin Scenario: $scenario" + echo + echo " -------------------------------------------" + echo " tshark output for $scenario" + echo " -------------------------------------------" + echo + tshark -r pcaps/$scenario.pcap -q -z conv,ip | grep -v '====================================' + + echo + echo " ------------------------------------------" + echo " python log output for $scenario" + echo " ------------------------------------------" + echo + cat python-output/$scenario.log +done diff --git a/scripts/image/test-outbound-connectivity.sh b/scripts/image/test-outbound-connectivity.sh new file mode 100755 index 0000000000..82eda50385 --- /dev/null +++ b/scripts/image/test-outbound-connectivity.sh @@ -0,0 +1,183 @@ +#!/usr/bin/env bash +# +# test-outbound-connectivity.sh +# +# Capture every external packet an Unstructured Docker image emits while +# partition()‑ing a test PNG, *inside the same container* (works on macOS). +# +# In addition **also capture the Python workload's stdout / stderr** and save it +# under ./python-output/.log while still streaming it to your terminal. 
+# +# Usage examples +# ./test-outbound-connectivity.sh baseline +# ./test-outbound-connectivity.sh --cleanup missing-models +# ./test-outbound-connectivity.sh --cleanup offline +# ./test-outbound-connectivity.sh offline-and-missing-models +# +# Outputs: +# ./pcaps/.pcap +# ./python-output/.log +# --------------------------------------------------------------------- + +set -euo pipefail + +######################## user‑tunable constants ######################## +IMAGE="downloads.unstructured.io/unstructured-io/unstructured:e42884a" +NET="unstructured_test_net" +CAPTURE_IFACE="${CAPTURE_IFACE:-eth0}" +PCAP_DIR="$(pwd)/pcaps" +PY_LOG_DIR="$(pwd)/python-output" # where Python logs go +HF_CACHE="/home/notebook-user/.cache/huggingface" +######################################################################## + +# shellcheck disable=SC2015 +((BASH_VERSINFO[0] >= 5)) || { + echo "Requires bash >= 5" >&2 + exit 1 +} + +# Create output directories up‑front so failures don’t leave us empty‑handed +mkdir -p "$PCAP_DIR" "$PY_LOG_DIR" + +# ---------- parse flags (optional --cleanup) -------------------------- +CLEANUP=0 +if [[ "${1:-}" == "--cleanup" ]]; then + CLEANUP=1 + shift +fi + +SCENARIO="${1:-}" +if [[ -z "$SCENARIO" ]]; then + echo "Usage: $0 [--cleanup] {baseline|missing-models|offline|offline-and-missing-models}" >&2 + exit 1 +fi + +# ---------- optional pre‑run cleanup ---------------------------------- +if ((CLEANUP)); then + echo ">>> Removing leftover sut_* containers…" + # shellcheck disable=SC2015 + docker rm -f "$(docker ps -aq --filter name='^sut_')" 2>/dev/null || true +fi + +# ---------- scenario‑specific settings -------------------------------- +DO_NOT_TRACK="" +HF_HUB_OFFLINE="" +REMOVE_CACHE=0 +case "$SCENARIO" in +baseline) ;; +missing-models) REMOVE_CACHE=1 ;; +analytics-online-only) HF_HUB_OFFLINE=1 ;; +offline) + DO_NOT_TRACK=true + HF_HUB_OFFLINE=1 + ;; +offline-and-missing-models) + DO_NOT_TRACK=true + HF_HUB_OFFLINE=1 + REMOVE_CACHE=1 + ;; +*) + 
echo "Unknown scenario: $SCENARIO" + exit 1 + ;; +esac + +docker network inspect "$NET" >/dev/null 2>&1 || docker network create "$NET" + +# ---------- launch SUT idle ------------------------------------------- +CID=$(docker run -d --rm --name "sut_${SCENARIO}" \ + --network "$NET" \ + --cap-add NET_RAW --cap-add NET_ADMIN \ + -e DO_NOT_TRACK="$DO_NOT_TRACK" \ + -e HF_HUB_OFFLINE="$HF_HUB_OFFLINE" \ + --entrypoint /bin/sh "$IMAGE" -c "sleep infinity") +echo "Container: $CID (scenario $SCENARIO)" + +# install tcpdump (Wolfi uses apk) as root +docker exec -u root "$CID" apk add --no-cache tcpdump >/dev/null + +# optionally wipe HF cache +# shellcheck disable=SC2015 +((REMOVE_CACHE)) && docker exec "$CID" rm -rf "$HF_CACHE" || true + +# ---------- start tcpdump in background ------------------------------- +FILTER='not (dst net ff02::/16 or src net ff02::/16 or ip6[6] = 58 or ether multicast)' + +docker exec -u root -d "$CID" sh -c "tcpdump -U -n -i $CAPTURE_IFACE '$FILTER' -w /tmp/capture.pcap > /tmp/tcpdump.log 2>&1" + +# check if tcpdump stayed alive +sleep 2 +if ! docker exec "$CID" pgrep tcpdump >/dev/null; then + echo 'tcpdump exited – showing its log:' + docker exec "$CID" cat /tmp/tcpdump.log + exit 1 +fi + +echo "tcpdump running on interface $CAPTURE_IFACE..." +# ---------- run the Python workload ----------------------------------- +echo ">>> Running Python workload (capturing stdout/stderr)…" +# ‑ The "|&" pipes *both* stdout *and* stderr into tee. +# ‑ tee sends it to the terminal *and* writes the log file. +# ‑ With `set -o pipefail` we still fail early if the Python process exits non‑zero. + +if [[ "$HF_HUB_OFFLINE" -eq 1 && "$REMOVE_CACHE" -eq 1 ]]; then + echo "HF_HUB_OFFLINE=1 and REMOVE_CACHE=1 : allowing python command have a non-exit 0 status and will continue the script." 
+ set +e +fi + +docker exec -i -e PYTHONUNBUFFERED=1 "$CID" python - </dev/null From a7e90f79908012e5ec90c108c9ee05649a1da75e Mon Sep 17 00:00:00 2001 From: luke-kucing Date: Wed, 4 Jun 2025 14:52:58 -0400 Subject: [PATCH 20/40] resolve CVEs and HF issue (#4009) Update requirements to resolve CVEs and add the HF environment variable to stop the image from reaching out. Updated the Dockerfile with ENV HF_HUB_OFFLINE=1 to stop it from pinging Hugging Face; this was an issue for a gov customer. Also updated requirements to resolve some open CVEs. --------- Co-authored-by: cragwolfe Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: luke-kucing --- CHANGELOG.md | 9 +++ Dockerfile | 2 + requirements/base.txt | 17 +++--- requirements/dev.txt | 10 ++-- requirements/extra-docx.txt | 2 +- requirements/extra-markdown.txt | 4 +- requirements/extra-odt.txt | 2 +- requirements/extra-paddleocr.txt | 27 +++++---- requirements/extra-pdf-image.txt | 57 +++++++++++------- requirements/extra-pptx.txt | 2 +- requirements/huggingface.txt | 16 ++--- requirements/test.txt | 35 ++++++----- test_unstructured/cleaners/test_translate.py | 9 +++ .../layout-parser-paper-with-table.jpg.html | 2 +- .../layout-parser-paper.pdf.html | 59 +++++++++++-------- .../layout-parser-paper-with-table.jpg.json | 2 +- .../layout-parser-paper.pdf.json | 4 +- unstructured/__version__.py | 2 +- 18 files changed, 158 insertions(+), 103 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 94c2d5e2d1..268675c4b9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,12 @@ +## 0.17.7 + +### Enhancements +- **Updated Dockerfile with ENV HF_HUB_OFFLINE=1** to prevent the container from trying to access the internet + +### Features + +### Fixes + ## 0.17.7-dev0 ### Enhancements diff --git a/Dockerfile b/Dockerfile index e4d7ebd5be..7fc6666e5b 100644 --- a/Dockerfile +++ b/Dockerfile @@ -31,4 +31,6 @@ RUN find requirements/ -type f -name "*.txt" ! -name "test.txt" ! 
-name "dev.txt $PYTHON -c "from unstructured.partition.model_init import initialize; initialize()" && \ $PYTHON -c "from unstructured_inference.models.tables import UnstructuredTableTransformerModel; model = UnstructuredTableTransformerModel(); model.initialize('microsoft/table-transformer-structure-recognition')" +ENV HF_HUB_OFFLINE=1 + CMD ["/bin/bash"] diff --git a/requirements/base.txt b/requirements/base.txt index 862ed52ff9..f9dce43aa8 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -20,7 +20,7 @@ cffi==1.17.1 # via cryptography chardet==5.2.0 # via -r ./base.in -charset-normalizer==3.4.1 +charset-normalizer==3.4.2 # via # requests # unstructured-client @@ -28,17 +28,17 @@ click==8.1.8 # via # nltk # python-oxmsg -cryptography==44.0.2 +cryptography==45.0.3 # via unstructured-client dataclasses-json==0.6.7 # via # -r ./base.in # unstructured-client -deepdiff==8.4.2 +deepdiff==8.5.0 # via unstructured-client emoji==2.14.1 # via -r ./base.in -exceptiongroup==1.2.2 +exceptiongroup==1.3.0 # via anyio filetype==1.2.0 # via -r ./base.in @@ -56,7 +56,7 @@ idna==3.10 # httpx # requests # unstructured-client -joblib==1.4.2 +joblib==1.5.1 # via nltk jsonpath-python==1.0.6 # via unstructured-client @@ -80,7 +80,7 @@ numpy==2.0.2 # via -r ./base.in olefile==0.47 # via python-oxmsg -orderly-set==5.4.0 +orderly-set==5.4.1 # via deepdiff packaging==25.0 # via @@ -90,7 +90,7 @@ psutil==7.0.0 # via -r ./base.in pycparser==2.22 # via cffi -pypdf==5.4.0 +pypdf==5.6.0 # via unstructured-client python-dateutil==2.9.0.post0 # via unstructured-client @@ -125,11 +125,12 @@ tqdm==4.67.1 # via # -r ./base.in # nltk -typing-extensions==4.13.2 +typing-extensions==4.14.0 # via # -r ./base.in # anyio # beautifulsoup4 + # exceptiongroup # pypdf # python-oxmsg # typing-inspect diff --git a/requirements/dev.txt b/requirements/dev.txt index b42ff70e01..632e7e299f 100644 --- a/requirements/dev.txt +++ b/requirements/dev.txt @@ -17,9 +17,9 @@ distlib==0.3.9 # via virtualenv 
filelock==3.18.0 # via virtualenv -identify==2.6.10 +identify==2.6.12 # via pre-commit -importlib-metadata==8.6.1 +importlib-metadata==8.7.0 # via # -c ././deps/constraints.txt # build @@ -32,7 +32,7 @@ packaging==25.0 # build pip-tools==7.4.1 # via -r ./dev.in -platformdirs==4.3.7 +platformdirs==4.3.8 # via # -c ./test.txt # virtualenv @@ -49,11 +49,11 @@ tomli==2.2.1 # -c ./test.txt # build # pip-tools -virtualenv==20.30.0 +virtualenv==20.31.2 # via pre-commit wheel==0.45.1 # via pip-tools -zipp==3.21.0 +zipp==3.22.0 # via importlib-metadata # The following packages are considered to be unsafe in a requirements file: diff --git a/requirements/extra-docx.txt b/requirements/extra-docx.txt index f31b78b82a..831f636e57 100644 --- a/requirements/extra-docx.txt +++ b/requirements/extra-docx.txt @@ -10,7 +10,7 @@ lxml==5.4.0 # python-docx python-docx==1.1.2 # via -r ./extra-docx.in -typing-extensions==4.13.2 +typing-extensions==4.14.0 # via # -c ./base.txt # python-docx diff --git a/requirements/extra-markdown.txt b/requirements/extra-markdown.txt index 2311bce60f..bcdf3368f8 100644 --- a/requirements/extra-markdown.txt +++ b/requirements/extra-markdown.txt @@ -4,11 +4,11 @@ # # pip-compile ./extra-markdown.in # -importlib-metadata==8.6.1 +importlib-metadata==8.7.0 # via # -c ././deps/constraints.txt # markdown markdown==3.8 # via -r ./extra-markdown.in -zipp==3.21.0 +zipp==3.22.0 # via importlib-metadata diff --git a/requirements/extra-odt.txt b/requirements/extra-odt.txt index ced65cd542..94bd199821 100644 --- a/requirements/extra-odt.txt +++ b/requirements/extra-odt.txt @@ -12,7 +12,7 @@ pypandoc==1.15 # via -r ./extra-odt.in python-docx==1.1.2 # via -r ./extra-odt.in -typing-extensions==4.13.2 +typing-extensions==4.14.0 # via # -c ./base.txt # python-docx diff --git a/requirements/extra-paddleocr.txt b/requirements/extra-paddleocr.txt index df43fc8f9b..4c7ce73d3f 100644 --- a/requirements/extra-paddleocr.txt +++ b/requirements/extra-paddleocr.txt @@ -4,11 +4,11 @@ # 
# pip-compile ./extra-paddleocr.in # -albucore==0.0.23 +albucore==0.0.24 # via # albumentations # unstructured-paddleocr -albumentations==2.0.5 +albumentations==2.0.8 # via unstructured-paddleocr annotated-types==0.7.0 # via pydantic @@ -28,23 +28,23 @@ certifi==2025.4.26 # httpcore # httpx # requests -charset-normalizer==3.4.1 +charset-normalizer==3.4.2 # via # -c ./base.txt # requests -cython==3.0.12 +cython==3.1.1 # via unstructured-paddleocr decorator==5.2.1 # via paddlepaddle eval-type-backport==0.2.2 # via albumentations -exceptiongroup==1.2.2 +exceptiongroup==1.3.0 # via # -c ./base.txt # anyio fire==0.7.0 # via unstructured-paddleocr -fonttools==4.57.0 +fonttools==4.58.1 # via unstructured-paddleocr h11==0.16.0 # via @@ -115,15 +115,15 @@ pillow==11.2.1 # paddlepaddle # scikit-image # unstructured-paddleocr -protobuf==6.30.2 +protobuf==6.31.1 # via # -c ././deps/constraints.txt # paddlepaddle pyclipper==1.3.0.post6 # via unstructured-paddleocr -pydantic==2.11.3 +pydantic==2.11.5 # via albumentations -pydantic-core==2.33.1 +pydantic-core==2.33.2 # via pydantic python-docx==1.1.2 # via unstructured-paddleocr @@ -147,7 +147,7 @@ scipy==1.13.1 # scikit-image shapely==2.0.7 # via unstructured-paddleocr -simsimd==6.2.1 +simsimd==6.4.7 # via albucore sniffio==1.3.1 # via @@ -159,7 +159,7 @@ soupsieve==2.7 # beautifulsoup4 stringzilla==3.12.5 # via albucore -termcolor==3.0.1 +termcolor==3.1.0 # via fire tifffile==2024.8.30 # via scikit-image @@ -167,19 +167,20 @@ tqdm==4.67.1 # via # -c ./base.txt # unstructured-paddleocr -typing-extensions==4.13.2 +typing-extensions==4.14.0 # via # -c ./base.txt # albucore # albumentations # anyio # beautifulsoup4 + # exceptiongroup # paddlepaddle # pydantic # pydantic-core # python-docx # typing-inspection -typing-inspection==0.4.0 +typing-inspection==0.4.1 # via pydantic unstructured-paddleocr==2.10.0 # via -r ./extra-paddleocr.in diff --git a/requirements/extra-pdf-image.txt b/requirements/extra-pdf-image.txt index 
367924c7d6..8371ad8f8f 100644 --- a/requirements/extra-pdf-image.txt +++ b/requirements/extra-pdf-image.txt @@ -4,6 +4,8 @@ # # pip-compile ./extra-pdf-image.in # +accelerate==1.7.0 + # via unstructured-inference antlr4-python3-runtime==4.9.3 # via omegaconf cachetools==5.5.2 @@ -16,7 +18,7 @@ cffi==1.17.1 # via # -c ./base.txt # cryptography -charset-normalizer==3.4.1 +charset-normalizer==3.4.2 # via # -c ./base.txt # pdfminer-six @@ -25,7 +27,7 @@ coloredlogs==15.0.1 # via onnxruntime contourpy==1.3.0 # via matplotlib -cryptography==44.0.2 +cryptography==45.0.3 # via # -c ./base.txt # pdfminer-six @@ -42,15 +44,15 @@ filelock==3.18.0 # transformers flatbuffers==25.2.10 # via onnxruntime -fonttools==4.57.0 +fonttools==4.58.1 # via matplotlib -fsspec==2025.3.2 +fsspec==2025.5.1 # via # huggingface-hub # torch -google-api-core[grpc]==2.24.2 +google-api-core[grpc]==2.25.0 # via google-cloud-vision -google-auth==2.39.0 +google-auth==2.40.2 # via # google-api-core # google-cloud-vision @@ -60,15 +62,18 @@ googleapis-common-protos==1.70.0 # via # google-api-core # grpcio-status -grpcio==1.71.0 +grpcio==1.72.1 # via # -c ././deps/constraints.txt # google-api-core # grpcio-status -grpcio-status==1.62.3 +grpcio-status==1.72.1 # via google-api-core -huggingface-hub==0.30.2 +hf-xet==1.1.2 + # via huggingface-hub +huggingface-hub==0.32.3 # via + # accelerate # timm # tokenizers # transformers @@ -92,9 +97,7 @@ lxml==5.4.0 markupsafe==3.0.2 # via jinja2 matplotlib==3.9.4 - # via - # pycocotools - # unstructured-inference + # via unstructured-inference mpmath==1.3.0 # via sympy networkx==3.2.1 @@ -102,6 +105,7 @@ networkx==3.2.1 numpy==2.0.2 # via # -c ./base.txt + # accelerate # contourpy # matplotlib # onnx @@ -115,7 +119,7 @@ numpy==2.0.2 # unstructured-inference omegaconf==2.3.0 # via effdet -onnx==1.17.0 +onnx==1.18.0 # via # -r ./extra-pdf-image.in # unstructured-inference @@ -128,6 +132,7 @@ opencv-python==4.11.0.86 packaging==25.0 # via # -c ./base.txt + # accelerate # 
huggingface-hub # matplotlib # onnxruntime @@ -145,7 +150,7 @@ pdfminer-six==20250327 # unstructured-inference pi-heif==0.22.0 # via -r ./extra-pdf-image.in -pikepdf==9.7.0 +pikepdf==9.8.1 # via -r ./extra-pdf-image.in pillow==11.2.1 # via @@ -159,7 +164,7 @@ proto-plus==1.26.1 # via # google-api-core # google-cloud-vision -protobuf==6.30.2 +protobuf==6.31.1 # via # -c ././deps/constraints.txt # google-api-core @@ -169,13 +174,17 @@ protobuf==6.30.2 # onnx # onnxruntime # proto-plus +psutil==7.0.0 + # via + # -c ./base.txt + # accelerate pyasn1==0.6.1 # via # pyasn1-modules # rsa pyasn1-modules==0.4.2 # via google-auth -pycocotools==2.0.8 +pycocotools==2.0.9 # via effdet pycparser==2.22 # via @@ -183,7 +192,7 @@ pycparser==2.22 # cffi pyparsing==3.2.3 # via matplotlib -pypdf==5.4.0 +pypdf==5.6.0 # via # -c ./base.txt # -r ./extra-pdf-image.in @@ -200,6 +209,7 @@ pytz==2025.2 # via pandas pyyaml==6.0.2 # via + # accelerate # huggingface-hub # omegaconf # timm @@ -222,6 +232,7 @@ rsa==4.9.1 # via google-auth safetensors==0.5.3 # via + # accelerate # timm # transformers scipy==1.13.1 @@ -230,7 +241,7 @@ six==1.17.0 # via # -c ./base.txt # python-dateutil -sympy==1.13.3 +sympy==1.14.0 # via # onnxruntime # torch @@ -244,6 +255,7 @@ tokenizers==0.21.1 # transformers torch==2.7.0 # via + # accelerate # effdet # timm # torchvision @@ -257,17 +269,18 @@ tqdm==4.67.1 # -c ./base.txt # huggingface-hub # transformers -transformers==4.51.3 +transformers==4.52.4 # via unstructured-inference -typing-extensions==4.13.2 +typing-extensions==4.14.0 # via # -c ./base.txt # huggingface-hub + # onnx # pypdf # torch tzdata==2025.2 # via pandas -unstructured-inference==0.8.10 +unstructured-inference==1.0.2 # via -r ./extra-pdf-image.in unstructured-pytesseract==0.3.15 # via -r ./extra-pdf-image.in @@ -280,5 +293,5 @@ wrapt==1.17.2 # via # -c ./base.txt # deprecated -zipp==3.21.0 +zipp==3.22.0 # via importlib-resources diff --git a/requirements/extra-pptx.txt b/requirements/extra-pptx.txt 
index 7ec19718d8..1664e9f404 100644 --- a/requirements/extra-pptx.txt +++ b/requirements/extra-pptx.txt @@ -10,7 +10,7 @@ pillow==11.2.1 # via python-pptx python-pptx==1.0.2 # via -r ./extra-pptx.in -typing-extensions==4.13.2 +typing-extensions==4.14.0 # via python-pptx xlsxwriter==3.2.3 # via python-pptx diff --git a/requirements/huggingface.txt b/requirements/huggingface.txt index a7c793c739..61b710ea62 100644 --- a/requirements/huggingface.txt +++ b/requirements/huggingface.txt @@ -8,7 +8,7 @@ certifi==2025.4.26 # via # -c ./base.txt # requests -charset-normalizer==3.4.1 +charset-normalizer==3.4.2 # via # -c ./base.txt # requests @@ -21,11 +21,13 @@ filelock==3.18.0 # huggingface-hub # torch # transformers -fsspec==2025.3.2 +fsspec==2025.5.1 # via # huggingface-hub # torch -huggingface-hub==0.30.2 +hf-xet==1.1.2 + # via huggingface-hub +huggingface-hub==0.32.3 # via # tokenizers # transformers @@ -35,7 +37,7 @@ idna==3.10 # requests jinja2==3.1.6 # via torch -joblib==1.4.2 +joblib==1.5.1 # via # -c ./base.txt # sacremoses @@ -82,7 +84,7 @@ six==1.17.0 # via # -c ./base.txt # langdetect -sympy==1.13.3 +sympy==1.14.0 # via torch tokenizers==0.21.1 # via @@ -96,9 +98,9 @@ tqdm==4.67.1 # huggingface-hub # sacremoses # transformers -transformers==4.51.3 +transformers==4.52.4 # via -r ./huggingface.in -typing-extensions==4.13.2 +typing-extensions==4.14.0 # via # -c ./base.txt # huggingface-hub diff --git a/requirements/test.txt b/requirements/test.txt index 2706ac725c..ce0fd2cc62 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -14,11 +14,11 @@ click==8.1.8 # via # -c ./base.txt # black -coverage[toml]==7.8.0 +coverage[toml]==7.8.2 # via # -r ./test.in # pytest-cov -exceptiongroup==1.2.2 +exceptiongroup==1.3.0 # via # -c ./base.txt # pytest @@ -28,9 +28,9 @@ flake8==7.2.0 # flake8-print flake8-print==5.0.0 # via -r ./test.in -freezegun==1.5.1 +freezegun==1.5.2 # via -r ./test.in -grpcio==1.71.0 +grpcio==1.72.1 # via # -c ././deps/constraints.txt # -r 
./test.in @@ -40,7 +40,7 @@ liccheck==0.9.2 # via -r ./test.in mccabe==0.7.0 # via flake8 -mypy==1.15.0 +mypy==1.16.0 # via -r ./test.in mypy-extensions==1.1.0 # via @@ -53,36 +53,40 @@ packaging==25.0 # black # pytest pathspec==0.12.1 + # via + # black + # mypy +platformdirs==4.3.8 # via black -platformdirs==4.3.7 - # via black -pluggy==1.5.0 +pluggy==1.6.0 # via pytest pycodestyle==2.13.0 # via # flake8 # flake8-print -pydantic==2.11.3 +pydantic==2.11.5 # via -r ./test.in -pydantic-core==2.33.1 +pydantic-core==2.33.2 # via pydantic pyflakes==3.3.2 # via # autoflake # flake8 -pytest==8.3.5 +pygments==2.19.1 + # via pytest +pytest==8.4.0 # via # pytest-cov # pytest-mock pytest-cov==6.1.1 # via -r ./test.in -pytest-mock==3.14.0 +pytest-mock==3.14.1 # via -r ./test.in python-dateutil==2.9.0.post0 # via # -c ./base.txt # freezegun -ruff==0.11.7 +ruff==0.11.12 # via -r ./test.in semantic-version==2.10.0 # via liccheck @@ -109,13 +113,14 @@ types-tabulate==0.9.0.20241207 # via -r ./test.in types-urllib3==1.26.25.14 # via types-requests -typing-extensions==4.13.2 +typing-extensions==4.14.0 # via # -c ./base.txt # black + # exceptiongroup # mypy # pydantic # pydantic-core # typing-inspection -typing-inspection==0.4.0 +typing-inspection==0.4.1 # via pydantic diff --git a/test_unstructured/cleaners/test_translate.py b/test_unstructured/cleaners/test_translate.py index 7b54c61d27..7dcfe88e76 100644 --- a/test_unstructured/cleaners/test_translate.py +++ b/test_unstructured/cleaners/test_translate.py @@ -1,7 +1,11 @@ +import os + import pytest from unstructured.cleaners import translate +IS_CI = os.getenv("CI") == "true" + def test_get_opus_mt_model_name(): model_name = translate._get_opus_mt_model_name("ru", "en") @@ -24,27 +28,32 @@ def test_translate_returns_same_text_text_is_empty(): assert translate.translate_text(text) == text +@pytest.mark.skipif(IS_CI, reason="Skipping this test in CI pipeline") def test_translate_with_language_specified(): text = "Ich bin ein 
Berliner!" assert translate.translate_text(text, "de") == "I'm a Berliner!" +@pytest.mark.skipif(IS_CI, reason="Skipping this test in CI pipeline") def test_translate_with_no_language_specified(): text = "Ich bin ein Berliner!" assert translate.translate_text(text) == "I'm a Berliner!" +@pytest.mark.skipif(IS_CI, reason="Skipping this test in CI pipeline") def test_translate_raises_with_bad_language(): text = "Ich bin ein Berliner!" with pytest.raises(ValueError): translate.translate_text(text, "zz") +@pytest.mark.skipif(IS_CI, reason="Skipping this test in CI pipeline") def test_tranlate_works_with_russian(): text = "Я тоже можно переводать русский язык!" assert translate.translate_text(text) == "I can also translate Russian!" +@pytest.mark.skipif(IS_CI, reason="Skipping this test in CI pipeline") def test_translate_works_with_chinese(): text = "網站有中、英文版本" translate.translate_text(text) == "Website available in Chinese and English" diff --git a/test_unstructured_ingest/expected-structured-output-html/local-single-file-with-pdf-infer-table-structure/layout-parser-paper-with-table.jpg.html b/test_unstructured_ingest/expected-structured-output-html/local-single-file-with-pdf-infer-table-structure/layout-parser-paper-with-table.jpg.html index dbf342486a..f34c22d7f1 100644 --- a/test_unstructured_ingest/expected-structured-output-html/local-single-file-with-pdf-infer-table-structure/layout-parser-paper-with-table.jpg.html +++ b/test_unstructured_ingest/expected-structured-output-html/local-single-file-with-pdf-infer-table-structure/layout-parser-paper-with-table.jpg.html @@ -26,7 +26,7 @@

    Large Model - | Notes + Notes diff --git a/test_unstructured_ingest/expected-structured-output-html/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.html b/test_unstructured_ingest/expected-structured-output-html/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.html index eca4025c8d..de08f35556 100644 --- a/test_unstructured_ingest/expected-structured-output-html/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.html +++ b/test_unstructured_ingest/expected-structured-output-html/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.html @@ -168,21 +168,33 @@

    Dataset - | Base Model'| + | - | Notes + Base Model'| + + + Large Model | + + + Notes - PubLayNet B8]| + PubLayNet + + + B8]| F/M + + M + Layouts of modern scientific documents @@ -191,9 +203,14 @@

    PRImA + + M + + - + Layouts of scanned modern magazines and scientific report @@ -202,9 +219,14 @@

    Newspaper + + F + + - + Layouts of scanned US newspapers from the 20th century @@ -213,6 +235,11 @@

    TableBank + + + + F + F @@ -224,9 +251,14 @@

    HJDataset + + F/M + + - + Layouts of history Japanese documents @@ -316,10 +348,7 @@

    - block.pad(top, bottom, - - - right, + block.pad(top, bottom, right, left) @@ -336,8 +365,6 @@

    - - Scale the current block given the ratio in x and y direction @@ -348,8 +375,6 @@

    - - Move the current block with the shift distances in x and y direction @@ -360,8 +385,6 @@

    - - Whether block] is inside of block2 @@ -372,8 +395,6 @@

    - - Return the intersection region of blockl and block2. Coordinate type to be determined based on the inputs @@ -384,8 +405,6 @@

    - - Return the union region of blockl and block2. Coordinate type to be determined based on the inputs @@ -396,8 +415,6 @@

    - - Convert the absolute coordinates of block to relative coordinates to block2 @@ -408,8 +425,6 @@

    - - Calculate the absolute coordinates of blockl given the canvas block2’s absolute coordinates @@ -420,8 +435,6 @@

    - - Obtain the image segments in the block region diff --git a/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper-with-table.jpg.json b/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper-with-table.jpg.json index b0354dcb4a..096983591e 100644 --- a/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper-with-table.jpg.json +++ b/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper-with-table.jpg.json @@ -48,7 +48,7 @@ "element_id": "dddac446da6c93dc1449ecb5d997c423", "text": "Dataset | Base Model\" Large Model | Notes PubLayNet [38] P/M M Layouts of modern scientific documents PRImA [3) M - Layouts of scanned modern magazines and scientific reports Newspaper [17] P - Layouts of scanned US newspapers from the 20th century ‘TableBank (18) P P Table region on modern scientific and business document HJDataset (31) | F/M - Layouts of history Japanese documents", "metadata": { - "text_as_html": "
    Dataset| Base Model!|Large Model| Notes
    PubLayNet [33]P/MMLayouts of modern scientific documents
    PRImA [3]MLayouts of scanned modern magazines and scientific reports
    Newspaper [17]PLayouts of scanned US newspapers from the 20th century
    TableBank [18]PTable region on modern scientific and business document
    HIDataset [31]P/MLayouts of history Japanese documents
    ", + "text_as_html": "
    Dataset| Base Model!|Large ModelNotes
    PubLayNet [33]P/MMLayouts of modern scientific documents
    PRImA [3]MLayouts of scanned modern magazines and scientific reports
    Newspaper [17]PLayouts of scanned US newspapers from the 20th century
    TableBank [18]PTable region on modern scientific and business document
    HIDataset [31]P/MLayouts of history Japanese documents
    ", "filetype": "image/jpeg", "languages": [ "eng" diff --git a/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json b/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json index 7c0e7324d2..9917fc45d6 100644 --- a/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json @@ -1459,7 +1459,7 @@ "start_index": 65 } ], - "text_as_html": "
    Dataset| Base Model'|| Notes
    PubLayNet B8]|F/MLayouts of modern scientific documents
    PRImAMLayouts of scanned modern magazines and scientific report
    NewspaperFLayouts of scanned US newspapers from the 20th century
    TableBankFTable region on modern scientific and business document
    HJDatasetF/MLayouts of history Japanese documents
    ", + "text_as_html": "
    Dataset|Base Model'|Large Model |Notes
    PubLayNetB8]|F/MMLayouts of modern scientific documents
    PRImAM-Layouts of scanned modern magazines and scientific report
    NewspaperF-Layouts of scanned US newspapers from the 20th century
    TableBankFFTable region on modern scientific and business document
    HJDatasetF/M-Layouts of history Japanese documents
    ", "filetype": "application/pdf", "languages": [ "eng" @@ -2153,7 +2153,7 @@ "element_id": "64bc79d1132a89c71837f420d6e4e2dc", "text": "Operation Name Description block.pad(top, bottom, right, left) Enlarge the current block according to the input block.scale(fx, fy) Scale the current block given the ratio in x and y direction block.shift(dx, dy) Move the current block with the shift distances in x and y direction block1.is in(block2) Whether block1 is inside of block2 block1.intersect(block2) Return the intersection region of block1 and block2. Coordinate type to be determined based on the inputs. block1.union(block2) Return the union region of block1 and block2. Coordinate type to be determined based on the inputs. block1.relative to(block2) Convert the absolute coordinates of block1 to relative coordinates to block2 block1.condition on(block2) Calculate the absolute coordinates of block1 given the canvas block2’s absolute coordinates block.crop image(image) Obtain the image segments in the block region", "metadata": { - "text_as_html": "
    block.pad(top, bottom,right,left)Enlarge the current block according to the input
    block.scale(fx, fy)Scale the current block given the ratio in x and y direction
    block.shift(dx, dy)Move the current block with the shift distances in x and y direction
    block1.is_in(block2)Whether block] is inside of block2
    block1. intersect (block2)Return the intersection region of blockl and block2. Coordinate type to be determined based on the inputs
    block1.union(block2)Return the union region of blockl and block2. Coordinate type to be determined based on the inputs
    block1.relative_to(block2)Convert the absolute coordinates of block to relative coordinates to block2
    block1.condition_on(block2)Calculate the absolute coordinates of blockl given the canvas block2’s absolute coordinates
    block. crop_image (image)Obtain the image segments in the block region
    ", + "text_as_html": "
    block.pad(top, bottom, right,left)Enlarge the current block according to the input
    block.scale(fx, fy)Scale the current block given the ratio in x and y direction
    block.shift(dx, dy)Move the current block with the shift distances in x and y direction
    block1.is_in(block2)Whether block] is inside of block2
    block1. intersect (block2)Return the intersection region of blockl and block2. Coordinate type to be determined based on the inputs
    block1.union(block2)Return the union region of blockl and block2. Coordinate type to be determined based on the inputs
    block1.relative_to(block2)Convert the absolute coordinates of block to relative coordinates to block2
    block1.condition_on(block2)Calculate the absolute coordinates of blockl given the canvas block2’s absolute coordinates
    block. crop_image (image)Obtain the image segments in the block region
    ", "filetype": "application/pdf", "languages": [ "eng" diff --git a/unstructured/__version__.py b/unstructured/__version__.py index d53993104e..dfac37e9d6 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.17.7-dev0" # pragma: no cover +__version__ = "0.17.7" # pragma: no cover From 37d2f021a39b1b5682a5306349e2d36a6a1076a3 Mon Sep 17 00:00:00 2001 From: Yao You Date: Fri, 6 Jun 2025 04:52:17 -0500 Subject: [PATCH 21/40] Feat/bump inference (#4013) Bump `unstructured-inference` to `1.0.5`, which includes fix to ensure model init is thread safe. --------- Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: badGarnet --- CHANGELOG.md | 7 ++- requirements/extra-csv.txt | 2 +- requirements/extra-pdf-image.in | 2 +- requirements/extra-pdf-image.txt | 16 ++--- requirements/extra-xlsx.txt | 2 +- requirements/huggingface.txt | 6 +- .../layout-parser-paper-with-table.jpg.html | 2 +- .../layout-parser-paper.pdf.html | 59 ++++++++----------- .../layout-parser-paper-with-table.jpg.json | 2 +- .../layout-parser-paper.pdf.json | 4 +- unstructured/__version__.py | 2 +- 11 files changed, 46 insertions(+), 58 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 268675c4b9..00e6f9c4c1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,15 +1,16 @@ -## 0.17.7 +## 0.17.8 ### Enhancements -- **Updated Docker file with ENV HF_HUB_OFFLINE=1 to prevent the contianer from trying to access the internet +- **Bump `unstructured-inference` to `1.0.5`** It includes critical fix to ensure inference model initialization is thread safe ### Features ### Fixes -## 0.17.7-dev0 +## 0.17.7 ### Enhancements +- **Updated Docker file with ENV HF_HUB_OFFLINE=1 to prevent the contianer from trying to access the internet ### Features diff --git a/requirements/extra-csv.txt b/requirements/extra-csv.txt index 51885ae7ad..74e069b2de 100644 --- a/requirements/extra-csv.txt +++ b/requirements/extra-csv.txt 
@@ -8,7 +8,7 @@ numpy==2.0.2 # via # -c ./base.txt # pandas -pandas==2.2.3 +pandas==2.3.0 # via -r ./extra-csv.in python-dateutil==2.9.0.post0 # via diff --git a/requirements/extra-pdf-image.in b/requirements/extra-pdf-image.in index 4f3aef930d..b0caffbb95 100644 --- a/requirements/extra-pdf-image.in +++ b/requirements/extra-pdf-image.in @@ -12,5 +12,5 @@ google-cloud-vision effdet # Do not move to constraints.in, otherwise unstructured-inference will not be upgraded # when unstructured library is. -unstructured-inference>=0.8.10 +unstructured-inference>=1.0.5 unstructured.pytesseract>=0.3.12 diff --git a/requirements/extra-pdf-image.txt b/requirements/extra-pdf-image.txt index 8371ad8f8f..ccec76a09a 100644 --- a/requirements/extra-pdf-image.txt +++ b/requirements/extra-pdf-image.txt @@ -52,7 +52,7 @@ fsspec==2025.5.1 # torch google-api-core[grpc]==2.25.0 # via google-cloud-vision -google-auth==2.40.2 +google-auth==2.40.3 # via # google-api-core # google-cloud-vision @@ -69,9 +69,9 @@ grpcio==1.72.1 # grpcio-status grpcio-status==1.72.1 # via google-api-core -hf-xet==1.1.2 +hf-xet==1.1.3 # via huggingface-hub -huggingface-hub==0.32.3 +huggingface-hub==0.32.4 # via # accelerate # timm @@ -139,7 +139,7 @@ packaging==25.0 # pikepdf # transformers # unstructured-pytesseract -pandas==2.2.3 +pandas==2.3.0 # via unstructured-inference pdf2image==1.17.0 # via -r ./extra-pdf-image.in @@ -184,7 +184,7 @@ pyasn1==0.6.1 # rsa pyasn1-modules==0.4.2 # via google-auth -pycocotools==2.0.9 +pycocotools==2.0.10 # via effdet pycparser==2.22 # via @@ -253,14 +253,14 @@ tokenizers==0.21.1 # via # -c ././deps/constraints.txt # transformers -torch==2.7.0 +torch==2.7.1 # via # accelerate # effdet # timm # torchvision # unstructured-inference -torchvision==0.22.0 +torchvision==0.22.1 # via # effdet # timm @@ -280,7 +280,7 @@ typing-extensions==4.14.0 # torch tzdata==2025.2 # via pandas -unstructured-inference==1.0.2 +unstructured-inference==1.0.5 # via -r ./extra-pdf-image.in 
unstructured-pytesseract==0.3.15 # via -r ./extra-pdf-image.in diff --git a/requirements/extra-xlsx.txt b/requirements/extra-xlsx.txt index 937191502d..922e00bac0 100644 --- a/requirements/extra-xlsx.txt +++ b/requirements/extra-xlsx.txt @@ -14,7 +14,7 @@ numpy==2.0.2 # pandas openpyxl==3.1.5 # via -r ./extra-xlsx.in -pandas==2.2.3 +pandas==2.3.0 # via -r ./extra-xlsx.in python-dateutil==2.9.0.post0 # via diff --git a/requirements/huggingface.txt b/requirements/huggingface.txt index 61b710ea62..e37ac6929c 100644 --- a/requirements/huggingface.txt +++ b/requirements/huggingface.txt @@ -25,9 +25,9 @@ fsspec==2025.5.1 # via # huggingface-hub # torch -hf-xet==1.1.2 +hf-xet==1.1.3 # via huggingface-hub -huggingface-hub==0.32.3 +huggingface-hub==0.32.4 # via # tokenizers # transformers @@ -90,7 +90,7 @@ tokenizers==0.21.1 # via # -c ././deps/constraints.txt # transformers -torch==2.7.0 +torch==2.7.1 # via -r ./huggingface.in tqdm==4.67.1 # via diff --git a/test_unstructured_ingest/expected-structured-output-html/local-single-file-with-pdf-infer-table-structure/layout-parser-paper-with-table.jpg.html b/test_unstructured_ingest/expected-structured-output-html/local-single-file-with-pdf-infer-table-structure/layout-parser-paper-with-table.jpg.html index f34c22d7f1..dbf342486a 100644 --- a/test_unstructured_ingest/expected-structured-output-html/local-single-file-with-pdf-infer-table-structure/layout-parser-paper-with-table.jpg.html +++ b/test_unstructured_ingest/expected-structured-output-html/local-single-file-with-pdf-infer-table-structure/layout-parser-paper-with-table.jpg.html @@ -26,7 +26,7 @@

    Large Model - Notes + | Notes diff --git a/test_unstructured_ingest/expected-structured-output-html/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.html b/test_unstructured_ingest/expected-structured-output-html/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.html index de08f35556..eca4025c8d 100644 --- a/test_unstructured_ingest/expected-structured-output-html/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.html +++ b/test_unstructured_ingest/expected-structured-output-html/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.html @@ -168,33 +168,21 @@

    Dataset - | + | Base Model'| - Base Model'| - - - Large Model | - - - Notes + | Notes - PubLayNet - - - B8]| + PubLayNet B8]| F/M - - M - Layouts of modern scientific documents @@ -203,14 +191,9 @@

    PRImA - - M - - - - Layouts of scanned modern magazines and scientific report @@ -219,14 +202,9 @@

    Newspaper - - F - - - - Layouts of scanned US newspapers from the 20th century @@ -235,11 +213,6 @@

    TableBank - - - - F - F @@ -251,14 +224,9 @@

    HJDataset - - F/M - - - - Layouts of history Japanese documents @@ -348,7 +316,10 @@

    - block.pad(top, bottom, right, + block.pad(top, bottom, + + + right, left) @@ -365,6 +336,8 @@

    + + Scale the current block given the ratio in x and y direction @@ -375,6 +348,8 @@

    + + Move the current block with the shift distances in x and y direction @@ -385,6 +360,8 @@

    + + Whether block] is inside of block2 @@ -395,6 +372,8 @@

    + + Return the intersection region of blockl and block2. Coordinate type to be determined based on the inputs @@ -405,6 +384,8 @@

    + + Return the union region of blockl and block2. Coordinate type to be determined based on the inputs @@ -415,6 +396,8 @@

    + + Convert the absolute coordinates of block to relative coordinates to block2 @@ -425,6 +408,8 @@

    + + Calculate the absolute coordinates of blockl given the canvas block2’s absolute coordinates @@ -435,6 +420,8 @@

    + + Obtain the image segments in the block region diff --git a/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper-with-table.jpg.json b/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper-with-table.jpg.json index 096983591e..b0354dcb4a 100644 --- a/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper-with-table.jpg.json +++ b/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper-with-table.jpg.json @@ -48,7 +48,7 @@ "element_id": "dddac446da6c93dc1449ecb5d997c423", "text": "Dataset | Base Model\" Large Model | Notes PubLayNet [38] P/M M Layouts of modern scientific documents PRImA [3) M - Layouts of scanned modern magazines and scientific reports Newspaper [17] P - Layouts of scanned US newspapers from the 20th century ‘TableBank (18) P P Table region on modern scientific and business document HJDataset (31) | F/M - Layouts of history Japanese documents", "metadata": { - "text_as_html": "
    Dataset| Base Model!|Large ModelNotes
    PubLayNet [33]P/MMLayouts of modern scientific documents
    PRImA [3]MLayouts of scanned modern magazines and scientific reports
    Newspaper [17]PLayouts of scanned US newspapers from the 20th century
    TableBank [18]PTable region on modern scientific and business document
    HIDataset [31]P/MLayouts of history Japanese documents
    ", + "text_as_html": "
    Dataset| Base Model!|Large Model| Notes
    PubLayNet [33]P/MMLayouts of modern scientific documents
    PRImA [3]MLayouts of scanned modern magazines and scientific reports
    Newspaper [17]PLayouts of scanned US newspapers from the 20th century
    TableBank [18]PTable region on modern scientific and business document
    HIDataset [31]P/MLayouts of history Japanese documents
    ", "filetype": "image/jpeg", "languages": [ "eng" diff --git a/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json b/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json index 9917fc45d6..7c0e7324d2 100644 --- a/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json @@ -1459,7 +1459,7 @@ "start_index": 65 } ], - "text_as_html": "
    Dataset|Base Model'|Large Model |Notes
    PubLayNetB8]|F/MMLayouts of modern scientific documents
    PRImAM-Layouts of scanned modern magazines and scientific report
    NewspaperF-Layouts of scanned US newspapers from the 20th century
    TableBankFFTable region on modern scientific and business document
    HJDatasetF/M-Layouts of history Japanese documents
    ", + "text_as_html": "
    Dataset| Base Model'|| Notes
    PubLayNet B8]|F/MLayouts of modern scientific documents
    PRImAMLayouts of scanned modern magazines and scientific report
    NewspaperFLayouts of scanned US newspapers from the 20th century
    TableBankFTable region on modern scientific and business document
    HJDatasetF/MLayouts of history Japanese documents
    ", "filetype": "application/pdf", "languages": [ "eng" @@ -2153,7 +2153,7 @@ "element_id": "64bc79d1132a89c71837f420d6e4e2dc", "text": "Operation Name Description block.pad(top, bottom, right, left) Enlarge the current block according to the input block.scale(fx, fy) Scale the current block given the ratio in x and y direction block.shift(dx, dy) Move the current block with the shift distances in x and y direction block1.is in(block2) Whether block1 is inside of block2 block1.intersect(block2) Return the intersection region of block1 and block2. Coordinate type to be determined based on the inputs. block1.union(block2) Return the union region of block1 and block2. Coordinate type to be determined based on the inputs. block1.relative to(block2) Convert the absolute coordinates of block1 to relative coordinates to block2 block1.condition on(block2) Calculate the absolute coordinates of block1 given the canvas block2’s absolute coordinates block.crop image(image) Obtain the image segments in the block region", "metadata": { - "text_as_html": "
    block.pad(top, bottom, right,left)Enlarge the current block according to the input
    block.scale(fx, fy)Scale the current block given the ratio in x and y direction
    block.shift(dx, dy)Move the current block with the shift distances in x and y direction
    block1.is_in(block2)Whether block] is inside of block2
    block1. intersect (block2)Return the intersection region of blockl and block2. Coordinate type to be determined based on the inputs
    block1.union(block2)Return the union region of blockl and block2. Coordinate type to be determined based on the inputs
    block1.relative_to(block2)Convert the absolute coordinates of block to relative coordinates to block2
    block1.condition_on(block2)Calculate the absolute coordinates of blockl given the canvas block2’s absolute coordinates
    block. crop_image (image)Obtain the image segments in the block region
    ", + "text_as_html": "
    block.pad(top, bottom,right,left)Enlarge the current block according to the input
    block.scale(fx, fy)Scale the current block given the ratio in x and y direction
    block.shift(dx, dy)Move the current block with the shift distances in x and y direction
    block1.is_in(block2)Whether block] is inside of block2
    block1. intersect (block2)Return the intersection region of blockl and block2. Coordinate type to be determined based on the inputs
    block1.union(block2)Return the union region of blockl and block2. Coordinate type to be determined based on the inputs
    block1.relative_to(block2)Convert the absolute coordinates of block to relative coordinates to block2
    block1.condition_on(block2)Calculate the absolute coordinates of blockl given the canvas block2’s absolute coordinates
    block. crop_image (image)Obtain the image segments in the block region
    ", "filetype": "application/pdf", "languages": [ "eng" diff --git a/unstructured/__version__.py b/unstructured/__version__.py index dfac37e9d6..6cbe7445c5 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.17.7" # pragma: no cover +__version__ = "0.17.8" # pragma: no cover From 06e4e54f5c76e19a0a1a4440077853d1e538631a Mon Sep 17 00:00:00 2001 From: Emily Voss Date: Tue, 10 Jun 2025 18:38:43 -0700 Subject: [PATCH 22/40] Bump requests to address CVEs (#4015) --- CHANGELOG.md | 3 +++ Dockerfile | 2 +- requirements/base.txt | 2 +- requirements/extra-paddleocr.txt | 2 +- requirements/extra-pdf-image.txt | 2 +- requirements/huggingface.txt | 2 +- unstructured/__version__.py | 2 +- 7 files changed, 9 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 00e6f9c4c1..79143c8413 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,6 @@ +## 0.17.9 +- Patch various CVEs + ## 0.17.8 ### Enhancements diff --git a/Dockerfile b/Dockerfile index 7fc6666e5b..c9c2e69e23 100644 --- a/Dockerfile +++ b/Dockerfile @@ -13,7 +13,7 @@ COPY test_unstructured test_unstructured COPY example-docs example-docs RUN chown -R notebook-user:notebook-user /app && \ - apk add font-ubuntu git && \ + apk add --no-cache font-ubuntu fontconfig git && \ fc-cache -fv && \ [ -e /usr/bin/python3 ] || ln -s /usr/bin/$PYTHON /usr/bin/python3 diff --git a/requirements/base.txt b/requirements/base.txt index f9dce43aa8..0daee868f6 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -104,7 +104,7 @@ rapidfuzz==3.13.0 # via -r ./base.in regex==2024.11.6 # via nltk -requests==2.32.3 +requests==2.32.4 # via # -r ./base.in # requests-toolbelt diff --git a/requirements/extra-paddleocr.txt b/requirements/extra-paddleocr.txt index 4c7ce73d3f..16502d6bbb 100644 --- a/requirements/extra-paddleocr.txt +++ b/requirements/extra-paddleocr.txt @@ -135,7 +135,7 @@ rapidfuzz==3.13.0 # via # -c ./base.txt # unstructured-paddleocr 
-requests==2.32.3 +requests==2.32.4 # via # -c ./base.txt # unstructured-paddleocr diff --git a/requirements/extra-pdf-image.txt b/requirements/extra-pdf-image.txt index ccec76a09a..22597ecfe6 100644 --- a/requirements/extra-pdf-image.txt +++ b/requirements/extra-pdf-image.txt @@ -222,7 +222,7 @@ regex==2024.11.6 # via # -c ./base.txt # transformers -requests==2.32.3 +requests==2.32.4 # via # -c ./base.txt # google-api-core diff --git a/requirements/huggingface.txt b/requirements/huggingface.txt index e37ac6929c..a5d50c8cc5 100644 --- a/requirements/huggingface.txt +++ b/requirements/huggingface.txt @@ -69,7 +69,7 @@ regex==2024.11.6 # -c ./base.txt # sacremoses # transformers -requests==2.32.3 +requests==2.32.4 # via # -c ./base.txt # huggingface-hub diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 6cbe7445c5..6527bfeb22 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.17.8" # pragma: no cover +__version__ = "0.17.9" # pragma: no cover From b6ab471f009790f05aef53ad0edcdbe0f4d6dbb6 Mon Sep 17 00:00:00 2001 From: Emily Voss Date: Tue, 10 Jun 2025 23:32:11 -0700 Subject: [PATCH 23/40] Drop Python 3.9 support due to dependency conflicts (#4017) --- .github/workflows/ci.yml | 16 +++--- CHANGELOG.md | 5 ++ requirements/base.txt | 14 +++--- requirements/deps/constraints.txt | 2 +- requirements/dev.txt | 10 +--- requirements/extra-csv.txt | 4 +- requirements/extra-docx.txt | 2 +- requirements/extra-epub.txt | 2 +- requirements/extra-markdown.txt | 8 +-- requirements/extra-odt.txt | 2 +- requirements/extra-paddleocr.txt | 30 +++++------ requirements/extra-pandoc.txt | 2 +- requirements/extra-pdf-image.txt | 84 ++++++++++++++++++++++--------- requirements/extra-pptx.txt | 2 +- requirements/extra-xlsx.txt | 6 +-- requirements/huggingface.txt | 58 ++++++++++++++++++--- requirements/ingest/ingest.txt | 2 +- requirements/test.txt | 19 ++++--- scripts/pip-compile.sh | 2 +- 
unstructured/__version__.py | 2 +- 20 files changed, 172 insertions(+), 100 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 94e2d08612..52f1941a1e 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -19,7 +19,7 @@ jobs: setup: strategy: matrix: - python-version: ["3.9","3.10","3.11", "3.12"] + python-version: ["3.10","3.11", "3.12"] runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 @@ -31,7 +31,7 @@ jobs: check-deps: strategy: matrix: - python-version: ["3.9","3.10","3.11", "3.12"] + python-version: ["3.10","3.11", "3.12"] runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 @@ -45,7 +45,7 @@ jobs: check-extras: strategy: matrix: - python-version: [ "3.9","3.10","3.11","3.12" ] + python-version: ["3.10","3.11","3.12"] runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 @@ -78,7 +78,7 @@ jobs: lint: strategy: matrix: - python-version: ["3.9","3.10","3.11"] + python-version: ["3.10","3.11"] runs-on: ubuntu-latest needs: [setup, changelog] steps: @@ -117,7 +117,7 @@ jobs: test_unit: strategy: matrix: - python-version: ["3.9","3.10","3.11", "3.12"] + python-version: ["3.10","3.11", "3.12"] runs-on: ubuntu-latest needs: [setup, lint] steps: @@ -224,7 +224,7 @@ jobs: setup_ingest: strategy: matrix: - python-version: [ "3.9","3.10" ] + python-version: ["3.10"] runs-on: ubuntu-latest needs: [setup] steps: @@ -237,7 +237,7 @@ jobs: test_ingest_src: strategy: matrix: - python-version: ["3.9","3.10"] + python-version: ["3.10"] runs-on: ubuntu-latest-m needs: [setup_ingest, lint] steps: @@ -323,7 +323,7 @@ jobs: test_json_to_html: strategy: matrix: - python-version: ["3.9","3.10"] + python-version: ["3.10"] runs-on: ubuntu-latest-m needs: [setup, lint] steps: diff --git a/CHANGELOG.md b/CHANGELOG.md index 79143c8413..2140662067 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,8 @@ +## 0.17.10 +- Drop Python 3.9 support as it reaches EOL in October 2025 +- Update pip-compile script to use Python 
3.10 and newer +- Update all packages using pip-compile + ## 0.17.9 - Patch various CVEs diff --git a/requirements/base.txt b/requirements/base.txt index 0daee868f6..09b9a0bdf9 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.9 +# This file is autogenerated by pip-compile with Python 3.10 # by the following command: # # pip-compile ./base.in @@ -24,11 +24,11 @@ charset-normalizer==3.4.2 # via # requests # unstructured-client -click==8.1.8 +click==8.2.1 # via # nltk # python-oxmsg -cryptography==45.0.3 +cryptography==45.0.4 # via unstructured-client dataclasses-json==0.6.7 # via @@ -76,7 +76,7 @@ nest-asyncio==1.6.0 # via unstructured-client nltk==3.9.1 # via -r ./base.in -numpy==2.0.2 +numpy==2.2.6 # via -r ./base.in olefile==0.47 # via python-oxmsg @@ -141,11 +141,11 @@ typing-inspect==0.9.0 # unstructured-client unstructured-client==0.25.9 # via - # -c ././deps/constraints.txt + # -c ./deps/constraints.txt # -r ./base.in -urllib3==1.26.20 +urllib3==2.4.0 # via - # -c ././deps/constraints.txt + # -c ./deps/constraints.txt # requests # unstructured-client webencodings==0.5.1 diff --git a/requirements/deps/constraints.txt b/requirements/deps/constraints.txt index 9659e8bac1..88efdd5d2b 100644 --- a/requirements/deps/constraints.txt +++ b/requirements/deps/constraints.txt @@ -13,7 +13,7 @@ grpcio>=1.65.5 tokenizers>=0.21,<0.22 # TODO: Constaint due to boto, with python before 3.10 not requiring openssl 1.1.1, remove when that gets # updated or we drop support for 3.9 -urllib3<1.27 +urllib3<3.0.0 # TODO: Constriant due to aiobotocore, remove when that gets updates: botocore<1.34.132 # TODO: Constriant due to both 8.5.0 and 8.4.0 being installed during pip-compile diff --git a/requirements/dev.txt b/requirements/dev.txt index 632e7e299f..ac2fa3cb41 100644 --- a/requirements/dev.txt +++ b/requirements/dev.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with 
Python 3.9 +# This file is autogenerated by pip-compile with Python 3.10 # by the following command: # # pip-compile ./dev.in @@ -8,7 +8,7 @@ build==1.2.2.post1 # via pip-tools cfgv==3.4.0 # via pre-commit -click==8.1.8 +click==8.2.1 # via # -c ./base.txt # -c ./test.txt @@ -19,10 +19,6 @@ filelock==3.18.0 # via virtualenv identify==2.6.12 # via pre-commit -importlib-metadata==8.7.0 - # via - # -c ././deps/constraints.txt - # build nodeenv==1.9.1 # via pre-commit packaging==25.0 @@ -53,8 +49,6 @@ virtualenv==20.31.2 # via pre-commit wheel==0.45.1 # via pip-tools -zipp==3.22.0 - # via importlib-metadata # The following packages are considered to be unsafe in a requirements file: # pip diff --git a/requirements/extra-csv.txt b/requirements/extra-csv.txt index 74e069b2de..d37c6a53dc 100644 --- a/requirements/extra-csv.txt +++ b/requirements/extra-csv.txt @@ -1,10 +1,10 @@ # -# This file is autogenerated by pip-compile with Python 3.9 +# This file is autogenerated by pip-compile with Python 3.10 # by the following command: # # pip-compile ./extra-csv.in # -numpy==2.0.2 +numpy==2.2.6 # via # -c ./base.txt # pandas diff --git a/requirements/extra-docx.txt b/requirements/extra-docx.txt index 831f636e57..b71b1cd6d7 100644 --- a/requirements/extra-docx.txt +++ b/requirements/extra-docx.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.9 +# This file is autogenerated by pip-compile with Python 3.10 # by the following command: # # pip-compile ./extra-docx.in diff --git a/requirements/extra-epub.txt b/requirements/extra-epub.txt index 460408c418..db97d6a9be 100644 --- a/requirements/extra-epub.txt +++ b/requirements/extra-epub.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.9 +# This file is autogenerated by pip-compile with Python 3.10 # by the following command: # # pip-compile ./extra-epub.in diff --git a/requirements/extra-markdown.txt b/requirements/extra-markdown.txt index bcdf3368f8..c0c3a476ad 100644 --- 
a/requirements/extra-markdown.txt +++ b/requirements/extra-markdown.txt @@ -1,14 +1,8 @@ # -# This file is autogenerated by pip-compile with Python 3.9 +# This file is autogenerated by pip-compile with Python 3.10 # by the following command: # # pip-compile ./extra-markdown.in # -importlib-metadata==8.7.0 - # via - # -c ././deps/constraints.txt - # markdown markdown==3.8 # via -r ./extra-markdown.in -zipp==3.22.0 - # via importlib-metadata diff --git a/requirements/extra-odt.txt b/requirements/extra-odt.txt index 94bd199821..4c92aae6cc 100644 --- a/requirements/extra-odt.txt +++ b/requirements/extra-odt.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.9 +# This file is autogenerated by pip-compile with Python 3.10 # by the following command: # # pip-compile ./extra-odt.in diff --git a/requirements/extra-paddleocr.txt b/requirements/extra-paddleocr.txt index 16502d6bbb..9dce312d7b 100644 --- a/requirements/extra-paddleocr.txt +++ b/requirements/extra-paddleocr.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.9 +# This file is autogenerated by pip-compile with Python 3.10 # by the following command: # # pip-compile ./extra-paddleocr.in @@ -32,19 +32,17 @@ charset-normalizer==3.4.2 # via # -c ./base.txt # requests -cython==3.1.1 +cython==3.1.2 # via unstructured-paddleocr decorator==5.2.1 # via paddlepaddle -eval-type-backport==0.2.2 - # via albumentations exceptiongroup==1.3.0 # via # -c ./base.txt # anyio fire==0.7.0 # via unstructured-paddleocr -fonttools==4.58.1 +fonttools==4.58.2 # via unstructured-paddleocr h11==0.16.0 # via @@ -72,11 +70,11 @@ lxml==5.4.0 # via # -c ./base.txt # python-docx -networkx==3.2.1 +networkx==3.4.2 # via # paddlepaddle # scikit-image -numpy==2.0.2 +numpy==2.2.6 # via # -c ./base.txt # albucore @@ -117,7 +115,7 @@ pillow==11.2.1 # unstructured-paddleocr protobuf==6.31.1 # via - # -c ././deps/constraints.txt + # -c ./deps/constraints.txt # paddlepaddle pyclipper==1.3.0.post6 
# via unstructured-paddleocr @@ -139,15 +137,15 @@ requests==2.32.4 # via # -c ./base.txt # unstructured-paddleocr -scikit-image==0.24.0 +scikit-image==0.25.2 # via unstructured-paddleocr -scipy==1.13.1 +scipy==1.15.3 # via # albumentations # scikit-image -shapely==2.0.7 +shapely==2.1.1 # via unstructured-paddleocr -simsimd==6.4.7 +simsimd==6.4.9 # via albucore sniffio==1.3.1 # via @@ -161,7 +159,7 @@ stringzilla==3.12.5 # via albucore termcolor==3.1.0 # via fire -tifffile==2024.8.30 +tifffile==2025.5.10 # via scikit-image tqdm==4.67.1 # via @@ -170,8 +168,6 @@ tqdm==4.67.1 typing-extensions==4.14.0 # via # -c ./base.txt - # albucore - # albumentations # anyio # beautifulsoup4 # exceptiongroup @@ -184,8 +180,8 @@ typing-inspection==0.4.1 # via pydantic unstructured-paddleocr==2.10.0 # via -r ./extra-paddleocr.in -urllib3==1.26.20 +urllib3==2.4.0 # via - # -c ././deps/constraints.txt # -c ./base.txt + # -c ./deps/constraints.txt # requests diff --git a/requirements/extra-pandoc.txt b/requirements/extra-pandoc.txt index dd397c3845..95e2170080 100644 --- a/requirements/extra-pandoc.txt +++ b/requirements/extra-pandoc.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.9 +# This file is autogenerated by pip-compile with Python 3.10 # by the following command: # # pip-compile ./extra-pandoc.in diff --git a/requirements/extra-pdf-image.txt b/requirements/extra-pdf-image.txt index 22597ecfe6..5c9121dab4 100644 --- a/requirements/extra-pdf-image.txt +++ b/requirements/extra-pdf-image.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.9 +# This file is autogenerated by pip-compile with Python 3.10 # by the following command: # # pip-compile ./extra-pdf-image.in @@ -25,9 +25,9 @@ charset-normalizer==3.4.2 # requests coloredlogs==15.0.1 # via onnxruntime -contourpy==1.3.0 +contourpy==1.3.2 # via matplotlib -cryptography==45.0.3 +cryptography==45.0.4 # via # -c ./base.txt # pdfminer-six @@ -44,7 +44,7 @@ 
filelock==3.18.0 # transformers flatbuffers==25.2.10 # via onnxruntime -fonttools==4.58.1 +fonttools==4.58.2 # via matplotlib fsspec==2025.5.1 # via @@ -62,16 +62,16 @@ googleapis-common-protos==1.70.0 # via # google-api-core # grpcio-status -grpcio==1.72.1 +grpcio==1.73.0 # via - # -c ././deps/constraints.txt + # -c ./deps/constraints.txt # google-api-core # grpcio-status -grpcio-status==1.72.1 +grpcio-status==1.73.0 # via google-api-core hf-xet==1.1.3 # via huggingface-hub -huggingface-hub==0.32.4 +huggingface-hub==0.32.5 # via # accelerate # timm @@ -84,11 +84,9 @@ idna==3.10 # via # -c ./base.txt # requests -importlib-resources==6.5.2 - # via matplotlib jinja2==3.1.6 # via torch -kiwisolver==1.4.7 +kiwisolver==1.4.8 # via matplotlib lxml==5.4.0 # via @@ -96,13 +94,13 @@ lxml==5.4.0 # pikepdf markupsafe==3.0.2 # via jinja2 -matplotlib==3.9.4 +matplotlib==3.10.3 # via unstructured-inference mpmath==1.3.0 # via sympy -networkx==3.2.1 +networkx==3.4.2 # via torch -numpy==2.0.2 +numpy==2.2.6 # via # -c ./base.txt # accelerate @@ -117,13 +115,50 @@ numpy==2.0.2 # torchvision # transformers # unstructured-inference +nvidia-cublas-cu12==12.6.4.1 + # via + # nvidia-cudnn-cu12 + # nvidia-cusolver-cu12 + # torch +nvidia-cuda-cupti-cu12==12.6.80 + # via torch +nvidia-cuda-nvrtc-cu12==12.6.77 + # via torch +nvidia-cuda-runtime-cu12==12.6.77 + # via torch +nvidia-cudnn-cu12==9.5.1.17 + # via torch +nvidia-cufft-cu12==11.3.0.4 + # via torch +nvidia-cufile-cu12==1.11.1.6 + # via torch +nvidia-curand-cu12==10.3.7.77 + # via torch +nvidia-cusolver-cu12==11.7.1.2 + # via torch +nvidia-cusparse-cu12==12.5.4.2 + # via + # nvidia-cusolver-cu12 + # torch +nvidia-cusparselt-cu12==0.6.3 + # via torch +nvidia-nccl-cu12==2.26.2 + # via torch +nvidia-nvjitlink-cu12==12.6.85 + # via + # nvidia-cufft-cu12 + # nvidia-cusolver-cu12 + # nvidia-cusparse-cu12 + # torch +nvidia-nvtx-cu12==12.6.77 + # via torch omegaconf==2.3.0 # via effdet onnx==1.18.0 # via # -r ./extra-pdf-image.in # 
unstructured-inference -onnxruntime==1.19.2 +onnxruntime==1.22.0 # via # -r ./extra-pdf-image.in # unstructured-inference @@ -145,7 +180,7 @@ pdf2image==1.17.0 # via -r ./extra-pdf-image.in pdfminer-six==20250327 # via - # -c ././deps/constraints.txt + # -c ./deps/constraints.txt # -r ./extra-pdf-image.in # unstructured-inference pi-heif==0.22.0 @@ -166,7 +201,7 @@ proto-plus==1.26.1 # google-cloud-vision protobuf==6.31.1 # via - # -c ././deps/constraints.txt + # -c ./deps/constraints.txt # google-api-core # google-cloud-vision # googleapis-common-protos @@ -235,7 +270,7 @@ safetensors==0.5.3 # accelerate # timm # transformers -scipy==1.13.1 +scipy==1.15.3 # via unstructured-inference six==1.17.0 # via @@ -251,7 +286,7 @@ timm==1.0.15 # unstructured-inference tokenizers==0.21.1 # via - # -c ././deps/constraints.txt + # -c ./deps/constraints.txt # transformers torch==2.7.1 # via @@ -271,6 +306,8 @@ tqdm==4.67.1 # transformers transformers==4.52.4 # via unstructured-inference +triton==3.3.1 + # via torch typing-extensions==4.14.0 # via # -c ./base.txt @@ -284,14 +321,15 @@ unstructured-inference==1.0.5 # via -r ./extra-pdf-image.in unstructured-pytesseract==0.3.15 # via -r ./extra-pdf-image.in -urllib3==1.26.20 +urllib3==2.4.0 # via - # -c ././deps/constraints.txt # -c ./base.txt + # -c ./deps/constraints.txt # requests wrapt==1.17.2 # via # -c ./base.txt # deprecated -zipp==3.22.0 - # via importlib-resources + +# The following packages are considered to be unsafe in a requirements file: +# setuptools diff --git a/requirements/extra-pptx.txt b/requirements/extra-pptx.txt index 1664e9f404..d739fe0367 100644 --- a/requirements/extra-pptx.txt +++ b/requirements/extra-pptx.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.9 +# This file is autogenerated by pip-compile with Python 3.10 # by the following command: # # pip-compile ./extra-pptx.in diff --git a/requirements/extra-xlsx.txt b/requirements/extra-xlsx.txt index 
922e00bac0..e309ec0961 100644 --- a/requirements/extra-xlsx.txt +++ b/requirements/extra-xlsx.txt @@ -1,14 +1,14 @@ # -# This file is autogenerated by pip-compile with Python 3.9 +# This file is autogenerated by pip-compile with Python 3.10 # by the following command: # # pip-compile ./extra-xlsx.in # et-xmlfile==2.0.0 # via openpyxl -networkx==3.2.1 +networkx==3.4.2 # via -r ./extra-xlsx.in -numpy==2.0.2 +numpy==2.2.6 # via # -c ./base.txt # pandas diff --git a/requirements/huggingface.txt b/requirements/huggingface.txt index a5d50c8cc5..c9645c27f9 100644 --- a/requirements/huggingface.txt +++ b/requirements/huggingface.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.9 +# This file is autogenerated by pip-compile with Python 3.10 # by the following command: # # pip-compile ./huggingface.in @@ -12,7 +12,7 @@ charset-normalizer==3.4.2 # via # -c ./base.txt # requests -click==8.1.8 +click==8.2.1 # via # -c ./base.txt # sacremoses @@ -27,7 +27,7 @@ fsspec==2025.5.1 # torch hf-xet==1.1.3 # via huggingface-hub -huggingface-hub==0.32.4 +huggingface-hub==0.32.5 # via # tokenizers # transformers @@ -49,12 +49,49 @@ markupsafe==3.0.2 # via jinja2 mpmath==1.3.0 # via sympy -networkx==3.2.1 +networkx==3.4.2 # via torch -numpy==2.0.2 +numpy==2.2.6 # via # -c ./base.txt # transformers +nvidia-cublas-cu12==12.6.4.1 + # via + # nvidia-cudnn-cu12 + # nvidia-cusolver-cu12 + # torch +nvidia-cuda-cupti-cu12==12.6.80 + # via torch +nvidia-cuda-nvrtc-cu12==12.6.77 + # via torch +nvidia-cuda-runtime-cu12==12.6.77 + # via torch +nvidia-cudnn-cu12==9.5.1.17 + # via torch +nvidia-cufft-cu12==11.3.0.4 + # via torch +nvidia-cufile-cu12==1.11.1.6 + # via torch +nvidia-curand-cu12==10.3.7.77 + # via torch +nvidia-cusolver-cu12==11.7.1.2 + # via torch +nvidia-cusparse-cu12==12.5.4.2 + # via + # nvidia-cusolver-cu12 + # torch +nvidia-cusparselt-cu12==0.6.3 + # via torch +nvidia-nccl-cu12==2.26.2 + # via torch +nvidia-nvjitlink-cu12==12.6.85 + # via + # 
nvidia-cufft-cu12 + # nvidia-cusolver-cu12 + # nvidia-cusparse-cu12 + # torch +nvidia-nvtx-cu12==12.6.77 + # via torch packaging==25.0 # via # -c ./base.txt @@ -88,7 +125,7 @@ sympy==1.14.0 # via torch tokenizers==0.21.1 # via - # -c ././deps/constraints.txt + # -c ./deps/constraints.txt # transformers torch==2.7.1 # via -r ./huggingface.in @@ -100,13 +137,18 @@ tqdm==4.67.1 # transformers transformers==4.52.4 # via -r ./huggingface.in +triton==3.3.1 + # via torch typing-extensions==4.14.0 # via # -c ./base.txt # huggingface-hub # torch -urllib3==1.26.20 +urllib3==2.4.0 # via - # -c ././deps/constraints.txt # -c ./base.txt + # -c ./deps/constraints.txt # requests + +# The following packages are considered to be unsafe in a requirements file: +# setuptools diff --git a/requirements/ingest/ingest.txt b/requirements/ingest/ingest.txt index 364f499029..e817913f10 100644 --- a/requirements/ingest/ingest.txt +++ b/requirements/ingest/ingest.txt @@ -1,5 +1,5 @@ unstructured-ingest[airtable, astradb, azure, azure-cognitive-search, bedrock, biomed, box, chroma, clarifai, confluence, couchbase, databricks-volumes, delta-table, discord, dropbox, elasticsearch, embed-huggingface, embed-octoai, embed-vertexai, embed-voyageai, gcs, github, gitlab, google-drive, hubspot, jira, kafka, kdbai, milvus, mongodb, notion, onedrive, openai, opensearch, outlook, pinecone, postgres, qdrant, reddit, remote, s3, salesforce, sftp, sharepoint, singlestore, slack, vectara, weaviate, wikipedia]>=0.2.1 s3fs>=2024.9.0 -urllib3>=1.26.20 +urllib3>=2.4.0 backoff>=2.2.1 httpx>=0.27.2 diff --git a/requirements/test.txt b/requirements/test.txt index ce0fd2cc62..7aba185049 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.9 +# This file is autogenerated by pip-compile with Python 3.10 # by the following command: # # pip-compile ./test.in @@ -10,7 +10,7 @@ autoflake==2.3.1 # via -r ./test.in black==25.1.0 # via -r 
./test.in -click==8.1.8 +click==8.2.1 # via # -c ./base.txt # black @@ -30,9 +30,9 @@ flake8-print==5.0.0 # via -r ./test.in freezegun==1.5.2 # via -r ./test.in -grpcio==1.72.1 +grpcio==1.73.0 # via - # -c ././deps/constraints.txt + # -c ./deps/constraints.txt # -r ./test.in iniconfig==2.1.0 # via pytest @@ -86,7 +86,7 @@ python-dateutil==2.9.0.post0 # via # -c ./base.txt # freezegun -ruff==0.11.12 +ruff==0.11.13 # via -r ./test.in semantic-version==2.10.0 # via liccheck @@ -107,12 +107,10 @@ types-click==7.1.8 # via -r ./test.in types-markdown==3.8.0.20250415 # via -r ./test.in -types-requests==2.31.0.6 +types-requests==2.32.4.20250611 # via -r ./test.in types-tabulate==0.9.0.20241207 # via -r ./test.in -types-urllib3==1.26.25.14 - # via types-requests typing-extensions==4.14.0 # via # -c ./base.txt @@ -124,3 +122,8 @@ typing-extensions==4.14.0 # typing-inspection typing-inspection==0.4.1 # via pydantic +urllib3==2.4.0 + # via + # -c ./base.txt + # -c ./deps/constraints.txt + # types-requests diff --git a/scripts/pip-compile.sh b/scripts/pip-compile.sh index ece191698b..460e99733f 100755 --- a/scripts/pip-compile.sh +++ b/scripts/pip-compile.sh @@ -2,7 +2,7 @@ # python version must match lowest supported (3.9) major=3 -minor=9 +minor=10 if ! 
python -c "import sys; assert sys.version_info.major == $major and sys.version_info.minor == $minor"; then echo "python version not equal to expected $major.$minor: $(python --version)" exit 1 diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 6527bfeb22..1f320cd5fd 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.17.9" # pragma: no cover +__version__ = "0.17.10" # pragma: no cover From ec209c6b5f9f24b4aabfa3bc8145ab896e7afd66 Mon Sep 17 00:00:00 2001 From: Pluto Date: Wed, 11 Jun 2025 13:55:02 +0200 Subject: [PATCH 24/40] Remove IDs from HTML code (#4012) In this pull request parent-child relationship for elements generated with v2 parser is based on actual element IDs instead of IDs baked somewhere in the HTML script. With some extra bug fixing it allowed for significantly simplifying json -> HTML script --- CHANGELOG.md | 9 + scripts/html/rendered_html_from_elements.py | 64 +- .../documents/html_files/example.html | 46 +- .../html_files/example_full_doc.html | 1336 ++++++++--------- .../example_with_alternative_text.html | 10 +- .../example_with_inline_fields.html | 14 +- .../documents/html_files/three_tables.html | 8 +- .../unstructured_json_output/example.json | 20 +- .../example_full_doc.json | 186 +-- .../example_with_alternative_text.json | 8 +- .../example_with_inline_fields.json | 8 +- .../three_tables.json | 6 +- ...t_html_to_unstructured_and_back_parsing.py | 240 +-- ...structured_elements_to_ontology_parsing.py | 123 +- unstructured/__version__.py | 2 +- unstructured/documents/mappings.py | 16 +- unstructured/documents/ontology.py | 43 +- .../partition/html/transformations.py | 59 +- 18 files changed, 1098 insertions(+), 1100 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2140662067..15d307657c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,12 @@ +## 0.17.11-dev0 + +### Enhancements + +### Features + +### Fixes +- Invalid elements IDs are not visible 
in VLM output. Parent-child hierarchy is now retrieved based on unstructured element ID, instead of id injected into HTML code of element. + ## 0.17.10 - Drop Python 3.9 support as it reaches EOL in October 2025 - Update pip-compile script to use Python 3.10 and newer diff --git a/scripts/html/rendered_html_from_elements.py b/scripts/html/rendered_html_from_elements.py index 5789a83d14..019810e196 100644 --- a/scripts/html/rendered_html_from_elements.py +++ b/scripts/html/rendered_html_from_elements.py @@ -10,16 +10,12 @@ """ import argparse +import html import logging import os import select import sys -from collections import defaultdict -from typing import List, Sequence -from bs4 import BeautifulSoup - -from unstructured.documents import elements from unstructured.partition.html.transformations import unstructured_elements_to_ontology from unstructured.staging.base import elements_from_json @@ -28,48 +24,6 @@ logger = logging.getLogger(__name__) -def extract_document_div(html_content: str) -> str: - pos = html_content.find(">") - if pos != -1: - return html_content[: pos + 1] - logger.error("No '>' found in the HTML content.") - raise ValueError("No '>' found in the HTML content.") - - -def extract_page_div(html_content: str) -> str: - soup = BeautifulSoup(html_content, "html.parser") - page_divs = soup.find_all("div", class_="Page") - if len(page_divs) != 1: - logger.error( - "Expected exactly one
    element with class 'Page'. Found %d.", len(page_divs) - ) - raise ValueError("Expected exactly one
    element with class 'Page'.") - return str(page_divs[0]) - - -def fold_document_div( - html_document_start: str, html_document_end: str, html_per_page: List[str] -) -> str: - html_document = html_document_start - for page_html in html_per_page: - html_document += page_html - html_document += html_document_end - return html_document - - -def group_elements_by_page( - unstructured_elements: Sequence[elements.Element], -) -> Sequence[Sequence[elements.Element]]: - pages_dict = defaultdict(list) - - for element in unstructured_elements: - page_number = element.metadata.page_number - pages_dict[page_number].append(element) - - pages_list = list(pages_dict.values()) - return pages_list - - def rendered_html(*, filepath: str | None = None, text: str | None = None) -> str: """Renders HTML from a JSON file with unstructured elements. @@ -91,18 +45,10 @@ def rendered_html(*, filepath: str | None = None, text: str | None = None) -> st logger.info("Rendering HTML from text.") unstructured_elements = elements_from_json(filename=filepath, text=text) - unstructured_elements_per_page = group_elements_by_page(unstructured_elements) - # parsed_ontology = unstructured_elements_to_ontology(unstructured_elements) - parsed_ontology_per_page = [ - unstructured_elements_to_ontology(elements) for elements in unstructured_elements_per_page - ] - html_per_page = [parsed_ontology.to_html() for parsed_ontology in parsed_ontology_per_page] - - html_document_start = extract_document_div(html_per_page[0]) - html_document_end = "
    " - html_per_page = [extract_page_div(page) for page in html_per_page] - - return fold_document_div(html_document_start, html_document_end, html_per_page) + ontology_root = unstructured_elements_to_ontology(unstructured_elements) + html_document = ontology_root.to_html() + unescaped_html = html.unescape(html_document) + return unescaped_html def _main(): diff --git a/test_unstructured/documents/html_files/example.html b/test_unstructured/documents/html_files/example.html index 14be089463..3abd541255 100644 --- a/test_unstructured/documents/html_files/example.html +++ b/test_unstructured/documents/html_files/example.html @@ -1,41 +1,41 @@ - -
    -
    -

    + +
    +
    +

    Header

    -
    -
    -