From 349728162e59196191b939c14f9d6973447dcdf3 Mon Sep 17 00:00:00 2001 From: Sri Sudarsan Date: Fri, 21 Mar 2025 21:57:13 +0530 Subject: [PATCH 01/15] Matches prefix to verify presence of DOCX,PPTX,XLSX files instead of standard file names (#3959) Instead of looking for the presence of `word/document.xml`, `ppt/presentation.xml` and `xl/workbook.xml` to identify DOCX, PPTX and XLSX files, we look for the prefixes `word/document*.xml`, `ppt/presentation*.xml` and `xl/workbook*.xml`, as certain files generated by Office 365 have files with different names. Fixes https://github.com/Unstructured-IO/unstructured/issues/3937 --------- Co-authored-by: Yao You --- CHANGELOG.md | 9 +++++++++ test_unstructured/file_utils/test_filetype.py | 10 ++++++++++ .../file_type/test_document_from_office365.docx | Bin 0 -> 18752 bytes unstructured/__version__.py | 2 +- unstructured/file_utils/filetype.py | 6 +++--- 5 files changed, 23 insertions(+), 4 deletions(-) create mode 100644 test_unstructured/testfiles/file_type/test_document_from_office365.docx diff --git a/CHANGELOG.md b/CHANGELOG.md index aa47187bdc..2fb45d5385 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,12 @@ +## 0.17.3-dev0 + +### Enhancements + +### Features + +### Fixes +- **Fixes wrong detection of office files** certain office files wrongly identified as .ZIP when office(.docx,.xlsx and .pptx) files containing files other than word/document.xml, xl/workbook.xml and ppt/presentation.xml respectively will now be identified correctly by looking for word/document\*.xml, xl/workbook\*.xml and ppt/presentation\*.xml + ## 0.17.2 * Fix Image in a
tag is "UncategorizedText" with no .text diff --git a/test_unstructured/file_utils/test_filetype.py b/test_unstructured/file_utils/test_filetype.py index 8376e4440a..ec6c805f34 100644 --- a/test_unstructured/file_utils/test_filetype.py +++ b/test_unstructured/file_utils/test_filetype.py @@ -15,6 +15,7 @@ LogCaptureFixture, Mock, example_doc_path, + input_path, patch, property_mock, ) @@ -30,6 +31,7 @@ is_in_docker = os.path.exists("/.dockerenv") + # ================================================================================================ # STRATEGY #1 - DIRECT DETECTION OF CFB/ZIP-BASED BINARY FILE TYPES (8 TYPES) # ================================================================================================ @@ -987,3 +989,11 @@ def test_json_content_type_is_disambiguated_for_ndjson(): file_buffer.name = "filename.pdf" predicted_type = detect_filetype(file=file_buffer, content_type="application/json") assert predicted_type == FileType.NDJSON + + +def test_office_files_when_document_archive_has_non_standard_prefix(): + + predicted_type = detect_filetype( + file_path=input_path("file_type/test_document_from_office365.docx") + ) + assert predicted_type == FileType.DOCX diff --git a/test_unstructured/testfiles/file_type/test_document_from_office365.docx b/test_unstructured/testfiles/file_type/test_document_from_office365.docx new file mode 100644 index 0000000000000000000000000000000000000000..fd9ca065eb8f6491cf8b1269a8558f3f7e9a2c31 GIT binary patch literal 18752 zcmeHvby!?W)92vsPSD^QEV#Qn!5xCTTW|>;JXnIeCAfQVcXxLUzLVVCH}c+<@7euh zpWXLRJguZM>LV`&1`Z4O>mUX^0S|c)0N`6g2EM52Xs_>Js{Miic+eZ^ zJAQlr$A|y(@Be@VNLW9#rj-`1Ai)ylFY zW>6NC5j{{hXLo6L#Sgn3p0V6EqZ_=_$?BLP;z7zoU~5+Et9Y~DArbuE(2Vx*jr)<@ z?dcV7+F=Aa7qkM0=BR>#BIqmjf`i;lO-uSx%C|iw8*}RUrcm#O_NcC7gIBo0m60X- z$}2MIGCwe2iogYZc|Oa^5djxuSRaJoAd>}7n>vGA-b3GSKcE+-N7e)94VxX@*^Bsu zOP$v9&~;gOE<|wsG;7onXmMPHiZ_0{$3^BBRw{6|;YUgps|UA%k)C$|mHZ(WRtkCk zV|h`3I=d=U1-e!Tw@UG|Qj&{-kH!W^;oH?D2dy>SKGIa%_Y(b`G8$qdea)S2{fCen 
zI5+cmxVBzj2`){w;&$h%?(qAsC?GRv9NDZYXTw+m4rd>%!7V(C#4RasUZy8@sS zoLAPcW)*+?{?zmR%>&7sr~AvVjpLz~fna8XN#tossQR_q5}ygiBZiNH;(SbAcg4Ll zkiG`pIUhIV=qo;}wabzgfpb)6%E;ZfvVFt1!VWfw^ZHm3cC9hKq#Y zppe!(NBDd+v8p1H4ugP5Blcs1xG}OKp}CWoc9B>UI@ee@I_CS#!Lr0Sk`5O8TflQO5qDg9*c151dsIn znI*JsWE(|V+mD5+^=wyt*;jq{X{uviA2$W#z7o0`E4soNk!Q$TdW*zi1zh;i?#c+a z@i9*5xq73L4U9FVQ9hMm>x0O71Af=M>(#9uY;XYJB|HFt^0($W7&|(eS(`X~Q@pBI z+Sc>z$jzZ2x1DHj#OwmvXDaSgGsgxo_C!<@>JTf!E#L_?l_%8tHbFMJRy%806I zDttk&+0C=p2fieNz^8+lI?Ixk?4`ekEqI_YR<tjp|CUss+j9W~U$_o4yn@3N^eQ+Xk74MU230bvuQHqx@Vu{W<0VQ|Bm( zK|ebAxeWFRcatVufqeH0sP8g}JWDw^OtMr}i*4yu8A`9xij=)zo8bIZvsACYwgyUk z$9zyEwt(1yW`qLeGIumxRf(eiAf~_>gh4VpoHVR5^Lfap_;>L&wFIsKjlw?qAsQ-_ zP;-Mh43bPXA)6n@1~Oh!6Xm&>aT-&U%$35?gWBa^WXIBz1t~k5mF(Q);ohE{ekI*H z7e@^W$V}pQ3NIc=e+QA>lDXU~=G!tgC+}m4yZ0jdWE?)EJflS*#dSf|NFcgkHT#Wf zY#7)V;Z*dTaW|vxu48e9vKt$<_r%5p{OEi zA~0h`&~>A&G)S+_*lpuAw^5zs>NqaFSQ@fl@aUXGhK7YDq zHPD-9WfoFk!h~$UufPRXg-)SSh8x#SmwJF_Qzv8J(&+?^VJXtFs&%=#kmu7;_D8U` zc($@5f|nC!;cs`m(%q~Xt3EXnNhzz)HGOj8M!?ZdgmeWUm$#pwOwoS2Y~6*7upZe< zhUl|$(>n~XMk-DnIYf9^DxdcsJpi6I;{wmy0v!AnX;#f>M7to9jzk)C#k)5mS6-w& zb|r8@XlI5lR5xMQlFuVdr#h9(^rEpi`xW@g&4(I9>hX7v5}DnnaR_;+r=S@YeV&@6 z3c$AvR&`!mWO!6(b(Y*f&##+1FW!GprFRj1c@YG=74WW)=E*evl(LJd<+G6G#av4G zm%#wWC};V$L%wK(AeJ_r-Lx+y=TUN}q&Q3D=Syqk1lK zKd3LpEar~)`{l9IgAXW9!{mWNmhMl;k83?i3RIJB=R;u8g~8@^LTuf!Koj@$-!r& zQqsq}uxt`i%4{!U51z+2( zAq}pNR(!3JG>FQj>ObHnzCIY{2c>U<)N0H`X<(0vCF`Bhx98G|*`U|OXZ(uM>!*$k zIZFb@=PZ?!4JWDa>3pU?k)==gR$ovk2wQCTxmpg{oEKK*;1`V`h@}{C1&GfdN)lFa zTX&)KSOE)hSN^u7oeW-DW5Kx5SYIEa7f|2k6t7-@y=VE*lZAAr53k-=Z+t(Xdm+dq z_Ts^)IPyC7;O_?nm`+AXFnwQc2$8NA@$`?cz$ z*nKvV=#BW<7j8V;o7)N(*=63?7J zBaMiB!%?#B&h%9>^3l0VZV>yT*N0SW+D4==!?StyN~x?mN_dK9jS=60WPrJEt+&UR zIA)XJ#MCr0Z?i2im09qNmfiscx zVTTH8Ck&h|h5xQ{dSh zq6BFh#cshC9ZGxDq%q92)4`BnVGHJ2)hIn@MDu8Lge!LEH4YIfV}^RVF#opfPAa)2 zQJ5ePUPxa1cOM@4A?9%i1@7t)ismZR+jwTd8cn^ zEii1OmO5U-oM(>0ArqwZ@Jp^f7gtOrnzKVGy?@K*#!(P0UQ18$(bR$gjxf6%A}%S! 
z{)|dlUhmCQZ6?fq$KtM#cBek}vJxS*TN;_&s)N%*%4?U07R}NbQs-z^26q(xkb@*8GwrGqmYs>&wS|p^axA6 zra5I5NuYpqkNL8abYuurcuqH6Y$?=69W^5-^n|8%J!i?kaOro-0 zdyICYZqQ90II)oG7{EQ01SoGV`1mveWC`n8($?p)39x`>xe8w#mD^Mk19&BtV34L< z1a5vf12Ci!cjVMh)c#ydYi?{OfZ+yTD37Jj(dACz_Cc61&>m~3$d5g&@C=CSM(M%) zBnctxG{DB(t=}B3xFZC|zrhU0|CNMx3T-9+ZCqJY%us z%va-Z++@vZK`V>hbV&*wR^RZK?v5zImYkxOYFr*O(_K;|^KuM(yw2`83-nCu?g}yl z6Lw#pQOc3gaos%II`jRDEpvlb2MP_y0T`{T7 zZfQHWj)t7&g^>-pLrM0M|AkC}zWZA#gmGW0*COYV7Qv!%vph%@4IPmr%)_lKB;GgcMk{tW zM!hl*Khl?q80CBOzW%g-B0`s({LTBMuQ}tA@`jE!^!bdp{ZrsQuky$;x_2lv(MUAW zh<=ciYIQ=XC0+P;^0B1*g{}`_D94W}G~tMTkfv&Nb*ZIQ_;>QLwEC4~X6LYH=Kw#5 zv;LElqw&6;(KiD8PNscRZ9P*Uf74`qNfC`E7Rh%a(!~0ORF$gfMXTxgzmZIMS~|6f zuu2(t_!YK@O0&+0VW>3bzSP6Z%Lsv4g(v%hHupv@JKQfWqWd&)36o%CNr3{ikj;e8 z*h4n4J`C@@pg6pqj44GT(Op7zsEhznXmA+o{uplMEHk%*J@;xlJEFV5%0*@_pXe~n zS1qJc3#+5BKy+_CvOp~K!urdFj8GI;uH~HD7^Cs{z<4ATzQ5SP;fAcLVcq23G-PK! zNv9fu{#BMVht)_L*Gpj&)Jf#DS!JP%ES0yPp<2v~ABBr5u8bBYOtgfm%|n*LQ;51{ zh}OxY?a;bvbB4c>Xy+@TgDZt`YS`c77+GGAfe%LO);SK!Vi*Lfi6tE+XROq~hoi*r;hx_UiKACiu<}BoSs7N*2 zTo*!C03Re~(-(_4T|Au2_DGLrn+iYInOqkN>;qqX_2&9OrwX;nsUoYkTLG&SZSUt6HMp~0ixYkE*Du-k}M?(av@J=e;t72NWE7LAn zXJJTnXH;V8Nmw}xP{oZB=<2dEecTZ8%*-}z_UC;z67lbQZMt5S5yb0;F21YUJO7kP zX~7&suDFh2Ask&r%j&nFH_fuKg*oU(%L z!1^%vf^Wa>YUE7jOPX`yvCv*hW0_L_?Bgg-L*ENmw?0}5<(XHCZd_q4B7W_20mhFU zbN07%Y3Q90ql=gm;tvyKstv3Jq`E54kgT<4mt3QDEti201y;si7^AeB*Iy3JR?%r@O~QmBYV^ zsm#+?qP#NHkm`>42LcmM9>sWENvC*M;*mM&zMWG^4CwUBOOeg@v_j z+i;Khsxl5T?&f}3pObRwpuiO^*Y#X3+qGG^IK$OVQRy^S@swQcbWA0ksHEf;O4wIn znv&tRLllIEVh#tQI9;LDPXwgM1>F_jC@Hr3A(#SML{m~kQ%^*5IWTjO0{ubEHY9DY z(xY%Zs9>Bol@shH7zarxG8fh=2iB?__ApU-HJe^glTL8IPWUTm<1u>5xRNEJwB?YC zGKP6*zd~p~NLc?3L8o!c9ImXAw6z>#9Rp)sS@xEF?)`H0=*{ayi`ZpwlxIku`_JPx zspB{O;y0VKKONM6nrOGz$)0kD9C9hNTu?GLOzP`?p=^w=p<Fp}V5kI#xkiy6v1g9S<9kZv{j1;7^nV#Nrhck)Sjg#Whk!WaaoA2T1anvxT#K58~ zu&!@cDy}Sur^BUr1FqjMUEO%Q$Eh_eXW4MEf#p_+1YvbPe;*?+SNC$3S#vg)yD^yx z{uJak5=3I5h%jP0${Skxc7Z0CxK*_K?qk^z79UJnjt 
zl$9)FUgr&b2*cQIPxhF3H%)9%T08=EoxXk;xHOzLY5QO)SE7s|7D)IOB4oago#pD@Lr??FnF_7LkwW6k?r_;^`FPZgAn!UKU9Ao35P5+4gKow6dfqPM+o*47YqN@xf4W3G9>`NrJ z_GpYMCVq?cSu5IW)%NF|^3~2-)2gqBPpDv>v`=iM-uOrNl`b<0V^H;S8r_WEDpjVt zzJJdhvcrqr!sAGcpETNq641sB<7AxQe?T#(FvnZyLJ{85y7}aD3c|m)9~@SN>4%%&7SE)iy8i0d3;qd#eBEl8V_P0>%vl03h`Y0KoWL zXpW}FR>m*BKQewx10JetmyNSxKhnH9lHyIyhROW+rX3Orc{}{A6q_BILn?ddqJwN; zG~k@7qy{yj)^Z77=Z$$Ry|0dk)uxFO&lOnIS|-;}pom6fp@KVJ>c7E=40XS8%L>zRI7j5Owks?gGyXp0}-8cRNCDZkML2 z%kI1@wQvlQy2s5`eKpt4-f(yo2HRu_H{+~H3{y_@1?SkD5Vk`D;(Tov=|ws(WZrdp>eAiK z#pA=c2Va3{q8_e_*>KCz^T%GJ&Pf!7&n+h^SRo;pUQykS z=Eo?H+KX&_AGVPptdqJ}`gSSm9On!5#wL3K@iirxS8@@BvpS;cX%}q-ZuhW>$_fH> zR9puVs~}IZvB863&Zt1Rxnda_iWhbUx}E&Rs8E1%qbBzBeM1emRVy9*EpidQgAaw) zV0{1@%0~`MXso}hsG9&GN9Ev9E6}%Nl+Rz<=man$qE(qEMN?nEZd54lJaQwM zN1AAX^9Tm`DTH7v#guM#6;EgP%abx)!p@MKXCn@B@G1+I@kRIj#Y1xW=f)=ZAuR!PdtW_t5Tyi|h@m{c!q0EN# z9z3tC=kdv73QOW@mJMmS!gIO5!adCD^^NGtYiq(0?#^RTbHa9YcYKWbZLinfA7+MAo@u#PPAv4p8vy1|-MWTh%i;B6!tzms=2iJ|)xGb$4Z@;;2|EEI-Z0_vWa~Td z`KGH@XEB}rm%xDg5I4|s*v84Se{VJ|o$Zqcs8;TmSf&qgLu$7d_shReu=YGzFN%Q+ zm8rnhcvJx1UlS~F4c^Jh*xHfl+eAT>i4CCq_-%Ii_ai4>!g`(ooxe=b^n>`B7P0II zxy*_PNY-v4S8}HC^Hlz^gvBDKQIbp8hY39z1Y|Wvz1aKbPj_g+EiF$A3x_%UT;}-G z<~TXi5&`+oIkyY7L^XA_=$>zIzv95u7&UhMNMGO3RPaeFwj{fc1VEXn5=2X~)zrH- zgCa`iJRO5pltzF5a+GOGqzxv5#_HTPE?a_!=@r@@o?62f^I+|XwGGW=R^>LMjFL_Q zVa;yT`NWbs(E7G?o7$B}z=6?}K1-*b%GC4HNJNPEV!Ql9P6Ut0MbXWSL4GU7YGofJ z4UX{Wy%?wHMGt|4?42nU@-(ei{!n2wJF=|~3&YRIsBnjuDRO&pFknKmx;f|@ zt7qMwGpbT_5K79tsM~eK*1OVslAS%)jVtgX;6D>fVpGK z6|~54D~KabRx{YZCE9Tr3?&_j#3`fe%8cxPt6r;h&feHCF$1`DECoT@e z`WeF@d=DAgPKLE?G0Vt|5%B6*(%C}gxwQx%(Drviw&wD+3G&(#BZ4|Ysz|#8PQ+FI zg2Jm-vpe_5?n*Q!keZEtg0d~L+hI4`yE#}o`O*kij>llGyo6B%gn39+c3~^Wu8&?g z5zVwmv8R+~&1H0<9j(icdJDu~^*72{{4)K7)?c1YxH)>_v>lf>D0JlurYl@HREb7E zzOHMJS?UDA($K6SKfxz^gTNHBdH&)A@~D;N0bQqx+J7zYa~HPGCiQrN|9fMj%1cZK z!k3Tl&z>O)xFTG;aK3MiMfB8RQk#4G?64_lx*!&U&i!-vT57u#d+>+=ZRHhD#lo!y zct*8YrR~Wqj`YWa%gkG@jn&>DdlNsJ`mJ=I^L2G*gf?fp_HrtA`1*y#N3d^t|9d$E 
z3ec(i*8V7mVEj)x#J`X9@2kwei*FJ5sc-EM@iG5DFTTGlDCCa6-ly6NEuY!xN)k+z|$Raj7qy&#oBSm}s68TuGtweh6wbd^$ zL>vumaAZl4`(LRvFIlr)Ir2HJUSFJhy78pdu}yCiE^clbxHT;dS-(o0pKV{P50eYa zJy@Qbn@Xg^NP>Dh;90kdE@4hglRJz_b&$jozi61?lQz2#3Td1oh1q$Tn53mM@Y-mL zF4wgAxl|>4UeT_nowcTqChe4(^+8%(@0^`*P-v1c$d)vNt}H_s-qsyR?vN!XkIQpv z6aiEY45G?R=m>>4+w$~&pHS(WsyOZwMTtPK%cH1k4}qyVl?37Ycaj)r?}*?Lk=W`N zXu!glA)BSr#|}o3OvMBIB3+9(h#%W2khN@_VNl4&Cq3`8wABsFxy;bKVVpU;txNBXAIJ{fpXqR#qrRTo3T%~(4 z94(b#(tp!s>Ap(;YWs6M2u}!VcMe9P?KP^4@$CIAx+%&rbb)*G^& zGIZ)`?rEo;`xYTa_kp-)hDbYt`uUUQFH>)yX+j)D#sC(T9>LALR7qKk_>sLk*f){W z3Kq}SKc1M7ZWN&}D0QA5HBk8~yz)GM%u%y02;jqcp%KQO-KVgrOceXj0HSqO>LZnr z$fKGyrc1d`$_c8Cu|Shk-2QPmTugH>izOiQAwHAA$B)cd;Xw{VJoA&5+_WB-1U?K7 z;%E;XW7`8PXNXq5y*2Ef zRHQQ0V6?;~&UCU6;k3o%)tkg1n1X%9Eld*U^f6cpGR5j9g|rPQW{x=iedgHIbT7f8 z&3YesckoOzAc?W^SE4&P@+|<;P@CR_`J7N9FiBkSrc4`K%?x+G@($3Fiikq`Rz8BSjjfHQrmO6X7MKOI@Rde%u$00JzcBim z_dd?Mz41k^oW(_WjXnAyLO4ho-VJR?pl~b5Wh?1stF$7)YjM;A^hHJXqfV#3s*zcm z^?K@PFGd=ia6ZF<)q>J50Ug{1A+NWki>)Wi=~SNr_2UTlLZyidupn)vH(AaL7}`Pf z0?*;N1KRjpa^Xxk9iY=&11LGnUeHu_*i?As7Fi2T6s|k=vNd>;joM(>^Dmt~}&4;iLND5SDar_9u zv0s3n5oKw!X-XaQF-i0ux;r@%0S%GBTv>cn*y%!%G-v#-3f?GxsA*D3Zt<{>{^q5K zRbsI85_l!Zkw*4x!Nv|`T6=aQI8H7R)D?x7+v`-8D1L-vjeO^1w9K!Jfm1x)nECFv zJzx73W}O2xt2fdyrnzjg!pHOY7vCdHyzXO)d_Ulx{3dG{{WfRvbYC9}dLJf`s>){t z3ii2vFMib;%Nb=&k`2rS_%U`(tg6(hO92qi1n|Ygsi~Pjt7%UDN;}@+D4TPqb6(tDesb#RpBK=<8LXf|+Btc)%45YrgE6+YNscKD-|yofrd^X-qScojWMW3*istuoLkxe7%j zmXK$Z!jq|rG3h||r5ba0)#@{}{OB|441yBLF84+TaF0d?VPFdYwoZ@Q6aRWe9G>jB z3%YyStaoN4CKDm|%#h($Dv|=MDnZlg)z=cdmyjZ_=Gf_*Lod?e1`b$UGKMmqmhe1H@(0{6$bJCxmH5J-`C_`xy?}$5_!~I$`C&wC&u4qd+ zF_8XTsrd%#-9((VpD$zgmsbTVq8F^iH@Q-A?UG*B4x!1nKc#*a|#f6dRZjO0RWXF!H1^Baq$+ZkD7@r6~`1np)I z%_*KfqbFP@fD-VHb5DOw;7r=+gf}UMFL{>MhGv4xoikgQI2g7?8T?vN#sgp%OF=^i zk*MxEF~#UB&3e~KOwH|lQe{@vrlviHec-RG%gEEfH95hd^?A@(g7}_@z(!!2g3u4^ zN$U%6iJHrL3HVd~z32C`(jY2;@B9yX{=f8V5hB_`fjuvB4fZJHdY9x_EJH1-U+i~> zlVl~XTQ$;1A%iY(@AA2R!DA);@DYdNw3Q=58nG}3(igp!aCdFx_?nb1B#!6uje#Ik 
zXZbQ+bG6sp1@B#)Tem|uWgkNVw+o_*8qp64oZVYd>2^?fvs$WxQP91S$VHqR2!9$aB!I&}@CXZq$A3g|ns zd}YV^dU#!gK#BPbszF{D%*ANN`2*o03KorV=V!TMSh7#P>ZD#4gVx=8b(sMbEF_zH zIjL@Az4f2d^^HMXDNchZyg7_fVSGe9Pnl^9!0#@XG2x$F~t6m%#n8_MO`12-v@ z7z>xSV7h#;;Jp=;FvxdGC*_`;kbduftptka|Fi!8J|6zl|G%jG&;EZ5W^hT_6EHH{ zGe&8A>)2)?o@X^fj>eR=)57MeA#lQeeA*mk=3K*2MDzLz9ig}o7hP1WDJsi%*}7%H z&!D2BhFOHgRxC)n;j6hUyCxM$x~G8(bSH1dl)80RLe2>vc#g^o}e`knGY~RkWbIiJD8Re6 zorF)R==)hedpOg*`-f254LF{^wLd~_mjCBN?O%HI|GKAt4b#8(G0mAhp(6pi=oDCc zkN!K;!VR> z4|1hb3j``QMyje&6rD;zTV;)gFPn~i`9HT0${vYJX&YzTHlBXd&|~H2LBvMYAEI9iA9Qe zV*(Z(jTMv|Wtc#9$4H;UBx@BNHP=)kAF{g44LPoB_60XJ#siptn8%yv#_Z>qT76Cp zH0j%2S;kMW;u7&1(|s+Ucs+wLl&}AoHn&cddZ2+?67yYCph5n%ukbJb=`T(BkFWI2 zs-}$$TKhA=16J;`n}p)HB4T3@k>x?mau%I;oW+PM#YqK7jZ1Qy?i3I6t``kvA1x`C zbuL0MUpBNQgN|e9WrtTT>P14l;W@|xC)w52z_gCARU zCH7iLG$fWHN3#aj42`KN&1qkVo$&49odC7|raH|!4A=~^oAc71>FxSSO(NZ6i;>Bu z71y-&iMkKgr8iGCda^1IrBe%G~CA>1c%r5Dd#(F)3^qh;sy zVqxAZTrAt1WS5tJaf`b4lK$KjtiH(bInPv2eexp%=bjfHLTbaqHXG3I_TW+y217EM zsU{5D3iuq)wAM@UDiC$Hl{AJ`<8umwS}CPI?I1sg8kGrw)2AZ?v5BI3o&EbhpR0%h z#3o-=PK(0S{hO0nb~C7qU(w^~tsi>-TQK*fEc~0d~h4uv_4O|NnBazK_9@7#S&G zAz09{-xKjb6W6?oaCSpe$Vf`hEm)Az6&M>Ksgl)GQw5Pmz6BD`>$9O)lc3@$F0KNX zNVAL5AY%nTgS;jjZQTxwyUJRq_<+=mUanI8VVJ4G%gMb!ikOACAV$)7D*8!=kN~L8 zY^druSi+gjlp!&xn5c>`{I?PA-G#Kr43Dq*9XgetDG4nkJ(EA=J>jricFNikE2~LJ zOLYZxqd0(15ecc|>r2X0pzBJ{fn2|Ipr^=24h@Iowo*>Bxwx)0Rqu&LHYapj+NNQo zsBGK_jYQ*PCFI2&kPNnendjgkf!`#vcF1?fVbmOdvSY&f;irUxL~S)#D>%HJfr?8B z_02~D-p@b?xbGOany;(uW^3%A{cYxJitqpCV~Ti_onew$hXo+-8G=}5plX*prAb(< zCLZkW&#RqC+=Ir(VFm~)O~lENnrtmO@PwvaPuDz}S~-3SK#uBP78EwcOl z`13Xte{uh;^853~2|u~t0P+51t$y4_;h(jqpX`5bPW;J^0W=AJvHxqU;-AcamX!WvPR04> z0@FVc{;aS3i7<-$1L4nV%ReE0FT42(002A?{ebu_YW$1;ch&i|eBmd;CCP8y_q~|m zpSXWrz5mJ0LiQW?_qF`Lul?`L{O=3iKY0NFK61bxdHU}u`ac1FPwoE-kU;k*z|Tbg zpD4fQ!+%A|qyH1-dw%RsfZy}XzXC+R_!Hp!W&9(`@9EiJQ9dyIit FileType | None: filenames = zip.namelist() - if "word/document.xml" in filenames: + if any(re.match(r"word/document.*\.xml$", filename) for 
filename in filenames): return FileType.DOCX - if "xl/workbook.xml" in filenames: + if any(re.match(r"xl/workbook.*\.xml$", filename) for filename in filenames): return FileType.XLSX - if "ppt/presentation.xml" in filenames: + if any(re.match(r"ppt/presentation.*\.xml$", filename) for filename in filenames): return FileType.PPTX # -- ODT and EPUB files place their MIME-type in `mimetype` in the archive root -- From 347a4e5d9ee42f32c1186f0f0dada93bf9910778 Mon Sep 17 00:00:00 2001 From: luke-kucing Date: Tue, 25 Mar 2025 15:38:47 -0400 Subject: [PATCH 02/15] =?UTF-8?q?manual=20trigger=20of=20workflows=20to=20?= =?UTF-8?q?publish=20new=20image=20and=20new=20vers=20tag=20in=20=E2=80=A6?= =?UTF-8?q?=20(#3965)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …quay There were some open CVEs in the base-image. Those are resolved so triggering a workflow with updated version tag --- CHANGELOG.md | 10 ++++++++++ unstructured/__version__.py | 2 +- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2fb45d5385..20c8650e10 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,12 @@ +## 0.17.3 + +### Enhancements + +### Features + +### Fixes +- Resolve open CVEs + ## 0.17.3-dev0 ### Enhancements @@ -102,6 +111,7 @@ ### Fixes - **Fix file type detection for NDJSON files** NDJSON files were being detected as JSON due to having the same mime-type. 
+- Base-image was updated to resolve CVEs; running pipeline to manually build ## 0.16.20 diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 433383a01d..af66a65e41 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.17.3-dev0" # pragma: no cover +__version__ = "0.17.3" # pragma: no cover From 3f07840b80a1157ee64af23302f5bb48dfc5e404 Mon Sep 17 00:00:00 2001 From: qued <64741807+qued@users.noreply.github.com> Date: Wed, 26 Mar 2025 18:37:03 -0500 Subject: [PATCH 03/15] chore: deprecate stage_for_label_studio (#3968) This PR is to address [a CVE](https://github.com/advisories/GHSA-rgv9-w7jp-m23g) that appeared in a recent scan. The CVE has to do with the package `label_studio_sdk`. This relates to the tool Label Studio, a data labeling platform. We built a staging function that takes a list of elements and converts it to a format suitable for passing to the Label Studio platform. We don't use the package with the vulnerability in the actual function; we only use it to test the output of the function against the Label Studio API schema. Even the test where we use it is sort of questionable in value, since it's really testing the schema against an old version of the Label Studio API (we are testing against a recording of the Label Studio API's responses stored using `vcrpy`). Label Studio has fixed the vulnerability as of version 1.0.10 of their SDK, but we're stuck on 1.0.5 because 1.0.6 and above require `numpy<2.0.0`. This leaves us with several choices of resolution, some of which are: 1. Downgrade `numpy` to upgrade `label_studio_sdk` to >=1.0.10 to resolve the CVE 2. Drop `label_studio_sdk` by either removing or rewriting the test. 3. Drop test and dev dependencies from the `unstructured` image. We've decided to do 2. _and_ 3. This PR handles 2., with 3. to be a follow-on PR. Here we add a deprecation notice to `stage_for_label_studio` and remove the offending test. 
Normally good practice would be to add a warning of future deprecation to the function for a reasonable amount of time, but in order to address the CVE immediately, we're deprecating it right away. ### Testing Install the dependencies (`make install`) into a fresh environment, and `pip list | grep label` should have no results. The scan artifact in CI should contain no "high" or "critical" CVEs. --- CHANGELOG.md | 9 + requirements/base.txt | 2 +- requirements/dev.txt | 8 +- requirements/extra-csv.txt | 4 +- requirements/extra-docx.txt | 2 +- requirements/extra-odt.txt | 2 +- requirements/extra-paddleocr.txt | 6 +- requirements/extra-pdf-image.txt | 12 +- requirements/extra-pptx.txt | 2 +- requirements/extra-xlsx.txt | 4 +- requirements/huggingface.txt | 4 +- requirements/test.in | 2 - requirements/test.txt | 138 +----- .../staging/test_label_studio.py | 61 --- .../cassettes/label_studio_upload.yaml | 414 ------------------ unstructured/__version__.py | 2 +- unstructured/staging/label_studio.py | 6 + 17 files changed, 45 insertions(+), 633 deletions(-) delete mode 100644 test_unstructured/vcr_fixtures/cassettes/label_studio_upload.yaml diff --git a/CHANGELOG.md b/CHANGELOG.md index 20c8650e10..875f098612 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,12 @@ +## 0.17.4 + +### Enhancements + +### Features + +### Fixes +- **Deprecate `stage_for_label_studio` and drop `label_studio_sdk` dependency.** This resolves a CVE due to the dependency on `label_studio_sdk`. 
+ ## 0.17.3 ### Enhancements diff --git a/requirements/base.txt b/requirements/base.txt index 17a25c4d40..78fc8ce871 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -125,7 +125,7 @@ tqdm==4.67.1 # via # -r ./base.in # nltk -typing-extensions==4.12.2 +typing-extensions==4.13.0 # via # -r ./base.in # anyio diff --git a/requirements/dev.txt b/requirements/dev.txt index 0de6c4eb02..4b489656fb 100644 --- a/requirements/dev.txt +++ b/requirements/dev.txt @@ -32,20 +32,18 @@ packaging==24.2 # build pip-tools==7.4.1 # via -r ./dev.in -platformdirs==4.3.6 +platformdirs==4.3.7 # via # -c ./test.txt # virtualenv -pre-commit==4.1.0 +pre-commit==4.2.0 # via -r ./dev.in pyproject-hooks==1.2.0 # via # build # pip-tools pyyaml==6.0.2 - # via - # -c ./test.txt - # pre-commit + # via pre-commit tomli==2.2.1 # via # -c ./test.txt diff --git a/requirements/extra-csv.txt b/requirements/extra-csv.txt index a5779f0a87..51885ae7ad 100644 --- a/requirements/extra-csv.txt +++ b/requirements/extra-csv.txt @@ -14,11 +14,11 @@ python-dateutil==2.9.0.post0 # via # -c ./base.txt # pandas -pytz==2025.1 +pytz==2025.2 # via pandas six==1.17.0 # via # -c ./base.txt # python-dateutil -tzdata==2025.1 +tzdata==2025.2 # via pandas diff --git a/requirements/extra-docx.txt b/requirements/extra-docx.txt index 7cdf55c7a7..b6a9158f4f 100644 --- a/requirements/extra-docx.txt +++ b/requirements/extra-docx.txt @@ -10,7 +10,7 @@ lxml==5.3.1 # python-docx python-docx==1.1.2 # via -r ./extra-docx.in -typing-extensions==4.12.2 +typing-extensions==4.13.0 # via # -c ./base.txt # python-docx diff --git a/requirements/extra-odt.txt b/requirements/extra-odt.txt index a157708ebd..fa8e746301 100644 --- a/requirements/extra-odt.txt +++ b/requirements/extra-odt.txt @@ -12,7 +12,7 @@ pypandoc==1.15 # via -r ./extra-odt.in python-docx==1.1.2 # via -r ./extra-odt.in -typing-extensions==4.12.2 +typing-extensions==4.13.0 # via # -c ./base.txt # python-docx diff --git a/requirements/extra-paddleocr.txt 
b/requirements/extra-paddleocr.txt index a5264d7840..84afee5161 100644 --- a/requirements/extra-paddleocr.txt +++ b/requirements/extra-paddleocr.txt @@ -107,7 +107,7 @@ packaging==24.2 # -c ./base.txt # lazy-loader # scikit-image -paddlepaddle==3.0.0rc1 +paddlepaddle==3.0.0 # via -r ./extra-paddleocr.in pillow==11.1.0 # via @@ -115,7 +115,7 @@ pillow==11.1.0 # paddlepaddle # scikit-image # unstructured-paddleocr -protobuf==6.30.1 +protobuf==6.30.2 # via # -c ././deps/constraints.txt # paddlepaddle @@ -167,7 +167,7 @@ tqdm==4.67.1 # via # -c ./base.txt # unstructured-paddleocr -typing-extensions==4.12.2 +typing-extensions==4.13.0 # via # -c ./base.txt # albucore diff --git a/requirements/extra-pdf-image.txt b/requirements/extra-pdf-image.txt index 0226cee3e6..061fb6de3b 100644 --- a/requirements/extra-pdf-image.txt +++ b/requirements/extra-pdf-image.txt @@ -158,7 +158,7 @@ proto-plus==1.26.1 # via # google-api-core # google-cloud-vision -protobuf==6.30.1 +protobuf==6.30.2 # via # -c ././deps/constraints.txt # google-api-core @@ -180,7 +180,7 @@ pycparser==2.22 # via # -c ./base.txt # cffi -pyparsing==3.2.1 +pyparsing==3.2.3 # via matplotlib pypdf==5.4.0 # via @@ -195,7 +195,7 @@ python-dateutil==2.9.0.post0 # pandas python-multipart==0.0.20 # via unstructured-inference -pytz==2025.1 +pytz==2025.2 # via pandas pyyaml==6.0.2 # via @@ -256,15 +256,15 @@ tqdm==4.67.1 # -c ./base.txt # huggingface-hub # transformers -transformers==4.49.0 +transformers==4.50.1 # via unstructured-inference -typing-extensions==4.12.2 +typing-extensions==4.13.0 # via # -c ./base.txt # huggingface-hub # pypdf # torch -tzdata==2025.1 +tzdata==2025.2 # via pandas unstructured-inference==0.8.10 # via -r ./extra-pdf-image.in diff --git a/requirements/extra-pptx.txt b/requirements/extra-pptx.txt index 41b37f70f0..30e77d1ce7 100644 --- a/requirements/extra-pptx.txt +++ b/requirements/extra-pptx.txt @@ -10,7 +10,7 @@ pillow==11.1.0 # via python-pptx python-pptx==1.0.2 # via -r ./extra-pptx.in 
-typing-extensions==4.12.2 +typing-extensions==4.13.0 # via python-pptx xlsxwriter==3.2.2 # via python-pptx diff --git a/requirements/extra-xlsx.txt b/requirements/extra-xlsx.txt index 895935708c..937191502d 100644 --- a/requirements/extra-xlsx.txt +++ b/requirements/extra-xlsx.txt @@ -20,13 +20,13 @@ python-dateutil==2.9.0.post0 # via # -c ./base.txt # pandas -pytz==2025.1 +pytz==2025.2 # via pandas six==1.17.0 # via # -c ./base.txt # python-dateutil -tzdata==2025.1 +tzdata==2025.2 # via pandas xlrd==2.0.1 # via -r ./extra-xlsx.in diff --git a/requirements/huggingface.txt b/requirements/huggingface.txt index 829a0448d4..f9e62f5266 100644 --- a/requirements/huggingface.txt +++ b/requirements/huggingface.txt @@ -96,9 +96,9 @@ tqdm==4.67.1 # huggingface-hub # sacremoses # transformers -transformers==4.49.0 +transformers==4.50.1 # via -r ./huggingface.in -typing-extensions==4.12.2 +typing-extensions==4.13.0 # via # -c ./base.txt # huggingface-hub diff --git a/requirements/test.in b/requirements/test.in index ca9d2d5bfe..e9b8fadbf8 100644 --- a/requirements/test.in +++ b/requirements/test.in @@ -6,7 +6,6 @@ types-click flake8 flake8-print freezegun -label_studio_sdk mypy pydantic pytest-cov @@ -15,7 +14,6 @@ ruff types-Markdown types-requests types-tabulate -vcrpy grpcio autoflake liccheck diff --git a/requirements/test.txt b/requirements/test.txt index b64b5d52f5..1ebccc8953 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -6,41 +6,21 @@ # annotated-types==0.7.0 # via pydantic -anyio==4.9.0 - # via - # -c ./base.txt - # httpx -appdirs==1.4.4 - # via label-studio-sdk -attrs==25.3.0 - # via jsonschema autoflake==2.3.1 # via -r ./test.in black==25.1.0 # via -r ./test.in -certifi==2025.1.31 - # via - # -c ./base.txt - # httpcore - # httpx - # requests -charset-normalizer==3.4.1 - # via - # -c ./base.txt - # requests click==8.1.8 # via # -c ./base.txt # black - # nltk -coverage[toml]==7.7.0 +coverage[toml]==7.7.1 # via # -r ./test.in # pytest-cov 
exceptiongroup==1.2.2 # via # -c ./base.txt - # anyio # pytest flake8==7.1.2 # via @@ -54,47 +34,12 @@ grpcio==1.71.0 # via # -c ././deps/constraints.txt # -r ./test.in -h11==0.14.0 - # via - # -c ./base.txt - # httpcore -httpcore==1.0.7 - # via - # -c ./base.txt - # httpx -httpx==0.28.1 - # via - # -c ./base.txt - # label-studio-sdk -idna==3.10 - # via - # -c ./base.txt - # anyio - # httpx - # requests - # yarl -ijson==3.3.0 - # via label-studio-sdk -iniconfig==2.0.0 +iniconfig==2.1.0 # via pytest -joblib==1.4.2 - # via - # -c ./base.txt - # nltk -jsonschema==3.2.0 - # via label-studio-sdk -label-studio-sdk==1.0.5 - # via -r ./test.in liccheck==0.9.2 # via -r ./test.in -lxml==5.3.1 - # via - # -c ./base.txt - # label-studio-sdk mccabe==0.7.0 # via flake8 -multidict==6.2.0 - # via yarl mypy==1.15.0 # via -r ./test.in mypy-extensions==1.0.0 @@ -102,47 +47,29 @@ mypy-extensions==1.0.0 # -c ./base.txt # black # mypy -nltk==3.9.1 - # via - # -c ./base.txt - # label-studio-sdk -numpy==2.0.2 - # via - # -c ./base.txt - # pandas packaging==24.2 # via # -c ./base.txt # black # pytest -pandas==2.2.3 - # via label-studio-sdk pathspec==0.12.1 # via black -pillow==11.1.0 - # via label-studio-sdk -platformdirs==4.3.6 +platformdirs==4.3.7 # via black pluggy==1.5.0 # via pytest -propcache==0.3.0 - # via yarl pycodestyle==2.12.1 # via # flake8 # flake8-print pydantic==2.10.6 - # via - # -r ./test.in - # label-studio-sdk + # via -r ./test.in pydantic-core==2.27.2 # via pydantic pyflakes==3.2.0 # via # autoflake # flake8 -pyrsistent==0.20.0 - # via jsonschema pytest==8.3.5 # via # pytest-cov @@ -155,35 +82,14 @@ python-dateutil==2.9.0.post0 # via # -c ./base.txt # freezegun - # pandas -pytz==2025.1 - # via pandas -pyyaml==6.0.2 - # via vcrpy -regex==2024.11.6 - # via - # -c ./base.txt - # nltk -requests==2.32.3 - # via - # -c ./base.txt - # label-studio-sdk - # requests-mock -requests-mock==1.12.1 - # via label-studio-sdk -ruff==0.11.0 +ruff==0.11.2 # via -r ./test.in 
semantic-version==2.10.0 # via liccheck six==1.17.0 # via # -c ./base.txt - # jsonschema # python-dateutil -sniffio==1.3.1 - # via - # -c ./base.txt - # anyio toml==0.10.2 # via liccheck tomli==2.2.1 @@ -193,13 +99,9 @@ tomli==2.2.1 # coverage # mypy # pytest -tqdm==4.67.1 - # via - # -c ./base.txt - # nltk types-click==7.1.8 # via -r ./test.in -types-markdown==3.7.0.20241204 +types-markdown==3.7.0.20250322 # via -r ./test.in types-requests==2.31.0.6 # via -r ./test.in @@ -207,36 +109,10 @@ types-tabulate==0.9.0.20241207 # via -r ./test.in types-urllib3==1.26.25.14 # via types-requests -typing-extensions==4.12.2 +typing-extensions==4.13.0 # via # -c ./base.txt - # anyio # black - # label-studio-sdk - # multidict # mypy # pydantic # pydantic-core -tzdata==2025.1 - # via pandas -ujson==5.10.0 - # via label-studio-sdk -urllib3==1.26.20 - # via - # -c ././deps/constraints.txt - # -c ./base.txt - # requests - # vcrpy -vcrpy==7.0.0 - # via -r ./test.in -wrapt==1.17.2 - # via - # -c ./base.txt - # vcrpy -xmljson==0.2.1 - # via label-studio-sdk -yarl==1.18.3 - # via vcrpy - -# The following packages are considered to be unsafe in a requirements file: -# setuptools diff --git a/test_unstructured/staging/test_label_studio.py b/test_unstructured/staging/test_label_studio.py index 6d3be972b7..11ca79d064 100644 --- a/test_unstructured/staging/test_label_studio.py +++ b/test_unstructured/staging/test_label_studio.py @@ -1,11 +1,6 @@ from __future__ import annotations -import logging -import re - import pytest -import vcr -from label_studio_sdk import Client from test_unstructured.unit_utils import assign_hash_ids from unstructured.documents.elements import Element, NarrativeText, Title @@ -17,62 +12,6 @@ def elements(): return [Title(text="Title 1"), NarrativeText(text="Narrative 1")] -@vcr.use_cassette( - "test_unstructured/vcr_fixtures/cassettes/label_studio_upload.yaml", - allow_playback_repeats=True, -) -def test_upload_label_studio_data_with_sdk( - caplog: 
pytest.LogCaptureFixture, elements: list[Element] -): - """ - Testing Instructions - ==================== - 1. Remove file `test_unstructured/vcr_fixtures/cassettes/label_studio_upload.yaml`, - which will be recreated later. - 2. Install the label-studio package by running command `pip install -U label-studio`. - 3. Run command `label-studio`, and login or set up label studio account on pop-up website. - 4. Update `LABEL_STUDIO_URL` and `API_KEY` below, you can find your API_KEY by - clicking into your account profile. - 5. Run this test once, and VCR will record the HTTP request to the yaml file. - 6. Kill the label studio instance and run the test again, VCR will replay the response. - """ - log = logging.getLogger("urllib3") - log.setLevel(logging.DEBUG) - # Define the URL where Label Studio is accessible - LABEL_STUDIO_URL = "http://localhost:8080" - # API_KEY is a temporary key from local install not actually valid anywhere - # Update it if the vcr cassette is updated with the API key from your user account - API_KEY = "7b613506d5afa062fe33c9cd825f106c718b82a0" - # Connect to the Label Studio API and check the connection - ls = Client(url=LABEL_STUDIO_URL, api_key=API_KEY) - ls.check_connection() - ls.delete_all_projects() - # Create a sample project to classify types of texts - project = ls.start_project( - title="Text Type Classifications", - label_config=""" - - - -
- - - - - - - """, - ) - label_studio_data = label_studio.stage_for_label_studio(elements) - project.import_tasks(label_studio_data) - # Check success status code (201) for posting tasks job in logger info - success_posting_tasks_status = re.compile(r"POST /api/projects/.*/import.*201") - assert bool(success_posting_tasks_status.search(caplog.text)) - - def test_convert_to_label_studio_data(elements: list[Element]): label_studio_data = label_studio.stage_for_label_studio(elements) diff --git a/test_unstructured/vcr_fixtures/cassettes/label_studio_upload.yaml b/test_unstructured/vcr_fixtures/cassettes/label_studio_upload.yaml deleted file mode 100644 index bf4f22255c..0000000000 --- a/test_unstructured/vcr_fixtures/cassettes/label_studio_upload.yaml +++ /dev/null @@ -1,414 +0,0 @@ -interactions: -- request: - body: null - headers: - Accept: - - '*/*' - Accept-Encoding: - - gzip, deflate - Authorization: - - Token 7b613506d5afa062fe33c9cd825f106c718b82a0 - Connection: - - keep-alive - User-Agent: - - python-requests/2.28.0 - method: GET - uri: http://localhost:8080/api/version - response: - body: - string: '{"release": "1.7.3", "label-studio-os-package": {"version": "1.7.3", - "short_version": "1.7", "latest_version_from_pypi": "1.7.3", "latest_version_upload_time": - "2023-04-19T12:05:18", "current_version_is_outdated": false}, "label-studio-os-backend": - {"message": "Merge pull request #2612 from laggardkernel/bugfix/realpath-in-version - ...", "commit": "fcd7806529ea60cf5e56c782345ced04659d018d", "date": "2023/02/06 - 20:09:22", "branch": "master", "version": "2.3.12+10.gfcd78065"}, "label-studio-frontend": - {"message": "fix: LSDV-4692: Brush segmentation is not supported", "commit": - "f08871a3e70026b12cad502552251db1fba1619e", "branch": "master", "date": "2023/03/29 - 14:40:33"}, "dm2": {"message": "fix: LSDV-4746-1: Only include limited fields - for project when polling", "commit": "9aa96a97e9bcb4154838249dc721efbc724198b7", - "branch": "master", "date": 
"2023/03/13 15:43:21"}, "label-studio-converter": - {"version": "0.0.51"}}' - headers: - Content-Language: - - en-us - Content-Length: - - '924' - Content-Type: - - application/json - Date: - - Thu, 01 Jun 2023 21:17:59 GMT - Referrer-Policy: - - same-origin - Server: - - WSGIServer/0.2 CPython/3.8.15 - Set-Cookie: - - sessionid=eyJ1aWQiOiI0MzJmMWRjMC01MGNkLTQyMGEtYjgyYy0wM2JlMjEzOTNlMzYiLCJvcmdhbml6YXRpb25fcGsiOjF9:1q4pgF:YW6N1NblXlgyM_81UyYNBkcxIWjokDRdWetCeeQfDgA; - expires=Thu, 15 Jun 2023 21:17:59 GMT; HttpOnly; Max-Age=1209600; Path=/; - SameSite=Lax - Vary: - - Accept-Language, Cookie, Origin - X-Content-Type-Options: - - nosniff - status: - code: 200 - message: OK -- request: - body: null - headers: - Accept: - - '*/*' - Accept-Encoding: - - gzip, deflate - Authorization: - - Token 7b613506d5afa062fe33c9cd825f106c718b82a0 - Connection: - - keep-alive - Cookie: - - sessionid=eyJ1aWQiOiI0MzJmMWRjMC01MGNkLTQyMGEtYjgyYy0wM2JlMjEzOTNlMzYiLCJvcmdhbml6YXRpb25fcGsiOjF9:1q4pgF:YW6N1NblXlgyM_81UyYNBkcxIWjokDRdWetCeeQfDgA - User-Agent: - - python-requests/2.28.0 - method: GET - uri: http://localhost:8080/health - response: - body: - string: '{"status": "UP"}' - headers: - Content-Language: - - en-us - Content-Length: - - '16' - Content-Type: - - text/html; charset=utf-8 - Date: - - Thu, 01 Jun 2023 21:18:00 GMT - Referrer-Policy: - - same-origin - Server: - - WSGIServer/0.2 CPython/3.8.15 - Set-Cookie: - - sessionid=eyJ1aWQiOiI0MzJmMWRjMC01MGNkLTQyMGEtYjgyYy0wM2JlMjEzOTNlMzYiLCJvcmdhbml6YXRpb25fcGsiOjF9:1q4pgG:wG1DT2Iz8ZHJlPxwIMum_NVMweQyXE7bbbbiX0tNCuQ; - expires=Thu, 15 Jun 2023 21:18:00 GMT; HttpOnly; Max-Age=1209600; Path=/; - SameSite=Lax - Vary: - - Accept-Language, Cookie, Origin - X-Content-Type-Options: - - nosniff - status: - code: 200 - message: OK -- request: - body: null - headers: - Accept: - - '*/*' - Accept-Encoding: - - gzip, deflate - Authorization: - - Token 7b613506d5afa062fe33c9cd825f106c718b82a0 - Connection: - - keep-alive - Cookie: - - 
sessionid=eyJ1aWQiOiI0MzJmMWRjMC01MGNkLTQyMGEtYjgyYy0wM2JlMjEzOTNlMzYiLCJvcmdhbml6YXRpb25fcGsiOjF9:1q4pgG:wG1DT2Iz8ZHJlPxwIMum_NVMweQyXE7bbbbiX0tNCuQ - User-Agent: - - python-requests/2.28.0 - method: GET - uri: http://localhost:8080/api/projects?page_size=10000000 - response: - body: - string: '{"count":1,"next":null,"previous":null,"results":[{"id":23,"title":"Text - Type Classifications","description":"","label_config":"\n \n \n
\n \n \n \n \n \n ","expert_instruction":"","show_instruction":false,"show_skip_button":true,"enable_empty_annotation":true,"show_annotation_history":false,"organization":1,"color":"#FFFFFF","maximum_annotations":1,"is_published":false,"model_version":"","is_draft":false,"created_by":{"id":2,"first_name":"","last_name":"","email":"johnjennings.tutor@gmail.com","avatar":null},"created_at":"2023-06-01T18:31:12.795409Z","min_annotations_to_start_training":0,"start_training_on_annotation_update":false,"show_collab_predictions":true,"num_tasks_with_annotations":0,"task_number":2,"useful_annotation_number":0,"ground_truth_number":0,"skipped_annotations_number":0,"total_annotations_number":0,"total_predictions_number":0,"sampling":"Sequential - sampling","show_ground_truth_first":false,"show_overlap_first":false,"overlap_cohort_percentage":100,"task_data_login":null,"task_data_password":null,"control_weights":{"type":{"overall":1.0,"type":"Choices","labels":{"Title":1.0,"Narrative":1.0}}},"parsed_label_config":{"type":{"type":"Choices","to_name":["text"],"inputs":[{"type":"Text","value":"text"}],"labels":["Title","Narrative"],"labels_attrs":{"Title":{"value":"Title"},"Narrative":{"value":"Narrative"}}}},"evaluate_predictions_automatically":false,"config_has_control_tags":true,"skip_queue":"REQUEUE_FOR_OTHERS","reveal_preannotations_interactively":false,"pinned_at":null,"finished_task_number":0}]}' - headers: - Allow: - - GET, POST, HEAD, OPTIONS - Content-Language: - - en-us - Content-Length: - - '2033' - Content-Type: - - application/json - Date: - - Thu, 01 Jun 2023 21:18:01 GMT - Referrer-Policy: - - same-origin - Server: - - WSGIServer/0.2 CPython/3.8.15 - Set-Cookie: - - sessionid=eyJ1aWQiOiI0MzJmMWRjMC01MGNkLTQyMGEtYjgyYy0wM2JlMjEzOTNlMzYiLCJvcmdhbml6YXRpb25fcGsiOjF9:1q4pgH:WtvRIVQBmnyfp8bWanOG78K14WIsHWPSqq2yt6C8FYU; - expires=Thu, 15 Jun 2023 21:18:01 GMT; HttpOnly; Max-Age=1209600; Path=/; - SameSite=Lax - Vary: - - Accept-Language, Cookie, Origin - 
X-Content-Type-Options: - - nosniff - status: - code: 200 - message: OK -- request: - body: null - headers: - Accept: - - '*/*' - Accept-Encoding: - - gzip, deflate - Authorization: - - Token 7b613506d5afa062fe33c9cd825f106c718b82a0 - Connection: - - keep-alive - Cookie: - - sessionid=eyJ1aWQiOiI0MzJmMWRjMC01MGNkLTQyMGEtYjgyYy0wM2JlMjEzOTNlMzYiLCJvcmdhbml6YXRpb25fcGsiOjF9:1q4pgH:WtvRIVQBmnyfp8bWanOG78K14WIsHWPSqq2yt6C8FYU - User-Agent: - - python-requests/2.28.0 - method: GET - uri: http://localhost:8080/api/projects/23 - response: - body: - string: '{"id":23,"title":"Text Type Classifications","description":"","label_config":"\n \n \n
\n \n \n \n \n \n ","expert_instruction":"","show_instruction":false,"show_skip_button":true,"enable_empty_annotation":true,"show_annotation_history":false,"organization":1,"color":"#FFFFFF","maximum_annotations":1,"is_published":false,"model_version":"","is_draft":false,"created_by":{"id":2,"first_name":"","last_name":"","email":"johnjennings.tutor@gmail.com","avatar":null},"created_at":"2023-06-01T18:31:12.795409Z","min_annotations_to_start_training":0,"start_training_on_annotation_update":false,"show_collab_predictions":true,"num_tasks_with_annotations":0,"task_number":2,"useful_annotation_number":0,"ground_truth_number":0,"skipped_annotations_number":0,"total_annotations_number":0,"total_predictions_number":0,"sampling":"Sequential - sampling","show_ground_truth_first":false,"show_overlap_first":false,"overlap_cohort_percentage":100,"task_data_login":null,"task_data_password":null,"control_weights":{"type":{"overall":1.0,"type":"Choices","labels":{"Title":1.0,"Narrative":1.0}}},"parsed_label_config":{"type":{"type":"Choices","to_name":["text"],"inputs":[{"type":"Text","value":"text"}],"labels":["Title","Narrative"],"labels_attrs":{"Title":{"value":"Title"},"Narrative":{"value":"Narrative"}}}},"evaluate_predictions_automatically":false,"config_has_control_tags":true,"skip_queue":"REQUEUE_FOR_OTHERS","reveal_preannotations_interactively":false,"pinned_at":null,"finished_task_number":0}' - headers: - Allow: - - GET, PUT, PATCH, DELETE, HEAD, OPTIONS - Content-Language: - - en-us - Content-Length: - - '1981' - Content-Type: - - application/json - Date: - - Thu, 01 Jun 2023 21:18:01 GMT - Referrer-Policy: - - same-origin - Server: - - WSGIServer/0.2 CPython/3.8.15 - Set-Cookie: - - sessionid=eyJ1aWQiOiI0MzJmMWRjMC01MGNkLTQyMGEtYjgyYy0wM2JlMjEzOTNlMzYiLCJvcmdhbml6YXRpb25fcGsiOjF9:1q4pgH:WtvRIVQBmnyfp8bWanOG78K14WIsHWPSqq2yt6C8FYU; - expires=Thu, 15 Jun 2023 21:18:01 GMT; HttpOnly; Max-Age=1209600; Path=/; - SameSite=Lax - Vary: - - Accept-Language, Cookie, Origin - 
X-Content-Type-Options: - - nosniff - status: - code: 200 - message: OK -- request: - body: null - headers: - Accept: - - '*/*' - Accept-Encoding: - - gzip, deflate - Authorization: - - Token 7b613506d5afa062fe33c9cd825f106c718b82a0 - Connection: - - keep-alive - Cookie: - - sessionid=eyJ1aWQiOiI0MzJmMWRjMC01MGNkLTQyMGEtYjgyYy0wM2JlMjEzOTNlMzYiLCJvcmdhbml6YXRpb25fcGsiOjF9:1q4pgH:WtvRIVQBmnyfp8bWanOG78K14WIsHWPSqq2yt6C8FYU - User-Agent: - - python-requests/2.28.0 - method: GET - uri: http://localhost:8080/api/projects/23 - response: - body: - string: '{"id":23,"title":"Text Type Classifications","description":"","label_config":"\n \n \n
\n \n \n \n \n \n ","expert_instruction":"","show_instruction":false,"show_skip_button":true,"enable_empty_annotation":true,"show_annotation_history":false,"organization":1,"color":"#FFFFFF","maximum_annotations":1,"is_published":false,"model_version":"","is_draft":false,"created_by":{"id":2,"first_name":"","last_name":"","email":"johnjennings.tutor@gmail.com","avatar":null},"created_at":"2023-06-01T18:31:12.795409Z","min_annotations_to_start_training":0,"start_training_on_annotation_update":false,"show_collab_predictions":true,"num_tasks_with_annotations":0,"task_number":2,"useful_annotation_number":0,"ground_truth_number":0,"skipped_annotations_number":0,"total_annotations_number":0,"total_predictions_number":0,"sampling":"Sequential - sampling","show_ground_truth_first":false,"show_overlap_first":false,"overlap_cohort_percentage":100,"task_data_login":null,"task_data_password":null,"control_weights":{"type":{"overall":1.0,"type":"Choices","labels":{"Title":1.0,"Narrative":1.0}}},"parsed_label_config":{"type":{"type":"Choices","to_name":["text"],"inputs":[{"type":"Text","value":"text"}],"labels":["Title","Narrative"],"labels_attrs":{"Title":{"value":"Title"},"Narrative":{"value":"Narrative"}}}},"evaluate_predictions_automatically":false,"config_has_control_tags":true,"skip_queue":"REQUEUE_FOR_OTHERS","reveal_preannotations_interactively":false,"pinned_at":null,"finished_task_number":0}' - headers: - Allow: - - GET, PUT, PATCH, DELETE, HEAD, OPTIONS - Content-Language: - - en-us - Content-Length: - - '1981' - Content-Type: - - application/json - Date: - - Thu, 01 Jun 2023 21:18:01 GMT - Referrer-Policy: - - same-origin - Server: - - WSGIServer/0.2 CPython/3.8.15 - Set-Cookie: - - sessionid=eyJ1aWQiOiI0MzJmMWRjMC01MGNkLTQyMGEtYjgyYy0wM2JlMjEzOTNlMzYiLCJvcmdhbml6YXRpb25fcGsiOjF9:1q4pgH:WtvRIVQBmnyfp8bWanOG78K14WIsHWPSqq2yt6C8FYU; - expires=Thu, 15 Jun 2023 21:18:01 GMT; HttpOnly; Max-Age=1209600; Path=/; - SameSite=Lax - Vary: - - Accept-Language, Cookie, Origin - 
X-Content-Type-Options: - - nosniff - status: - code: 200 - message: OK -- request: - body: null - headers: - Accept: - - '*/*' - Accept-Encoding: - - gzip, deflate - Authorization: - - Token 7b613506d5afa062fe33c9cd825f106c718b82a0 - Connection: - - keep-alive - Content-Length: - - '0' - Cookie: - - sessionid=eyJ1aWQiOiI0MzJmMWRjMC01MGNkLTQyMGEtYjgyYy0wM2JlMjEzOTNlMzYiLCJvcmdhbml6YXRpb25fcGsiOjF9:1q4pgH:WtvRIVQBmnyfp8bWanOG78K14WIsHWPSqq2yt6C8FYU - User-Agent: - - python-requests/2.28.0 - method: DELETE - uri: http://localhost:8080/api/projects/23/ - response: - body: - string: '' - headers: - Allow: - - GET, PUT, PATCH, DELETE, HEAD, OPTIONS - Content-Language: - - en-us - Content-Length: - - '0' - Date: - - Thu, 01 Jun 2023 21:18:01 GMT - Referrer-Policy: - - same-origin - Server: - - WSGIServer/0.2 CPython/3.8.15 - Set-Cookie: - - sessionid=eyJ1aWQiOiI0MzJmMWRjMC01MGNkLTQyMGEtYjgyYy0wM2JlMjEzOTNlMzYiLCJvcmdhbml6YXRpb25fcGsiOjF9:1q4pgH:WtvRIVQBmnyfp8bWanOG78K14WIsHWPSqq2yt6C8FYU; - expires=Thu, 15 Jun 2023 21:18:01 GMT; HttpOnly; Max-Age=1209600; Path=/; - SameSite=Lax - Vary: - - Accept-Language, Cookie, Origin - X-Content-Type-Options: - - nosniff - status: - code: 204 - message: No Content -- request: - body: '{"title": "Text Type Classifications", "label_config": "\n \n \n \n
\n \n \n \n \n \n \n "}' - headers: - Accept: - - '*/*' - Accept-Encoding: - - gzip, deflate - Authorization: - - Token 7b613506d5afa062fe33c9cd825f106c718b82a0 - Connection: - - keep-alive - Content-Length: - - '591' - Content-Type: - - application/json - Cookie: - - sessionid=eyJ1aWQiOiI0MzJmMWRjMC01MGNkLTQyMGEtYjgyYy0wM2JlMjEzOTNlMzYiLCJvcmdhbml6YXRpb25fcGsiOjF9:1q4pgH:WtvRIVQBmnyfp8bWanOG78K14WIsHWPSqq2yt6C8FYU - User-Agent: - - python-requests/2.28.0 - method: POST - uri: http://localhost:8080/api/projects - response: - body: - string: '{"id":24,"title":"Text Type Classifications","description":"","label_config":"\n \n \n
\n \n \n \n \n \n ","expert_instruction":"","show_instruction":false,"show_skip_button":true,"enable_empty_annotation":true,"show_annotation_history":false,"organization":1,"color":"#FFFFFF","maximum_annotations":1,"is_published":false,"model_version":"","is_draft":false,"created_by":{"id":2,"first_name":"","last_name":"","email":"johnjennings.tutor@gmail.com","avatar":null},"created_at":"2023-06-01T21:18:01.964955Z","min_annotations_to_start_training":0,"start_training_on_annotation_update":false,"show_collab_predictions":true,"num_tasks_with_annotations":null,"task_number":null,"useful_annotation_number":null,"ground_truth_number":null,"skipped_annotations_number":null,"total_annotations_number":null,"total_predictions_number":null,"sampling":"Sequential - sampling","show_ground_truth_first":false,"show_overlap_first":false,"overlap_cohort_percentage":100,"task_data_login":null,"task_data_password":null,"control_weights":{"type":{"overall":1.0,"type":"Choices","labels":{"Title":1.0,"Narrative":1.0}}},"parsed_label_config":{"type":{"type":"Choices","to_name":["text"],"inputs":[{"type":"Text","value":"text"}],"labels":["Title","Narrative"],"labels_attrs":{"Title":{"value":"Title"},"Narrative":{"value":"Narrative"}}}},"evaluate_predictions_automatically":false,"config_has_control_tags":true,"skip_queue":"REQUEUE_FOR_OTHERS","reveal_preannotations_interactively":false,"pinned_at":null,"finished_task_number":null}' - headers: - Allow: - - GET, POST, HEAD, OPTIONS - Content-Language: - - en-us - Content-Length: - - '2005' - Content-Type: - - application/json - Date: - - Thu, 01 Jun 2023 21:18:02 GMT - Referrer-Policy: - - same-origin - Server: - - WSGIServer/0.2 CPython/3.8.15 - Set-Cookie: - - sessionid=eyJ1aWQiOiI0MzJmMWRjMC01MGNkLTQyMGEtYjgyYy0wM2JlMjEzOTNlMzYiLCJvcmdhbml6YXRpb25fcGsiOjF9:1q4pgI:Y2nBj16y8Buj0irVpJeFn0fNguq_rXv9BmdK5o64fsw; - expires=Thu, 15 Jun 2023 21:18:02 GMT; HttpOnly; Max-Age=1209600; Path=/; - SameSite=Lax - Vary: - - Accept-Language, Cookie, 
Origin - X-Content-Type-Options: - - nosniff - status: - code: 201 - message: Created -- request: - body: '[{"data": {"text": "Title 1", "ref_id": "ab03af41c2940e7584b62df48a964db3"}}, - {"data": {"text": "Narrative 1", "ref_id": "ff9eb806beb1f483322f6fbda680b08b"}}]' - headers: - Accept: - - '*/*' - Accept-Encoding: - - gzip, deflate - Authorization: - - Token 7b613506d5afa062fe33c9cd825f106c718b82a0 - Connection: - - keep-alive - Content-Length: - - '158' - Content-Type: - - application/json - Cookie: - - sessionid=eyJ1aWQiOiI0MzJmMWRjMC01MGNkLTQyMGEtYjgyYy0wM2JlMjEzOTNlMzYiLCJvcmdhbml6YXRpb25fcGsiOjF9:1q4pgI:Y2nBj16y8Buj0irVpJeFn0fNguq_rXv9BmdK5o64fsw - User-Agent: - - python-requests/2.28.0 - method: POST - uri: http://localhost:8080/api/projects/24/import?return_task_ids=1 - response: - body: - string: '{"task_count":2,"annotation_count":0,"prediction_count":0,"duration":0.1579442024230957,"file_upload_ids":[],"could_be_tasks_list":false,"found_formats":[],"data_columns":[],"task_ids":[1,2]}' - headers: - Allow: - - POST, OPTIONS - Content-Language: - - en-us - Content-Length: - - '191' - Content-Type: - - application/json - Date: - - Thu, 01 Jun 2023 21:18:02 GMT - Referrer-Policy: - - same-origin - Server: - - WSGIServer/0.2 CPython/3.8.15 - Set-Cookie: - - sessionid=eyJ1aWQiOiI0MzJmMWRjMC01MGNkLTQyMGEtYjgyYy0wM2JlMjEzOTNlMzYiLCJvcmdhbml6YXRpb25fcGsiOjF9:1q4pgI:Y2nBj16y8Buj0irVpJeFn0fNguq_rXv9BmdK5o64fsw; - expires=Thu, 15 Jun 2023 21:18:02 GMT; HttpOnly; Max-Age=1209600; Path=/; - SameSite=Lax - Vary: - - Accept-Language, Cookie, Origin - X-Content-Type-Options: - - nosniff - status: - code: 201 - message: Created -version: 1 diff --git a/unstructured/__version__.py b/unstructured/__version__.py index af66a65e41..033f217a2c 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.17.3" # pragma: no cover +__version__ = "0.17.4" # pragma: no cover diff --git a/unstructured/staging/label_studio.py 
b/unstructured/staging/label_studio.py index 407edcf386..bdb3989e03 100644 --- a/unstructured/staging/label_studio.py +++ b/unstructured/staging/label_studio.py @@ -3,6 +3,7 @@ from typing import Any, Dict, List, Optional, Union from unstructured.documents.elements import Element +from unstructured.logger import logger LABEL_STUDIO_TYPE = List[Dict[str, Dict[str, str]]] @@ -118,6 +119,11 @@ def stage_for_label_studio( ) -> LABEL_STUDIO_TYPE: """Converts the document to the format required for upload to LabelStudio. ref: https://labelstud.io/guide/tasks.html#Example-JSON-format""" + # NOTE(alan): The background for this is that we test this function with the package + # label_studio_sdk, and we're stuck on a version with a high CVE unless we drop to version 1 of + # numpy. The least bad way forward was to deprecate the function, remove the test, and drop the + # dependency. + logger.warning("This function is deprecated, and is unlikely to be maintained in the future.") if annotations is not None and len(elements) != len(annotations): raise ValueError("The length of elements and annotations must match.") if predictions is not None and len(elements) != len(predictions): From 9a239fa18b5bc55bfc81029160c00b0265b9d4b0 Mon Sep 17 00:00:00 2001 From: qued <64741807+qued@users.noreply.github.com> Date: Thu, 27 Mar 2025 13:41:11 -0500 Subject: [PATCH 04/15] build: remove test and dev deps from docker image (#3969) Removed the dependencies contained in `test.txt`, `dev.txt`, and `constraints.txt` from the things that get installed in the docker image. In order to keep testing the image (running the tests), I added a step to the `docker-test` make target to install `test.txt` and `dev.txt`. Thus we presumably get a smaller image (probably not much smaller), reduce the dependency chain of our images, and have less exposure to vulnerabilities while still testing as robustly as before.
Incidentally, I removed the `Dockerfile` for our ubuntu image, since it made reference to non-existent make targets, which tells me it's stale and wasn't being used. ### Review: - Reviewer should ensure the dev and test dependencies are not being installed in the docker image. One way to check is to check the logs in CI, and note, e.g. that [this](https://github.com/Unstructured-IO/unstructured/actions/runs/14112971425/job/39536304012#step:3:1700) is the first reference to `pytest` in the docker build and test logs, after the image build is completed. - Reviewer should ensure docker image is still being tested in CI and is passing. --- CHANGELOG.md | 10 ++++++++++ Dockerfile | 2 +- Makefile | 3 ++- docker/rockylinux-9.2/Dockerfile | 2 +- docker/ubuntu-22/Dockerfile | 26 -------------------------- unstructured/__version__.py | 2 +- 6 files changed, 15 insertions(+), 30 deletions(-) delete mode 100644 docker/ubuntu-22/Dockerfile diff --git a/CHANGELOG.md b/CHANGELOG.md index 875f098612..17cb66d3a6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,13 @@ +## 0.17.5 + +### Enhancements +- **Remove test and dev dependencies from docker image.** This reduces the docker image size slightly and reduces potential security vulnerabilities. + +### Features + +### Fixes +- **Removed out of date ubuntu Dockerfile.** The Dockerfile was out of date and non-functional. + ## 0.17.4 ### Enhancements diff --git a/Dockerfile b/Dockerfile index 69b96d3e67..e4d7ebd5be 100644 --- a/Dockerfile +++ b/Dockerfile @@ -25,7 +25,7 @@ ENV TESSDATA_PREFIX=/usr/local/share/tessdata ENV NLTK_DATA=/home/notebook-user/nltk_data # Install Python dependencies and download required NLTK packages -RUN find requirements/ -type f -name "*.txt" -exec $PIP install --no-cache-dir --user -r '{}' ';' && \ +RUN find requirements/ -type f -name "*.txt" ! -name "test.txt" ! -name "dev.txt" ! 
-name "constraints.txt" -exec $PIP install --no-cache-dir --user -r '{}' ';' && \ mkdir -p ${NLTK_DATA} && \ $PYTHON -m nltk.downloader -d ${NLTK_DATA} punkt_tab averaged_perceptron_tagger_eng && \ $PYTHON -c "from unstructured.partition.model_init import initialize; initialize()" && \ diff --git a/Makefile b/Makefile index c5208c365c..80600a051a 100644 --- a/Makefile +++ b/Makefile @@ -310,7 +310,8 @@ docker-test: -v ${CURRENT_DIR}/test_unstructured_ingest:/home/notebook-user/test_unstructured_ingest \ $(if $(wildcard uns_test_env_file),--env-file uns_test_env_file,) \ $(DOCKER_IMAGE) \ - bash -c "CI=$(CI) \ + bash -c "pip install -r requirements/test.txt -r requirements/dev.txt && \ + CI=$(CI) \ UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) \ python3 -m pytest $(if $(TEST_FILE),$(TEST_FILE),test_unstructured)" diff --git a/docker/rockylinux-9.2/Dockerfile b/docker/rockylinux-9.2/Dockerfile index 3bce864e37..051294dc96 100644 --- a/docker/rockylinux-9.2/Dockerfile +++ b/docker/rockylinux-9.2/Dockerfile @@ -22,7 +22,7 @@ COPY requirements requirements RUN python3.10 -m pip install pip==${PIP_VERSION} && \ dnf -y groupinstall "Development Tools" && \ - find requirements/ -type f -name "*.txt" -exec python3 -m pip install --no-cache -r '{}' ';' && \ + find requirements/ -type f -name "*.txt" ! -name "test.txt" ! -name "dev.txt" ! 
-name "constraints.txt" -exec python3 -m pip install --no-cache -r '{}' ';' && \ dnf -y groupremove "Development Tools" && \ dnf clean all diff --git a/docker/ubuntu-22/Dockerfile b/docker/ubuntu-22/Dockerfile deleted file mode 100644 index 059bfc85bb..0000000000 --- a/docker/ubuntu-22/Dockerfile +++ /dev/null @@ -1,26 +0,0 @@ -# Dockerfile that approximates the CI image -# -# Mainly useful for updating test-ingest fixtures - -FROM ubuntu:22.04 - -COPY scripts/setup_ubuntu.sh scripts/setup_ubuntu.sh - -RUN bash scripts/setup_ubuntu.sh root - -COPY requirements/ requirements/ -COPY Makefile Makefile - -SHELL ["/bin/bash", "-c"] - -RUN source ~/.bashrc && pyenv virtualenv 3.10 unstructured && \ - source ~/.pyenv/versions/unstructured/bin/activate && \ - make install-ci && \ - make install-ingest-s3 && \ - make install-ingest-azure && \ - make install-ingest-github && \ - make install-ingest-gitlab && \ - make install-ingest-wikipedia && \ - make install-ingest-discord && \ - make install install-ingest-slack && \ - make install-ingest-confluence diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 033f217a2c..b243ca7861 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.17.4" # pragma: no cover +__version__ = "0.17.5" # pragma: no cover From 19fc1fcc72c30dc3618da23dab9bc1c4da79e15a Mon Sep 17 00:00:00 2001 From: cragwolfe Date: Mon, 31 Mar 2025 09:45:01 -0700 Subject: [PATCH 05/15] feat: convenience unstructured-get-json.sh update (#3971) * script now supports: * the --vlm flag, to process the document with the VLM strategy * optionally takes --vlm-model, --vlm-provider args * optionally also writes .html outputs by converting unstructured .json output * optionally opens those .html outputs in a browser Tested with: ``` unstructured-get-json.sh --write-html --open-html --fast layout-parser-paper-p2.pdf unstructured-get-json.sh --write-html --open-html --hi-res layout-parser-paper-p2.pdf 
unstructured-get-json.sh --write-html --open-html --ocr-only layout-parser-paper-p2.pdf unstructured-get-json.sh --write-html --open-html --vlm layout-parser-paper-p2.pdf unstructured-get-json.sh --write-html --open-html --vlm --vlm-provider openai --vlm-model gpt-4o layout-parser-paper-p2.pdf unstructured-get-json.sh --write-html --open-html --vlm --vlm-provider vertexai --vlm-model gemini-2.0-flash-001 layout-parser-paper-p2.pdf unstructured-get-json.sh --write-html --open-html --vlm --vlm-provider anthropic --vlm-model claude-3-5-sonnet-20241022 layout-parser-paper-p2.pdf ``` [layout-parser-paper-p2.pdf](https://github.com/user-attachments/files/19514007/layout-parser-paper-p2.pdf) --- .gitignore | 3 +- CHANGELOG.md | 8 ++ scripts/user/unstructured-get-json.sh | 118 ++++++++++++++++++++++++++ unstructured/__version__.py | 2 +- 4 files changed, 129 insertions(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index e8e4471465..87f4fc72bd 100644 --- a/.gitignore +++ b/.gitignore @@ -208,4 +208,5 @@ outputhtmldiff.txt metricsdiff.txt # analysis -annotated/ \ No newline at end of file +annotated/ +.aider* diff --git a/CHANGELOG.md b/CHANGELOG.md index 17cb66d3a6..ad3afdfc3e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,11 @@ +## 0.17.6-dev0 + +### Enhancements + +### Features + +### Fixes + ## 0.17.5 ### Enhancements diff --git a/scripts/user/unstructured-get-json.sh b/scripts/user/unstructured-get-json.sh index 74ea031390..2ef0ac4eff 100755 --- a/scripts/user/unstructured-get-json.sh +++ b/scripts/user/unstructured-get-json.sh @@ -16,12 +16,20 @@ Options: --hi-res hi_res strategy: Enable high-resolution processing, with layout segmentation and OCR --fast fast strategy: No OCR, just extract embedded text --ocr-only ocr_only strategy: Perform OCR (Optical Character Recognition) only. No layout segmentation. 
+ --vlm vlm strategy: Use Vision Language Model for processing + --vlm-provider Specify the VLM model provider + (see: https://docs.unstructured.io/api-reference/workflow/workflows#vlm-strategy) + --vlm-model Specify the VLM model when using + (see: https://docs.unstructured.io/api-reference/workflow/workflows#vlm-strategy) --tables Enable table extraction: tables are represented as html in metadata --images Include base64images in json --coordinates Include coordinates in the output --trace Enable trace logging for debugging, useful to cut and paste the executed curl call --verbose Enable verbose logging including printing first 8 elements to stdout --s3 Write the resulting output to s3 (like a pastebin) + --write-html Convert JSON output to HTML. Set the env var $UNST_WRITE_HTML to skip providing this option. + --open-html Automatically open HTML output in browser (macOS only) if --write-html. + Set the env var UNST_AUTO_OPEN_HTML=true to skip providing this option. --help Display this help and exit. 
@@ -64,6 +72,7 @@ copy_to_clipboard() { HI_RES=false FAST=false OCR_ONLY=false +VLM=false STRATEGY="" VERBOSE=false TRACE=false @@ -72,6 +81,10 @@ FREEMIUM=false TABLES=true IMAGES=false S3="" +WRITE_HTML=${UNST_WRITE_HTML:-false} +OPEN_HTML=${UNST_AUTO_OPEN_HTML:-false} +VLM_PROVIDER="" +VLM_MODEL="" while [[ "$#" -gt 0 ]]; do case "$1" in @@ -87,6 +100,28 @@ while [[ "$#" -gt 0 ]]; do OCR_ONLY=true shift ;; + --vlm) + VLM=true + shift + ;; + --vlm-provider) + if [ -n "$2" ] && [ "${2:0:1}" != "-" ]; then + VLM_PROVIDER=$2 + shift 2 + else + echo "Error: Argument for $1 is missing" >&2 + exit 1 + fi + ;; + --vlm-model) + if [ -n "$2" ] && [ "${2:0:1}" != "-" ]; then + VLM_MODEL=$2 + shift 2 + else + echo "Error: Argument for $1 is missing" >&2 + exit 1 + fi + ;; --trace) TRACE=true shift @@ -99,6 +134,14 @@ while [[ "$#" -gt 0 ]]; do S3=true shift ;; + --write-html) + WRITE_HTML=true + shift + ;; + --open-html) + OPEN_HTML=true + shift + ;; --tables) TABLES=true shift @@ -140,6 +183,24 @@ if [ -z "$INPUT" ]; then exit 1 fi +# Check for strategy conflicts after all arguments are processed +STRATEGY_COUNT=0 +$HI_RES && STRATEGY_COUNT=$((STRATEGY_COUNT + 1)) +$FAST && STRATEGY_COUNT=$((STRATEGY_COUNT + 1)) +$OCR_ONLY && STRATEGY_COUNT=$((STRATEGY_COUNT + 1)) +$VLM && STRATEGY_COUNT=$((STRATEGY_COUNT + 1)) + +if [ "$STRATEGY_COUNT" -gt 1 ]; then + echo "Error: Only one strategy option (--hi-res, --fast, --ocr-only, --vlm) can be specified at a time." + exit 1 +fi + +# Check if vlm-provider or vlm-model are provided without --vlm +if { [ -n "$VLM_PROVIDER" ] || [ -n "$VLM_MODEL" ]; } && ! $VLM; then + echo "Error: --vlm-provider or --vlm-model can only be used with --vlm strategy." 
+ exit 1 +fi + if $TRACE; then set -x fi @@ -175,6 +236,25 @@ elif $OCR_ONLY; then STRATEGY="-ocr-only" JSON_OUTPUT_FILEPATH=${TMP_OUTPUTS_DIR}/${FILENAME}${STRATEGY}.json CURL_STRATEGY=(-F "strategy=ocr_only") +elif $VLM; then + if $VERBOSE; then echo "Sending API request with vlm strategy"; fi + STRATEGY="-vlm" + # Add provider and model to filename if specified + if [ -n "$VLM_PROVIDER" ] && [ -n "$VLM_MODEL" ]; then + STRATEGY="-vlm-${VLM_PROVIDER}-${VLM_MODEL}" + elif [ -n "$VLM_PROVIDER" ]; then + STRATEGY="-vlm-${VLM_PROVIDER}" + elif [ -n "$VLM_MODEL" ]; then + STRATEGY="-vlm-model-${VLM_MODEL}" + fi + JSON_OUTPUT_FILEPATH=${TMP_OUTPUTS_DIR}/${FILENAME}${STRATEGY}.json + CURL_STRATEGY=(-F "strategy=vlm") + if [ -n "$VLM_PROVIDER" ]; then + CURL_STRATEGY+=(-F "vlm_model_provider=$VLM_PROVIDER") + fi + if [ -n "$VLM_MODEL" ]; then + CURL_STRATEGY+=(-F "vlm_model=$VLM_MODEL") + fi else if $VERBOSE; then echo "Sending API request WITHOUT a strategy"; fi JSON_OUTPUT_FILEPATH=${TMP_OUTPUTS_DIR}/${FILENAME}${STRATEGY}.json @@ -213,6 +293,44 @@ else fi echo "JSON Output file: ${JSON_OUTPUT_FILEPATH}" +# Convert JSON to HTML if requested +if [ "$WRITE_HTML" = true ]; then + HTML_OUTPUT_FILEPATH=${JSON_OUTPUT_FILEPATH%.json}.html + + if $VLM; then + # VLM output has all metadata.text_as_html fields defined, so + # create HTML directly from the metadata.text_as_html fields + { + echo "" + echo "" + echo "" + echo " " + echo " " + echo " Codestin Search App" + echo " " + echo "" + echo "" + jq -r 'map(.metadata.text_as_html) | join("\n")' "${JSON_OUTPUT_FILEPATH}" + echo "" + echo "" + } >"${HTML_OUTPUT_FILEPATH}" + echo "HTML written directly from metadata.text_as_html fields to: ${HTML_OUTPUT_FILEPATH}" + else + # most elements will not have metadata.text_as_html defined (by design on Table elements do), + # so use the unstructured library's python script for the conversion. 
+ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + PYTHONPATH="${SCRIPT_DIR}/../.." python3 "${SCRIPT_DIR}/../html/elements_json_to_html.py" "${JSON_OUTPUT_FILEPATH}" --outdir "${TMP_OUTPUTS_DIR}" + echo "HTML written using Python script to: ${HTML_OUTPUT_FILEPATH}" + fi + + # Open HTML file in browser if requested and on macOS + if [ "$OPEN_HTML" = true ] && [ "$(uname)" == "Darwin" ]; then + open "${HTML_OUTPUT_FILEPATH}" + fi +fi + # write .json output to s3 location if [ -n "$S3" ]; then diff --git a/unstructured/__version__.py b/unstructured/__version__.py index b243ca7861..db302d22ce 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.17.5" # pragma: no cover +__version__ = "0.17.6-dev0" # pragma: no cover From c6b8ed4290891b997a40f4477151a3353753d07e Mon Sep 17 00:00:00 2001 From: cragwolfe Date: Mon, 31 Mar 2025 22:18:57 -0700 Subject: [PATCH 06/15] chore: allow changing default output dir for unstructured-get-json.sh (#3973) --- scripts/user/unstructured-get-json.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/user/unstructured-get-json.sh b/scripts/user/unstructured-get-json.sh index 2ef0ac4eff..4fb21263a3 100755 --- a/scripts/user/unstructured-get-json.sh +++ b/scripts/user/unstructured-get-json.sh @@ -50,8 +50,8 @@ fi IMAGE_BLOCK_TYPES=${IMAGE_BLOCK_TYPES:-'"image", "table"'} API_KEY=${UNST_API_KEY:-""} -TMP_DOWNLOADS_DIR="$HOME/tmp/unst-downloads" -TMP_OUTPUTS_DIR="$HOME/tmp/unst-outputs" +TMP_DOWNLOADS_DIR=${UNST_SCRIPT_DOWNLOADS_DIR:-"$HOME/tmp/unst-downloads"} +TMP_OUTPUTS_DIR=${UNST_SCRIPT_JSON_OUTPUTS_DIR:-"$HOME/tmp/unst-outputs"} # only applicable if writing .json output files to S3 when using --s3, e.g. s3://bucket-name/path/ S3_URI_PREFIX=${UNST_S3_JSON_OUTPUT_URI:-""} # e.g. 
us-east-2, used to provide http links for above location From 8fc41811eb1d425a772b028b1cb01a4d6c90a788 Mon Sep 17 00:00:00 2001 From: cragwolfe Date: Thu, 3 Apr 2025 15:42:25 -0700 Subject: [PATCH 07/15] chore: add html path to ingest-test-fixtures-update-pr (#3977) This should allow the `Ingest Test Fixtures Update PR` workflow to also update expected html outputs. E.g., before the change, the .html files would be left unmodified: ![image](https://github.com/user-attachments/assets/fa14c1a5-39bd-4e32-b4b9-9552eb312de1) https://github.com/Unstructured-IO/unstructured/actions/runs/14234877547/job/39892334672 --- .github/workflows/ingest-test-fixtures-update-pr.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/ingest-test-fixtures-update-pr.yml b/.github/workflows/ingest-test-fixtures-update-pr.yml index 317f46ec0c..33402ae260 100644 --- a/.github/workflows/ingest-test-fixtures-update-pr.yml +++ b/.github/workflows/ingest-test-fixtures-update-pr.yml @@ -139,6 +139,7 @@ jobs: token: ${{ secrets.GH_CREATE_PR_TOKEN }} add-paths: | test_unstructured_ingest/expected-structured-output + test_unstructured_ingest/expected-structured-output-html test_unstructured_ingest/metrics commit-message: "Update ingest test fixtures" branch: ${{ env.BRANCH_NAME }} From dfa17bd3a0c476dce571b8b493dd2ff80ddaebc1 Mon Sep 17 00:00:00 2001 From: cragwolfe Date: Fri, 4 Apr 2025 14:38:23 -0700 Subject: [PATCH 08/15] fix: hi_res PDF parsing: only uncategorized text for extracted elements (#3975) --- CHANGELOG.md | 3 +- .../partition/pdf_image/test_pdf.py | 4 +- test_unstructured/partition/test_msg.py | 2 +- .../biomed-api/65/11/main.PMC6312790.pdf.html | 30 +++---- .../biomed-api/75/29/main.PMC6312793.pdf.html | 28 +++--- .../07/07/sbaa031.073.PMC7234218.pdf.html | 4 +- .../recalibrating-risk-report.pdf.html | 86 +++++++++---------- .../layout-parser-paper-with-table.jpg.html | 4 +- .../layout-parser-paper.pdf.html | 54 ++++++------ 
.../biomed-api/65/11/main.PMC6312790.pdf.json | 20 ++--- .../biomed-api/75/29/main.PMC6312793.pdf.json | 18 ++-- .../07/07/sbaa031.073.PMC7234218.pdf.json | 2 +- .../recalibrating-risk-report.pdf.json | 44 +++++----- .../layout-parser-paper-with-table.jpg.json | 2 +- .../layout-parser-paper.pdf.json | 30 +++---- unstructured/__version__.py | 2 +- unstructured/partition/pdf.py | 5 +- 17 files changed, 171 insertions(+), 167 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ad3afdfc3e..baa69aae9f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,10 +1,11 @@ -## 0.17.6-dev0 +## 0.17.6-dev1 ### Enhancements ### Features ### Fixes +- **Do not use NLP to determine element types for extracted elements with hi_res.** This avoids extraneous Title elements in hi_res outputs. This only applies to *extracted* elements, meaning text objects that are found outside of Object Detection objects which get mapped to *inferred* elements. (*extracted* and *inferred* elements get merged together to form the list of `Element`s returned by `pdf_partition()`) ## 0.17.5 diff --git a/test_unstructured/partition/pdf_image/test_pdf.py b/test_unstructured/partition/pdf_image/test_pdf.py index 6d1145eb80..7a0c8ff29c 100644 --- a/test_unstructured/partition/pdf_image/test_pdf.py +++ b/test_unstructured/partition/pdf_image/test_pdf.py @@ -823,8 +823,8 @@ def test_partition_categorization_backup(): example_doc_path("pdf/layout-parser-paper-fast.pdf"), strategy=PartitionStrategy.HI_RES, ) - # Should have changed the element class from Text to Title - assert isinstance(elements[0], Title) + # Should NOT have changed the element class from Text to Title + assert isinstance(elements[0], Text) assert elements[0].text == text diff --git a/test_unstructured/partition/test_msg.py b/test_unstructured/partition/test_msg.py index d1d66876ed..94b12d5578 100644 --- a/test_unstructured/partition/test_msg.py +++ b/test_unstructured/partition/test_msg.py @@ -141,7 +141,7 @@ def 
test_partition_msg_can_process_attachments(): "Text", "Text", "Image", - "Title", + "Text", "Text", "Title", "Title", diff --git a/test_unstructured_ingest/expected-structured-output-html/biomed-api/65/11/main.PMC6312790.pdf.html b/test_unstructured_ingest/expected-structured-output-html/biomed-api/65/11/main.PMC6312790.pdf.html index a55cccdbbd..210109c06e 100644 --- a/test_unstructured_ingest/expected-structured-output-html/biomed-api/65/11/main.PMC6312790.pdf.html +++ b/test_unstructured_ingest/expected-structured-output-html/biomed-api/65/11/main.PMC6312790.pdf.html @@ -14,9 +14,9 @@

Contents lists available at ScienceDirect

-

+

Data in Brief -

+

journal homepage: www.elsevier.com/locate/dib

@@ -28,19 +28,19 @@

Data on environmental sustainable corrosion inhibitor for stainless steel in aggressive environment

-

+

(Jee -

+

Omotayo Sanni n, Abimbola Patricia I. Popoola

Department of Chemical, Metallurgical and Materials Engineering, Tshwane University of Technology, Pretoria, South Africa

-

+

a r t i c l e i n f o

-

+

a b s t r a c t

@@ -88,19 +88,19 @@

Value of the data

-

+

© Data presented here provide optimum conditions of waste material as inhibitor for stainless steel

  • Type 316 in 0.5 M H2SO4 medium. The given data describe the inhibitive performance of eco-friendly egg shell powder on austenitic stainless steel Type 316 corrosion in sulphuric acid environment.
  • -

    +

    © The data obtained for the inhibition of waste product (egg shell powder) on stainless steel Type 316

  • can be used as basis in determining the inhibitive performance of the same inhibitor in other environments.
  • -

    +

    © The data can be used to examine the relationship between the process variable as it affect the

  • @@ -152,9 +152,9 @@

    Inhibitor be (V/dec) ba (V/dec) Ecorr (V) icorr (A/cm?) Polarization Corrosion concentration (g) resistance (Q) rate (mm/year) oO 0.0335 0.0409 —0.9393 0.0003 24.0910 2.8163 2 1.9460 0.0596 —0.8276 0.0002 121.440 1.5054 4 0.0163 0.2369 —0.8825 0.0001 42.121 0.9476 6 0.3233 0.0540 —0.8027 5.39E-05 373.180 0.4318 8 0.1240 0.0556 —0.5896 5.46E-05 305.650 0.3772 10 0.0382 0.0086 —0.5356 1.24E-05 246.080 0.0919
    -

    +

    rate (mm/year) -

    +

    The plot of inhibitor concentration over degree of surface coverage versus inhibitor concentration gives a straight line as shown in Fig. 5. The strong correlation reveals that egg shell adsorption on stainless surface in 0.5 M H2SO4 follow Langmuir adsorption isotherm. Figs. 6–8 show the SEM/EDX surface morphology analysis of stainless steel. Figs. 7 and 8 are the SEM/EDX images of the stainless steel specimens without and with inhibitor after weight loss experiment in sulphuric acid medium. The stainless steel surface corrosion product layer in the absence of inhibitor was porous and as a result gives no corrosion protection. With the presence of ES, corrosion damage was minimized, with an evidence of ES present on the metal surface as shown in Fig. 8.

    @@ -232,12 +232,12 @@

    The potentiodynamic polarization method was performed on the prepared test samples immersed in 0.5 M H2SO4 solution in the presence and absence of different ES concentrations. A three electrode system was used; stainless steel Type 316 plate as working electrode with an exposed area of 1.0 cm2, platinum rod as counter electrode and silver chloride electrode as reference electrode. The electrode was polished, degreased in acetone and thoroughly rinsed with distilled water before the experiment. Current density against applied potential was plotted. The slope of the linear part in anodic and cathodic plots gives anodic and cathodic constants according to the Stern–Geary equation, and the

    -

    +

    ð2Þ -

    -

    +

    +

    ð3Þ -

    +

    O. Sanni, A.P.I. Popoola / Data in Brief 22 (2019) 451–457
    diff --git a/test_unstructured_ingest/expected-structured-output-html/biomed-api/75/29/main.PMC6312793.pdf.html b/test_unstructured_ingest/expected-structured-output-html/biomed-api/75/29/main.PMC6312793.pdf.html index bb95afd2b2..aabc7233cc 100644 --- a/test_unstructured_ingest/expected-structured-output-html/biomed-api/75/29/main.PMC6312793.pdf.html +++ b/test_unstructured_ingest/expected-structured-output-html/biomed-api/75/29/main.PMC6312793.pdf.html @@ -14,9 +14,9 @@

    Contents lists available at ScienceDirect

    -

    +

    Data in Brief -

    +

    journal homepage: www.elsevier.com/locate/dib

    @@ -28,9 +28,9 @@

    A benchmark dataset for the multiple depot vehicle scheduling problem

    -

    +

    (eee -

    +

    Sarang Kulkarni a,b,c,n, Mohan Krishnamoorthy d,e, Abhiram Ranade f, Andreas T. Ernst c, Rahul Patil b

    @@ -52,16 +52,16 @@

    e School of Information Technology and Electrical Engineering, The University of Queensland, QLD 4072,

    -

    +

    Australia -

    +

    f Department of Computer Science and Engineering, IIT Bombay, Powai, Mumbai 400076, India

    -

    +

    a r t i c l e i n f o

    -

    +

    a b s t r a c t

    @@ -106,13 +106,13 @@

  • © The data provide all the information that is required to model the MDVSP by using the existing mathematical formulations.
  • -

    +

    e All the problem instances are available for use without any restrictions.

  • e The benchmark solutions and solution time for the problem instances are presented in [3] and can be used for the comparison.
  • -

    +

    © The dataset includes a program that can generate similar problem instances of different sizes.

    @@ -121,9 +121,9 @@

    The dataset contains 60 different problem instances of the multiple depot vehicle scheduling pro- blem (MDVSP). Each problem instance is provided in a separate file. Each file is named as ‘RN-m-n-k.dat’, where ‘m’, ‘n’, and ‘k’ denote the number of depots, the number of trips, and the instance number for the size, ‘ðm;nÞ’, respectively. For example, the problem instance, ‘RN-8–1500-01.dat’, is the first problem instance with 8 depots and 1500 trips. For the number of depots, m, we used three values, 8,12, and 16. The four values for the number of trips, n, are 1500, 2000, 2500, and 3000. For each size, ðm;nÞ, five instances are provided. The dataset can be downloaded from https://orlib.uqcloud.net. For each problem instance, the following information is provided:

    -

    +

    The number of depots mð -

    +

    Þ,

    @@ -187,9 +187,9 @@

    Instance size (m, n) Average number of Locations Times Vehicles (8, 1500) 568.40 975.20 652.20 668,279.40 (8, 2000) 672.80 1048.00 857.20 1,195,844.80 (8, 2500) 923.40 1078.00 1082.40 1,866,175.20 (8, 3000) 977.00 1113.20 1272.80 2,705,617.00 (12, 1500) 566.00 994.00 642.00 674,191.00 (12, 2000) 732.60 1040.60 861.20 1,199,659.80 (12, 2500) 875.00 1081.00 1096.00 1,878,745.20 (12, 3000) 1119.60 1107.40 1286.20 2,711,180.40 (16, 1500) 581.80 985.40 667.80 673,585.80 (16, 2000) 778.00 1040.60 872.40 1,200,560.80 (16, 2500) 879.00 1083.20 1076.40 1,879,387.00 (16, 3000) 1087.20 1101.60 1284.60 2,684,983.60
    -

    +

    Possible empty travels -

    +

    S. Kulkarni et al. / Data in Brief 22 (2019) 484–487
    diff --git a/test_unstructured_ingest/expected-structured-output-html/biomed-path/07/07/sbaa031.073.PMC7234218.pdf.html b/test_unstructured_ingest/expected-structured-output-html/biomed-path/07/07/sbaa031.073.PMC7234218.pdf.html index 0862a71a27..eabce53c29 100644 --- a/test_unstructured_ingest/expected-structured-output-html/biomed-path/07/07/sbaa031.073.PMC7234218.pdf.html +++ b/test_unstructured_ingest/expected-structured-output-html/biomed-path/07/07/sbaa031.073.PMC7234218.pdf.html @@ -76,8 +76,8 @@

    Camila Loureiro*1, Corsi-Zuelli Fabiana1, Fachim Helene Aparecida1, Shuhama Rosana1, Menezes Paulo Rossi1, Dalton Caroline F2,

    -

    +

    AQ3 -

    +

    diff --git a/test_unstructured_ingest/expected-structured-output-html/google-drive/recalibrating-risk-report.pdf.html b/test_unstructured_ingest/expected-structured-output-html/google-drive/recalibrating-risk-report.pdf.html index c17be23f5a..517f7a3608 100644 --- a/test_unstructured_ingest/expected-structured-output-html/google-drive/recalibrating-risk-report.pdf.html +++ b/test_unstructured_ingest/expected-structured-output-html/google-drive/recalibrating-risk-report.pdf.html @@ -11,7 +11,7 @@

    WORLD ASSOCIATION

    -

    +

    Recalibrating risk

    @@ -89,69 +89,69 @@

    In terms of accidents, hydropower is the deadliest electricity generator, mostly due to collapsing dams and the consequences of flooding. The Banqiao Dam failure in 1975 led to at least 26,000 people drowning, and as many as 150,000 deaths resulting from the secondary effects of the accident. In comparison, radiation exposure following Chernobyl caused 54 deaths2, while no casualties due to radiation are likely to occur from the accident at Fukushima Daiichi.

    25  24.6  20  18.4  e  15  10  5  4.6  2.8  0  Coal  Oil  Bio m ass  Natural gas  0.07  Wind  0.04  Hydropower  0.02  Solar  0.01  Nuclear -

    +

    r -

    -

    +

    +

    a -

    -

    +

    +

    e -

    -

    +

    +

    y -

    -

    +

    +

    W -

    -

    +

    +

    T -

    -

    +

    +

    r -

    -

    +

    +

    e -

    -

    +

    +

    p -

    -

    +

    +

    s -

    +

    8

    -

    +

    e -

    -

    +

    +

    i -

    -

    +

    +

    t -

    -

    +

    +

    i -

    -

    +

    +

    l -

    -

    +

    +

    S -

    -

    +

    +

    a -

    -

    +

    +

    t -

    -

    +

    +

    a -

    -

    +

    +

    F -

    +

    Figure 3. Comparison of number of fatalities due to electricity generation, including accidents and air pollution3

    @@ -251,9 +251,9 @@

  • World Health Organization (2020). Road traffic injuries. Available at: https://www.who.int/news-room/fact-sheets/ detail/road-traffic-injuries
  • -

    +

    i -

    +

  • ii BBC (2020). Plane crash fatalities fell more than 50% in 2019. Available at: https://www.bbc.co.uk/news/ business-50953712
  • diff --git a/test_unstructured_ingest/expected-structured-output-html/local-single-file-with-pdf-infer-table-structure/layout-parser-paper-with-table.jpg.html b/test_unstructured_ingest/expected-structured-output-html/local-single-file-with-pdf-infer-table-structure/layout-parser-paper-with-table.jpg.html index ccc0784c71..dbf342486a 100644 --- a/test_unstructured_ingest/expected-structured-output-html/local-single-file-with-pdf-infer-table-structure/layout-parser-paper-with-table.jpg.html +++ b/test_unstructured_ingest/expected-structured-output-html/local-single-file-with-pdf-infer-table-structure/layout-parser-paper-with-table.jpg.html @@ -114,9 +114,9 @@

  • import layoutparser as lp
  • -

    +

    wwe -

    +

  • image = cv2.imread("image_file") # load images
  • diff --git a/test_unstructured_ingest/expected-structured-output-html/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.html b/test_unstructured_ingest/expected-structured-output-html/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.html index 84e2672182..eca4025c8d 100644 --- a/test_unstructured_ingest/expected-structured-output-html/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.html +++ b/test_unstructured_ingest/expected-structured-output-html/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.html @@ -22,24 +22,24 @@
    2 n u J 1 2 ] V C . s c [ 2 v 8 4 3 5 1 . 3 0 1 2 :
    -

    +

    v -

    -

    +

    +

    arXiv -

    -

    +

    +

    i -

    -

    +

    +

    X -

    -

    +

    +

    r -

    -

    +

    +

    a -

    +

    LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis

    @@ -115,28 +115,28 @@

    -

    +

    7 https://ocr-d.de/en/about -

    -

    +

    +

    8 https://github.com/BobLd/DocumentLayoutAnalysis -

    -

    +

    +

    9 https://github.com/leonlulu/DeepLayout -

    -

    +

    +

    10 https://github.com/hpanwar08/detectron2 -

    -

    +

    +

    11 https://github.com/JaidedAI/EasyOCR -

    -

    +

    +

    12 https://github.com/PaddlePaddle/PaddleOCR -

    +

    4

    -

    +

    Z. Shen et al.

    Efficient Data Annotation Model Customization Document Images Community Platform ‘a >) ¥ DIA Model Hub i .) Customized Model Training] == | Layout Detection Models | ——= DIA Pipeline Sharing ~ OCR Module = { Layout Data stuctue ) = (storage Visualization VY @@ -263,7 +263,7 @@

    6

    -

    +

    Z. Shen et al.

    - ° . 3 a a 4 a 3 oo er ‘ 2 § 8 a 8 3 3 ‘ £ 4 A g a 9 ‘ 3 ¥ Coordinate g 4 5 3 + § 3 H Extra Features [O=") [Bo] eaing i Text | | Type | | ower ° & a ¢ o [ coordinatel textblock1, 3 3 ’ g Q 3 , textblock2 , layoutl ] 4 q ® A list of the layout elements Ff @@ -303,7 +303,7 @@

    LayoutParser also comes with a DL-based CNN-RNN OCR model [6] trained with the Connectionist Temporal Classification (CTC) loss [10]. It can be used like the other OCR modules, and can be easily trained on customized datasets.

    -

    +

    13 This is also available in the LayoutParser documentation pages.

  • diff --git a/test_unstructured_ingest/expected-structured-output/biomed-api/65/11/main.PMC6312790.pdf.json b/test_unstructured_ingest/expected-structured-output/biomed-api/65/11/main.PMC6312790.pdf.json index 6f6c30b2a8..c26c406734 100644 --- a/test_unstructured_ingest/expected-structured-output/biomed-api/65/11/main.PMC6312790.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/biomed-api/65/11/main.PMC6312790.pdf.json @@ -63,7 +63,7 @@ "page_number": 1 }, "text": "Data in Brief", - "type": "Title" + "type": "UncategorizedText" }, { "element_id": "97e80c6e7dc2754c9083b263ff65039e", @@ -148,7 +148,7 @@ "page_number": 1 }, "text": "(Jee", - "type": "Title" + "type": "UncategorizedText" }, { "element_id": "bddd1cbc864e9b44cc0715a1cccf8dbc", @@ -187,7 +187,7 @@ "page_number": 1 }, "text": "a r t i c l e i n f o", - "type": "NarrativeText" + "type": "UncategorizedText" }, { "element_id": "b9e48f235de5b531427187eb6ea135fe", @@ -200,7 +200,7 @@ "page_number": 1 }, "text": "a b s t r a c t", - "type": "NarrativeText" + "type": "UncategorizedText" }, { "element_id": "911bfead9b546998812e2d1d615ecc87", @@ -432,7 +432,7 @@ "page_number": 2 }, "text": "© Data presented here provide optimum conditions of waste material as inhibitor for stainless steel", - "type": "NarrativeText" + "type": "UncategorizedText" }, { "element_id": "afed004de4c50d761640b6c18729a988", @@ -458,7 +458,7 @@ "page_number": 2 }, "text": "© The data obtained for the inhibition of waste product (egg shell powder) on stainless steel Type 316", - "type": "NarrativeText" + "type": "UncategorizedText" }, { "element_id": "cb6e8acb9c24820b59f8973cc236ef35", @@ -484,7 +484,7 @@ "page_number": 2 }, "text": "© The data can be used to examine the relationship between the process variable as it affect the", - "type": "NarrativeText" + "type": "UncategorizedText" }, { "element_id": "e1f7e635d8739a97d8d0000ba8004f61", @@ -744,7 +744,7 @@ "page_number": 4 }, "text": "rate (mm/year)", - "type": "Title" 
+ "type": "UncategorizedText" }, { "element_id": "3a5534c2aafc2d8a4c0b65d530d00ab3", @@ -1134,7 +1134,7 @@ "page_number": 6 }, "text": "ð2Þ", - "type": "Title" + "type": "UncategorizedText" }, { "element_id": "cff55ae1916232dbda5239f59c897cb9", @@ -1147,7 +1147,7 @@ "page_number": 6 }, "text": "ð3Þ", - "type": "Title" + "type": "UncategorizedText" }, { "element_id": "e40c3ee561b10ca5b7a76900c8d5b263", diff --git a/test_unstructured_ingest/expected-structured-output/biomed-api/75/29/main.PMC6312793.pdf.json b/test_unstructured_ingest/expected-structured-output/biomed-api/75/29/main.PMC6312793.pdf.json index 1fab6122c1..17e0923127 100644 --- a/test_unstructured_ingest/expected-structured-output/biomed-api/75/29/main.PMC6312793.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/biomed-api/75/29/main.PMC6312793.pdf.json @@ -63,7 +63,7 @@ "page_number": 1 }, "text": "Data in Brief", - "type": "Title" + "type": "UncategorizedText" }, { "element_id": "c1b3d4f53698b892fcc23fc10a72e6fb", @@ -148,7 +148,7 @@ "page_number": 1 }, "text": "(eee", - "type": "Title" + "type": "UncategorizedText" }, { "element_id": "0cda4eb20070fdf01ec0d47b2a550241", @@ -252,7 +252,7 @@ "page_number": 1 }, "text": "Australia", - "type": "Title" + "type": "UncategorizedText" }, { "element_id": "85875ebbc1de554e92edc54674add1d5", @@ -278,7 +278,7 @@ "page_number": 1 }, "text": "a r t i c l e i n f o", - "type": "NarrativeText" + "type": "UncategorizedText" }, { "element_id": "4f3f69dd17ddae776c656ec73d9837ae", @@ -291,7 +291,7 @@ "page_number": 1 }, "text": "a b s t r a c t", - "type": "NarrativeText" + "type": "UncategorizedText" }, { "element_id": "34522460857b10c63d8c2c8d2fbb3087", @@ -534,7 +534,7 @@ "page_number": 2 }, "text": "e All the problem instances are available for use without any restrictions.", - "type": "NarrativeText" + "type": "UncategorizedText" }, { "element_id": "d401597b8ff2854bfb89f2833d02a763", @@ -560,7 +560,7 @@ "page_number": 2 }, "text": "© The dataset 
includes a program that can generate similar problem instances of different sizes.", - "type": "NarrativeText" + "type": "UncategorizedText" }, { "element_id": "fb765d6762e6a423cb8b9dab27359732", @@ -606,7 +606,7 @@ "page_number": 2 }, "text": "The number of depots mð", - "type": "Title" + "type": "UncategorizedText" }, { "element_id": "320f6d28582c354d35673c2a4119851f", @@ -892,7 +892,7 @@ "page_number": 3 }, "text": "Possible empty travels", - "type": "Title" + "type": "UncategorizedText" }, { "element_id": "fa23407a7c3c99ae3b6fb79034698807", diff --git a/test_unstructured_ingest/expected-structured-output/biomed-path/07/07/sbaa031.073.PMC7234218.pdf.json b/test_unstructured_ingest/expected-structured-output/biomed-path/07/07/sbaa031.073.PMC7234218.pdf.json index 3641fcd434..67cd5fb088 100644 --- a/test_unstructured_ingest/expected-structured-output/biomed-path/07/07/sbaa031.073.PMC7234218.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/biomed-path/07/07/sbaa031.073.PMC7234218.pdf.json @@ -309,6 +309,6 @@ "page_number": 1 }, "text": "AQ3", - "type": "Title" + "type": "UncategorizedText" } ] \ No newline at end of file diff --git a/test_unstructured_ingest/expected-structured-output/google-drive/recalibrating-risk-report.pdf.json b/test_unstructured_ingest/expected-structured-output/google-drive/recalibrating-risk-report.pdf.json index 49e17cb5fc..6e7d6aa5f3 100644 --- a/test_unstructured_ingest/expected-structured-output/google-drive/recalibrating-risk-report.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/google-drive/recalibrating-risk-report.pdf.json @@ -186,7 +186,7 @@ } }, { - "type": "NarrativeText", + "type": "UncategorizedText", "element_id": "7137c1e14141fad3ad306fe68918a967", "text": "Recalibrating risk", "metadata": { @@ -2790,7 +2790,7 @@ } }, { - "type": "Title", + "type": "UncategorizedText", "element_id": "a8706e82b3f90cffc996a24348e3b670", "text": "r", "metadata": { @@ -2883,7 +2883,7 @@ } }, { - "type": 
"Title", + "type": "UncategorizedText", "element_id": "da631c23500655c51b9311a61f55744f", "text": "a", "metadata": { @@ -2976,7 +2976,7 @@ } }, { - "type": "Title", + "type": "UncategorizedText", "element_id": "d78a11e9e55235934c3a4922053c68e5", "text": "e", "metadata": { @@ -3069,7 +3069,7 @@ } }, { - "type": "Title", + "type": "UncategorizedText", "element_id": "8d14df8b7fd7744365fbf8e02d69415a", "text": "y", "metadata": { @@ -3162,7 +3162,7 @@ } }, { - "type": "Title", + "type": "UncategorizedText", "element_id": "f4df01bee1b8ffb973ac8539649c5189", "text": "W", "metadata": { @@ -3255,7 +3255,7 @@ } }, { - "type": "Title", + "type": "UncategorizedText", "element_id": "b733cf49de269e22bed7c9883b958669", "text": "T", "metadata": { @@ -3348,7 +3348,7 @@ } }, { - "type": "Title", + "type": "UncategorizedText", "element_id": "c4b47d788b26c3d5c62ad462ed3ca2db", "text": "r", "metadata": { @@ -3441,7 +3441,7 @@ } }, { - "type": "Title", + "type": "UncategorizedText", "element_id": "bff4435574259239761670b31432cc8a", "text": "e", "metadata": { @@ -3534,7 +3534,7 @@ } }, { - "type": "Title", + "type": "UncategorizedText", "element_id": "8ba15a3a71eb0bb689c582098cce6730", "text": "p", "metadata": { @@ -3627,7 +3627,7 @@ } }, { - "type": "Title", + "type": "UncategorizedText", "element_id": "5fde097ba00ad7647206ae11c721d28c", "text": "s", "metadata": { @@ -3813,7 +3813,7 @@ } }, { - "type": "Title", + "type": "UncategorizedText", "element_id": "81f1f3b9da6df38d938bf7871fa069b5", "text": "e", "metadata": { @@ -3906,7 +3906,7 @@ } }, { - "type": "Title", + "type": "UncategorizedText", "element_id": "aa4a79651a9a0087b66fcc40a2213113", "text": "i", "metadata": { @@ -3999,7 +3999,7 @@ } }, { - "type": "Title", + "type": "UncategorizedText", "element_id": "6d1c0d05d3a424b43d9572188a76c2d4", "text": "t", "metadata": { @@ -4092,7 +4092,7 @@ } }, { - "type": "Title", + "type": "UncategorizedText", "element_id": "392a17b2f3eba46f4bcf078e0b204514", "text": "i", "metadata": { @@ -4185,7 
+4185,7 @@ } }, { - "type": "Title", + "type": "UncategorizedText", "element_id": "d24a9a771e46fdd6b269f1ecaf0b5eec", "text": "l", "metadata": { @@ -4278,7 +4278,7 @@ } }, { - "type": "Title", + "type": "UncategorizedText", "element_id": "9dc4537afa8ae0b959a542f9ba5c1e03", "text": "S", "metadata": { @@ -4371,7 +4371,7 @@ } }, { - "type": "Title", + "type": "UncategorizedText", "element_id": "919dac2487a4c860747318a132a54a72", "text": "a", "metadata": { @@ -4464,7 +4464,7 @@ } }, { - "type": "Title", + "type": "UncategorizedText", "element_id": "04ee5d05c3fcfffd945762e803478600", "text": "t", "metadata": { @@ -4557,7 +4557,7 @@ } }, { - "type": "Title", + "type": "UncategorizedText", "element_id": "63dabde368e2cf310d20a885fe50314a", "text": "a", "metadata": { @@ -4650,7 +4650,7 @@ } }, { - "type": "Title", + "type": "UncategorizedText", "element_id": "796538927664e4d87312c428469428f5", "text": "F", "metadata": { @@ -8184,7 +8184,7 @@ } }, { - "type": "Title", + "type": "UncategorizedText", "element_id": "a95a2add68d668b944cc332c88ea721e", "text": "i", "metadata": { diff --git a/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper-with-table.jpg.json b/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper-with-table.jpg.json index 147e62d128..c71cf50967 100644 --- a/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper-with-table.jpg.json +++ b/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper-with-table.jpg.json @@ -177,7 +177,7 @@ } }, { - "type": "Title", + "type": "UncategorizedText", "element_id": "9d40bf1b2e2af1692f5689a1c44ab2ae", "text": "wwe", "metadata": { diff --git 
a/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json b/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json index b9d9f35d17..3f42ca335d 100644 --- a/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json @@ -110,7 +110,7 @@ } }, { - "type": "Title", + "type": "UncategorizedText", "element_id": "4608f9aa33a0cab158565817b0d15743", "text": "v", "metadata": { @@ -132,7 +132,7 @@ } }, { - "type": "Title", + "type": "UncategorizedText", "element_id": "6f69e5f921907e689f1a52bd84282b31", "text": "arXiv", "metadata": { @@ -154,7 +154,7 @@ } }, { - "type": "Title", + "type": "UncategorizedText", "element_id": "ed4e590932b333f40d0e1367b6b0e32e", "text": "i", "metadata": { @@ -176,7 +176,7 @@ } }, { - "type": "Title", + "type": "UncategorizedText", "element_id": "8cb024fb60457b7c572b167801037f75", "text": "X", "metadata": { @@ -198,7 +198,7 @@ } }, { - "type": "Title", + "type": "UncategorizedText", "element_id": "c202bdacd2daf4c52fa3a6ddd64a0728", "text": "r", "metadata": { @@ -220,7 +220,7 @@ } }, { - "type": "Title", + "type": "UncategorizedText", "element_id": "3db474893ec321c81ef9d1a2afd5f660", "text": "a", "metadata": { @@ -1022,7 +1022,7 @@ } }, { - "type": "Title", + "type": "UncategorizedText", "element_id": "db639db124b6064248de0c0dc71510a4", "text": "7 https://ocr-d.de/en/about", "metadata": { @@ -1044,7 +1044,7 @@ } }, { - "type": "Title", + "type": "UncategorizedText", "element_id": "d881ce84f017d89f6e35e2bc4b133bfc", "text": "8 https://github.com/BobLd/DocumentLayoutAnalysis", "metadata": { @@ -1066,7 +1066,7 @@ } }, { - "type": "Title", + "type": "UncategorizedText", "element_id": 
"9b96c128deddda1a32c739a2df157496", "text": "9 https://github.com/leonlulu/DeepLayout", "metadata": { @@ -1088,7 +1088,7 @@ } }, { - "type": "Title", + "type": "UncategorizedText", "element_id": "5cf72e821375f4480a1529bef97608ef", "text": "10 https://github.com/hpanwar08/detectron2", "metadata": { @@ -1110,7 +1110,7 @@ } }, { - "type": "Title", + "type": "UncategorizedText", "element_id": "4ab94e79eedc3a7ac498aaf737ca8878", "text": "11 https://github.com/JaidedAI/EasyOCR", "metadata": { @@ -1132,7 +1132,7 @@ } }, { - "type": "Title", + "type": "UncategorizedText", "element_id": "460b163c13ad7cad4fce325820a76481", "text": "12 https://github.com/PaddlePaddle/PaddleOCR", "metadata": { @@ -1176,7 +1176,7 @@ } }, { - "type": "NarrativeText", + "type": "UncategorizedText", "element_id": "92c4289ad4af7c0793e40d5662707e0a", "text": "Z. Shen et al.", "metadata": { @@ -1739,7 +1739,7 @@ } }, { - "type": "NarrativeText", + "type": "UncategorizedText", "element_id": "710ac103981c6363195774b02ee582d4", "text": "Z. 
Shen et al.", "metadata": { @@ -2083,7 +2083,7 @@ } }, { - "type": "NarrativeText", + "type": "UncategorizedText", "element_id": "a2a0a2ef0279f0710f3cd34474ca8645", "text": "13 This is also available in the LayoutParser documentation pages.", "metadata": { diff --git a/unstructured/__version__.py b/unstructured/__version__.py index db302d22ce..1c6678160c 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.17.6-dev0" # pragma: no cover +__version__ = "0.17.6-dev1" # pragma: no cover diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py index e0e64854d4..d38658ed64 100644 --- a/unstructured/partition/pdf.py +++ b/unstructured/partition/pdf.py @@ -362,7 +362,10 @@ def partition_pdf_or_image( table_ocr_agent=table_ocr_agent, **kwargs, ) - out_elements = _process_uncategorized_text_elements(elements) + # NOTE(crag): do not call _process_uncategorized_text_elements here, because + # extracted elements (which are text blocks outside of OD-determined blocks) + # are likely not Titles and should not be identified as such. + return elements elif strategy == PartitionStrategy.FAST: out_elements = _partition_pdf_with_pdfparser( From d570f4624bb8c5dc75f0009775925fdaa40defb0 Mon Sep 17 00:00:00 2001 From: Philippe PRADOS Date: Mon, 7 Apr 2025 17:57:20 +0200 Subject: [PATCH 09/15] Fix sort_page_element. ensures that sorting is stable and not random. (#3978) The sort_page_element() use the element id to sort the elements. Two executions of the same code, on the same file, produce different results. The order of the elements is random. This makes it impossible to write stable unit tests, for example, or to obtain reproducible results. 
--- CHANGELOG.md | 3 +++ .../partition/pdf_image/test_pdf.py | 21 +++++++++++++++++++ unstructured/partition/utils/sorting.py | 1 - 3 files changed, 24 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index baa69aae9f..4da58bbc9e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,9 @@ ### Features ### Fixes +- The sort_page_element() use the element id to sort the elements. +Two executions of the same code, on the same file, produce different results. The order of the elements is random. +This makes it impossible to write stable unit tests, for example, or to obtain reproducible results. - **Do not use NLP to determine element types for extracted elements with hi_res.** This avoids extraneous Title elements in hi_res outputs. This only applies to *extracted* elements, meaning text objects that are found outside of Object Detection objects which get mapped to *inferred* elements. (*extracted* and *inferred* elements get merged together to form the list of `Element`s returned by `pdf_partition()`) ## 0.17.5 diff --git a/test_unstructured/partition/pdf_image/test_pdf.py b/test_unstructured/partition/pdf_image/test_pdf.py index 7a0c8ff29c..70eec35fd7 100644 --- a/test_unstructured/partition/pdf_image/test_pdf.py +++ b/test_unstructured/partition/pdf_image/test_pdf.py @@ -1603,3 +1603,24 @@ def test_partition_pdf_with_specified_ocr_agents(mocker): assert spy.call_args_list[0][1] == {"language": "eng", "ocr_agent_module": OCR_AGENT_TESSERACT} assert spy.call_args_list[1][1] == {"language": "en", "ocr_agent_module": OCR_AGENT_PADDLE} + + +def test_reproductible_pdf_loader(): + from glob import glob + + for f in glob(example_doc_path("pdf/layout-parser-paper.pdf")): + elements_1 = pdf.partition_pdf( + filename=f, + strategy=PartitionStrategy.AUTO, + infer_table_structure=False, + ) + for _ in range(4): + elements_2 = pdf.partition_pdf( + filename=f, + strategy=PartitionStrategy.AUTO, + infer_table_structure=False, + ) + for e1, e2 in 
zip(elements_1, elements_2): + assert e1.text == e2.text, f"load two time {f=} return differents results" + else: + break diff --git a/unstructured/partition/utils/sorting.py b/unstructured/partition/utils/sorting.py index 8cdc885dd1..59d550958b 100644 --- a/unstructured/partition/utils/sorting.py +++ b/unstructured/partition/utils/sorting.py @@ -179,7 +179,6 @@ def _coords_ok(strict_points: bool): key=lambda el: ( el.metadata.coordinates.points[0][1] if el.metadata.coordinates else float("inf"), el.metadata.coordinates.points[0][0] if el.metadata.coordinates else float("inf"), - el.id, ), ) else: From 27f503ce3131ee01006205124c2e6484cf0510c5 Mon Sep 17 00:00:00 2001 From: Nathan <168383951+Nathan-GoSupply@users.noreply.github.com> Date: Tue, 8 Apr 2025 17:47:24 +1000 Subject: [PATCH 10/15] Update pdfminer_utils.py (#3974) Fix for 'PSSyntaxError' import error: "cannot import name 'PSSyntaxError' from 'pdfminer.pdfparser'" Latest pdfminer-six doesn't import PSSyntaxError into `pdfminer.pdfparser` anymore. It must now be directly imported from its source (`pdfminer.psexceptions`) --- CHANGELOG.md | 1 + unstructured/partition/pdf_image/pdfminer_utils.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4da58bbc9e..62ae488af3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,6 +19,7 @@ This makes it impossible to write stable unit tests, for example, or to obtain r ### Fixes - **Removed out of date ubuntu Dockerfile.** The Dockerfile was out of date and non-functional. +- **Fix for 'PSSyntaxError' import error: "cannot import name 'PSSyntaxError' from 'pdfminer.pdfparser'"** PSSyntaxError needed to be imported from its source 'pdfminer.psexceptions'. 
## 0.17.4 diff --git a/unstructured/partition/pdf_image/pdfminer_utils.py b/unstructured/partition/pdf_image/pdfminer_utils.py index ad6f981914..3993f41ae0 100644 --- a/unstructured/partition/pdf_image/pdfminer_utils.py +++ b/unstructured/partition/pdf_image/pdfminer_utils.py @@ -6,7 +6,7 @@ from pdfminer.layout import LAParams, LTContainer, LTImage, LTItem, LTTextLine from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager from pdfminer.pdfpage import PDFPage -from pdfminer.psparser import PSSyntaxError +from pdfminer.psexceptions import PSSyntaxError from pydantic import BaseModel from unstructured.logger import logger From fd9d796797d29648421e56880ee2938b8422c7e5 Mon Sep 17 00:00:00 2001 From: David Potter Date: Mon, 28 Apr 2025 17:58:05 -0700 Subject: [PATCH 11/15] fix cve (#3989) fix critical cve for h11. supposedly 0.16.0 fixes it. --------- Co-authored-by: Yao You Co-authored-by: Austin Walker Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: badGarnet --- CHANGELOG.md | 4 +- requirements/base.txt | 22 +- requirements/deps/constraints.txt | 2 + requirements/dev.txt | 6 +- requirements/extra-docx.txt | 4 +- requirements/extra-markdown.txt | 2 +- requirements/extra-odt.txt | 4 +- requirements/extra-paddleocr.txt | 33 +- requirements/extra-pdf-image.txt | 39 +- requirements/extra-pptx.txt | 8 +- requirements/huggingface.txt | 16 +- requirements/ingest/ingest.txt | 2 +- requirements/test.txt | 27 +- ...iomedical-Data-Scientists-2-pages.pdf.json | 4 +- .../azure/IRS-form-1987.pdf.json | 54 +- .../azure/IRS-form-1987.png.json | 30 +- .../azure/spring-weather.html.json | 2 +- .../handbook-1p.docx.json | 22 +- .../multi-column-2p.pdf.json | 14 +- .../fake-html-cp1252.html.json | 2 +- .../layout-parser-paper-with-table.jpg.json | 4 +- .../layout-parser-paper.pdf.json | 184 +- .../UDHR_first_article_all.txt.json | 712 +-- ...iomedical-Data-Scientists-2-pages.pdf.json | 4 +- 
.../biomed-api/65/11/main.PMC6312790.pdf.json | 4195 ----------------- .../biomed-api/75/29/main.PMC6312793.pdf.json | 2514 ---------- .../07/07/sbaa031.073.PMC7234218.pdf.json | 310 -- .../s3-minio/wiki_movie_plots_small.csv.json | 4 +- test_unstructured_ingest/src/against-api.sh | 5 +- test_unstructured_ingest/src/airtable-diff.sh | 5 +- .../src/airtable-large.sh | 5 +- test_unstructured_ingest/src/astradb.sh | 5 +- test_unstructured_ingest/src/azure.sh | 5 +- test_unstructured_ingest/src/biomed-api.sh | 5 +- test_unstructured_ingest/src/biomed-path.sh | 5 +- test_unstructured_ingest/src/box.sh | 9 +- .../src/confluence-diff.sh | 5 +- .../src/confluence-large.sh | 5 +- test_unstructured_ingest/src/delta-table.sh | 5 +- test_unstructured_ingest/src/discord.sh | 5 +- test_unstructured_ingest/src/dropbox.sh | 5 +- test_unstructured_ingest/src/elasticsearch.sh | 5 +- test_unstructured_ingest/src/gcs.sh | 5 +- test_unstructured_ingest/src/github.sh | 9 +- test_unstructured_ingest/src/gitlab.sh | 5 +- test_unstructured_ingest/src/google-drive.sh | 5 +- test_unstructured_ingest/src/hubspot.sh | 5 +- test_unstructured_ingest/src/jira.sh | 5 +- test_unstructured_ingest/src/kafka-local.sh | 5 +- .../src/local-embed-bedrock.sh | 5 +- .../src/local-embed-mixedbreadai.sh | 5 +- .../src/local-embed-octoai.sh | 5 +- .../src/local-embed-vertexai.sh | 5 +- .../src/local-embed-voyageai.sh | 5 +- test_unstructured_ingest/src/local-embed.sh | 5 +- .../src/local-failed-partition.sh | 5 +- .../src/local-single-file-basic-chunking.sh | 5 +- ...ocal-single-file-chunk-no-orig-elements.sh | 5 +- .../src/local-single-file-with-encoding.sh | 5 +- ...gle-file-with-pdf-infer-table-structure.sh | 5 +- .../src/local-single-file.sh | 5 +- test_unstructured_ingest/src/local.sh | 5 +- test_unstructured_ingest/src/mongodb.sh | 5 +- test_unstructured_ingest/src/notion.sh | 5 +- test_unstructured_ingest/src/onedrive.sh | 5 +- test_unstructured_ingest/src/opensearch.sh | 5 +- 
test_unstructured_ingest/src/outlook.sh | 5 +- .../src/pdf-fast-reprocess.sh | 5 +- .../src/s3-compression.sh | 5 +- test_unstructured_ingest/src/s3-minio.sh | 5 +- test_unstructured_ingest/src/s3.sh | 5 +- test_unstructured_ingest/src/salesforce.sh | 5 +- test_unstructured_ingest/src/sftp.sh | 5 +- .../src/sharepoint-with-permissions.sh | 5 +- test_unstructured_ingest/src/sharepoint.sh | 5 +- test_unstructured_ingest/src/slack.sh | 5 +- test_unstructured_ingest/src/wikipedia.sh | 5 +- test_unstructured_ingest/test-ingest-src.sh | 42 - unstructured/__version__.py | 2 +- 79 files changed, 761 insertions(+), 7760 deletions(-) delete mode 100644 test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/biomed-api/65/11/main.PMC6312790.pdf.json delete mode 100644 test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/biomed-api/75/29/main.PMC6312793.pdf.json delete mode 100644 test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/biomed-path/07/07/sbaa031.073.PMC7234218.pdf.json diff --git a/CHANGELOG.md b/CHANGELOG.md index 62ae488af3..ad5dea531f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.17.6-dev1 +## 0.17.6-dev2 ### Enhancements @@ -9,6 +9,8 @@ Two executions of the same code, on the same file, produce different results. The order of the elements is random. This makes it impossible to write stable unit tests, for example, or to obtain reproducible results. - **Do not use NLP to determine element types for extracted elements with hi_res.** This avoids extraneous Title elements in hi_res outputs. This only applies to *extracted* elements, meaning text objects that are found outside of Object Detection objects which get mapped to *inferred* elements. 
(*extracted* and *inferred* elements get merged together to form the list of `Element`s returned by `pdf_partition()`) +- Resolve open CVEs + ## 0.17.5 diff --git a/requirements/base.txt b/requirements/base.txt index 78fc8ce871..862ed52ff9 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -8,9 +8,9 @@ anyio==4.9.0 # via httpx backoff==2.2.1 # via -r ./base.in -beautifulsoup4==4.13.3 +beautifulsoup4==4.13.4 # via -r ./base.in -certifi==2025.1.31 +certifi==2025.4.26 # via # httpcore # httpx @@ -42,11 +42,11 @@ exceptiongroup==1.2.2 # via anyio filetype==1.2.0 # via -r ./base.in -h11==0.14.0 +h11==0.16.0 # via httpcore html5lib==1.1 # via -r ./base.in -httpcore==1.0.7 +httpcore==1.0.9 # via httpx httpx==0.28.1 # via unstructured-client @@ -62,13 +62,13 @@ jsonpath-python==1.0.6 # via unstructured-client langdetect==1.0.9 # via -r ./base.in -lxml==5.3.1 +lxml==5.4.0 # via -r ./base.in marshmallow==3.26.1 # via # dataclasses-json # unstructured-client -mypy-extensions==1.0.0 +mypy-extensions==1.1.0 # via # typing-inspect # unstructured-client @@ -80,9 +80,9 @@ numpy==2.0.2 # via -r ./base.in olefile==0.47 # via python-oxmsg -orderly-set==5.3.0 +orderly-set==5.4.0 # via deepdiff -packaging==24.2 +packaging==25.0 # via # marshmallow # unstructured-client @@ -100,7 +100,7 @@ python-magic==0.4.27 # via -r ./base.in python-oxmsg==0.0.2 # via -r ./base.in -rapidfuzz==3.12.2 +rapidfuzz==3.13.0 # via -r ./base.in regex==2024.11.6 # via nltk @@ -119,13 +119,13 @@ six==1.17.0 # unstructured-client sniffio==1.3.1 # via anyio -soupsieve==2.6 +soupsieve==2.7 # via beautifulsoup4 tqdm==4.67.1 # via # -r ./base.in # nltk -typing-extensions==4.13.0 +typing-extensions==4.13.2 # via # -r ./base.in # anyio diff --git a/requirements/deps/constraints.txt b/requirements/deps/constraints.txt index be1d0c40fd..9659e8bac1 100644 --- a/requirements/deps/constraints.txt +++ b/requirements/deps/constraints.txt @@ -22,3 +22,5 @@ importlib-metadata>=8.5.0 
unstructured-client>=0.23.0,<0.26.0 # paddle constrains protobuf; maybe we should put paddle here since its version is pinned in .in file protobuf>=6.30.0 +# (yao) issues with pdfminer-six above 20250416 +pdfminer.six<20250416 \ No newline at end of file diff --git a/requirements/dev.txt b/requirements/dev.txt index 4b489656fb..b42ff70e01 100644 --- a/requirements/dev.txt +++ b/requirements/dev.txt @@ -17,7 +17,7 @@ distlib==0.3.9 # via virtualenv filelock==3.18.0 # via virtualenv -identify==2.6.9 +identify==2.6.10 # via pre-commit importlib-metadata==8.6.1 # via @@ -25,7 +25,7 @@ importlib-metadata==8.6.1 # build nodeenv==1.9.1 # via pre-commit -packaging==24.2 +packaging==25.0 # via # -c ./base.txt # -c ./test.txt @@ -49,7 +49,7 @@ tomli==2.2.1 # -c ./test.txt # build # pip-tools -virtualenv==20.29.3 +virtualenv==20.30.0 # via pre-commit wheel==0.45.1 # via pip-tools diff --git a/requirements/extra-docx.txt b/requirements/extra-docx.txt index b6a9158f4f..f31b78b82a 100644 --- a/requirements/extra-docx.txt +++ b/requirements/extra-docx.txt @@ -4,13 +4,13 @@ # # pip-compile ./extra-docx.in # -lxml==5.3.1 +lxml==5.4.0 # via # -c ./base.txt # python-docx python-docx==1.1.2 # via -r ./extra-docx.in -typing-extensions==4.13.0 +typing-extensions==4.13.2 # via # -c ./base.txt # python-docx diff --git a/requirements/extra-markdown.txt b/requirements/extra-markdown.txt index 9d0a14da55..2311bce60f 100644 --- a/requirements/extra-markdown.txt +++ b/requirements/extra-markdown.txt @@ -8,7 +8,7 @@ importlib-metadata==8.6.1 # via # -c ././deps/constraints.txt # markdown -markdown==3.7 +markdown==3.8 # via -r ./extra-markdown.in zipp==3.21.0 # via importlib-metadata diff --git a/requirements/extra-odt.txt b/requirements/extra-odt.txt index fa8e746301..ced65cd542 100644 --- a/requirements/extra-odt.txt +++ b/requirements/extra-odt.txt @@ -4,7 +4,7 @@ # # pip-compile ./extra-odt.in # -lxml==5.3.1 +lxml==5.4.0 # via # -c ./base.txt # python-docx @@ -12,7 +12,7 @@ pypandoc==1.15 # 
via -r ./extra-odt.in python-docx==1.1.2 # via -r ./extra-odt.in -typing-extensions==4.13.0 +typing-extensions==4.13.2 # via # -c ./base.txt # python-docx diff --git a/requirements/extra-paddleocr.txt b/requirements/extra-paddleocr.txt index 84afee5161..df43fc8f9b 100644 --- a/requirements/extra-paddleocr.txt +++ b/requirements/extra-paddleocr.txt @@ -18,11 +18,11 @@ anyio==4.9.0 # httpx astor==0.8.1 # via paddlepaddle -beautifulsoup4==4.13.3 +beautifulsoup4==4.13.4 # via # -c ./base.txt # unstructured-paddleocr -certifi==2025.1.31 +certifi==2025.4.26 # via # -c ./base.txt # httpcore @@ -44,13 +44,13 @@ exceptiongroup==1.2.2 # anyio fire==0.7.0 # via unstructured-paddleocr -fonttools==4.56.0 +fonttools==4.57.0 # via unstructured-paddleocr -h11==0.14.0 +h11==0.16.0 # via # -c ./base.txt # httpcore -httpcore==1.0.7 +httpcore==1.0.9 # via # -c ./base.txt # httpx @@ -68,7 +68,7 @@ imageio==2.37.0 # via scikit-image lazy-loader==0.4 # via scikit-image -lxml==5.3.1 +lxml==5.4.0 # via # -c ./base.txt # python-docx @@ -102,14 +102,14 @@ opencv-python-headless==4.11.0.86 # albumentations opt-einsum==3.3.0 # via paddlepaddle -packaging==24.2 +packaging==25.0 # via # -c ./base.txt # lazy-loader # scikit-image paddlepaddle==3.0.0 # via -r ./extra-paddleocr.in -pillow==11.1.0 +pillow==11.2.1 # via # imageio # paddlepaddle @@ -121,9 +121,9 @@ protobuf==6.30.2 # paddlepaddle pyclipper==1.3.0.post6 # via unstructured-paddleocr -pydantic==2.10.6 +pydantic==2.11.3 # via albumentations -pydantic-core==2.27.2 +pydantic-core==2.33.1 # via pydantic python-docx==1.1.2 # via unstructured-paddleocr @@ -131,7 +131,7 @@ pyyaml==6.0.2 # via # albumentations # unstructured-paddleocr -rapidfuzz==3.12.2 +rapidfuzz==3.13.0 # via # -c ./base.txt # unstructured-paddleocr @@ -153,13 +153,13 @@ sniffio==1.3.1 # via # -c ./base.txt # anyio -soupsieve==2.6 +soupsieve==2.7 # via # -c ./base.txt # beautifulsoup4 -stringzilla==3.12.3 +stringzilla==3.12.5 # via albucore -termcolor==2.5.0 +termcolor==3.0.1 
# via fire tifffile==2024.8.30 # via scikit-image @@ -167,7 +167,7 @@ tqdm==4.67.1 # via # -c ./base.txt # unstructured-paddleocr -typing-extensions==4.13.0 +typing-extensions==4.13.2 # via # -c ./base.txt # albucore @@ -178,6 +178,9 @@ typing-extensions==4.13.0 # pydantic # pydantic-core # python-docx + # typing-inspection +typing-inspection==0.4.0 + # via pydantic unstructured-paddleocr==2.10.0 # via -r ./extra-paddleocr.in urllib3==1.26.20 diff --git a/requirements/extra-pdf-image.txt b/requirements/extra-pdf-image.txt index 061fb6de3b..367924c7d6 100644 --- a/requirements/extra-pdf-image.txt +++ b/requirements/extra-pdf-image.txt @@ -8,7 +8,7 @@ antlr4-python3-runtime==4.9.3 # via omegaconf cachetools==5.5.2 # via google-auth -certifi==2025.1.31 +certifi==2025.4.26 # via # -c ./base.txt # requests @@ -42,21 +42,21 @@ filelock==3.18.0 # transformers flatbuffers==25.2.10 # via onnxruntime -fonttools==4.56.0 +fonttools==4.57.0 # via matplotlib -fsspec==2025.3.0 +fsspec==2025.3.2 # via # huggingface-hub # torch google-api-core[grpc]==2.24.2 # via google-cloud-vision -google-auth==2.38.0 +google-auth==2.39.0 # via # google-api-core # google-cloud-vision google-cloud-vision==3.10.1 # via -r ./extra-pdf-image.in -googleapis-common-protos==1.69.2 +googleapis-common-protos==1.70.0 # via # google-api-core # grpcio-status @@ -67,7 +67,7 @@ grpcio==1.71.0 # grpcio-status grpcio-status==1.62.3 # via google-api-core -huggingface-hub==0.29.3 +huggingface-hub==0.30.2 # via # timm # tokenizers @@ -85,7 +85,7 @@ jinja2==3.1.6 # via torch kiwisolver==1.4.7 # via matplotlib -lxml==5.3.1 +lxml==5.4.0 # via # -c ./base.txt # pikepdf @@ -125,7 +125,7 @@ onnxruntime==1.19.2 # unstructured-inference opencv-python==4.11.0.86 # via unstructured-inference -packaging==24.2 +packaging==25.0 # via # -c ./base.txt # huggingface-hub @@ -138,15 +138,16 @@ pandas==2.2.3 # via unstructured-inference pdf2image==1.17.0 # via -r ./extra-pdf-image.in -pdfminer-six==20240706 +pdfminer-six==20250327 # 
via + # -c ././deps/constraints.txt # -r ./extra-pdf-image.in # unstructured-inference pi-heif==0.22.0 # via -r ./extra-pdf-image.in -pikepdf==9.5.2 +pikepdf==9.7.0 # via -r ./extra-pdf-image.in -pillow==11.1.0 +pillow==11.2.1 # via # matplotlib # pdf2image @@ -172,7 +173,7 @@ pyasn1==0.6.1 # via # pyasn1-modules # rsa -pyasn1-modules==0.4.1 +pyasn1-modules==0.4.2 # via google-auth pycocotools==2.0.8 # via effdet @@ -203,7 +204,7 @@ pyyaml==6.0.2 # omegaconf # timm # transformers -rapidfuzz==3.12.2 +rapidfuzz==3.13.0 # via # -c ./base.txt # unstructured-inference @@ -217,7 +218,7 @@ requests==2.32.3 # google-api-core # huggingface-hub # transformers -rsa==4.9 +rsa==4.9.1 # via google-auth safetensors==0.5.3 # via @@ -229,7 +230,7 @@ six==1.17.0 # via # -c ./base.txt # python-dateutil -sympy==1.13.1 +sympy==1.13.3 # via # onnxruntime # torch @@ -241,13 +242,13 @@ tokenizers==0.21.1 # via # -c ././deps/constraints.txt # transformers -torch==2.6.0 +torch==2.7.0 # via # effdet # timm # torchvision # unstructured-inference -torchvision==0.21.0 +torchvision==0.22.0 # via # effdet # timm @@ -256,9 +257,9 @@ tqdm==4.67.1 # -c ./base.txt # huggingface-hub # transformers -transformers==4.50.1 +transformers==4.51.3 # via unstructured-inference -typing-extensions==4.13.0 +typing-extensions==4.13.2 # via # -c ./base.txt # huggingface-hub diff --git a/requirements/extra-pptx.txt b/requirements/extra-pptx.txt index 30e77d1ce7..7ec19718d8 100644 --- a/requirements/extra-pptx.txt +++ b/requirements/extra-pptx.txt @@ -4,13 +4,13 @@ # # pip-compile ./extra-pptx.in # -lxml==5.3.1 +lxml==5.4.0 # via python-pptx -pillow==11.1.0 +pillow==11.2.1 # via python-pptx python-pptx==1.0.2 # via -r ./extra-pptx.in -typing-extensions==4.13.0 +typing-extensions==4.13.2 # via python-pptx -xlsxwriter==3.2.2 +xlsxwriter==3.2.3 # via python-pptx diff --git a/requirements/huggingface.txt b/requirements/huggingface.txt index f9e62f5266..a7c793c739 100644 --- a/requirements/huggingface.txt +++ 
b/requirements/huggingface.txt @@ -4,7 +4,7 @@ # # pip-compile ./huggingface.in # -certifi==2025.1.31 +certifi==2025.4.26 # via # -c ./base.txt # requests @@ -21,11 +21,11 @@ filelock==3.18.0 # huggingface-hub # torch # transformers -fsspec==2025.3.0 +fsspec==2025.3.2 # via # huggingface-hub # torch -huggingface-hub==0.29.3 +huggingface-hub==0.30.2 # via # tokenizers # transformers @@ -53,7 +53,7 @@ numpy==2.0.2 # via # -c ./base.txt # transformers -packaging==24.2 +packaging==25.0 # via # -c ./base.txt # huggingface-hub @@ -82,13 +82,13 @@ six==1.17.0 # via # -c ./base.txt # langdetect -sympy==1.13.1 +sympy==1.13.3 # via torch tokenizers==0.21.1 # via # -c ././deps/constraints.txt # transformers -torch==2.6.0 +torch==2.7.0 # via -r ./huggingface.in tqdm==4.67.1 # via @@ -96,9 +96,9 @@ tqdm==4.67.1 # huggingface-hub # sacremoses # transformers -transformers==4.50.1 +transformers==4.51.3 # via -r ./huggingface.in -typing-extensions==4.13.0 +typing-extensions==4.13.2 # via # -c ./base.txt # huggingface-hub diff --git a/requirements/ingest/ingest.txt b/requirements/ingest/ingest.txt index 6c99d3cfcd..364f499029 100644 --- a/requirements/ingest/ingest.txt +++ b/requirements/ingest/ingest.txt @@ -1,4 +1,4 @@ -unstructured-ingest[airtable, astradb, azure, azure-cognitive-search, bedrock, biomed, box, chroma, clarifai, confluence, couchbase, databricks-volumes, delta-table, discord, dropbox, elasticsearch, embed-huggingface, embed-octoai, embed-vertexai, embed-voyageai, gcs, github, gitlab, google-drive, hubspot, jira, kafka, kdbai, milvus, mongodb, notion, onedrive, openai, opensearch, outlook, pinecone, postgres, qdrant, reddit, remote, s3, salesforce, sftp, sharepoint, singlestore, slack, vectara, weaviate, wikipedia]==0.2.1 +unstructured-ingest[airtable, astradb, azure, azure-cognitive-search, bedrock, biomed, box, chroma, clarifai, confluence, couchbase, databricks-volumes, delta-table, discord, dropbox, elasticsearch, embed-huggingface, embed-octoai, embed-vertexai, 
embed-voyageai, gcs, github, gitlab, google-drive, hubspot, jira, kafka, kdbai, milvus, mongodb, notion, onedrive, openai, opensearch, outlook, pinecone, postgres, qdrant, reddit, remote, s3, salesforce, sftp, sharepoint, singlestore, slack, vectara, weaviate, wikipedia]>=0.2.1 s3fs>=2024.9.0 urllib3>=1.26.20 backoff>=2.2.1 diff --git a/requirements/test.txt b/requirements/test.txt index 1ebccc8953..2706ac725c 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -14,7 +14,7 @@ click==8.1.8 # via # -c ./base.txt # black -coverage[toml]==7.7.1 +coverage[toml]==7.8.0 # via # -r ./test.in # pytest-cov @@ -22,7 +22,7 @@ exceptiongroup==1.2.2 # via # -c ./base.txt # pytest -flake8==7.1.2 +flake8==7.2.0 # via # -r ./test.in # flake8-print @@ -42,12 +42,12 @@ mccabe==0.7.0 # via flake8 mypy==1.15.0 # via -r ./test.in -mypy-extensions==1.0.0 +mypy-extensions==1.1.0 # via # -c ./base.txt # black # mypy -packaging==24.2 +packaging==25.0 # via # -c ./base.txt # black @@ -58,15 +58,15 @@ platformdirs==4.3.7 # via black pluggy==1.5.0 # via pytest -pycodestyle==2.12.1 +pycodestyle==2.13.0 # via # flake8 # flake8-print -pydantic==2.10.6 +pydantic==2.11.3 # via -r ./test.in -pydantic-core==2.27.2 +pydantic-core==2.33.1 # via pydantic -pyflakes==3.2.0 +pyflakes==3.3.2 # via # autoflake # flake8 @@ -74,7 +74,7 @@ pytest==8.3.5 # via # pytest-cov # pytest-mock -pytest-cov==6.0.0 +pytest-cov==6.1.1 # via -r ./test.in pytest-mock==3.14.0 # via -r ./test.in @@ -82,7 +82,7 @@ python-dateutil==2.9.0.post0 # via # -c ./base.txt # freezegun -ruff==0.11.2 +ruff==0.11.7 # via -r ./test.in semantic-version==2.10.0 # via liccheck @@ -101,7 +101,7 @@ tomli==2.2.1 # pytest types-click==7.1.8 # via -r ./test.in -types-markdown==3.7.0.20250322 +types-markdown==3.8.0.20250415 # via -r ./test.in types-requests==2.31.0.6 # via -r ./test.in @@ -109,10 +109,13 @@ types-tabulate==0.9.0.20241207 # via -r ./test.in types-urllib3==1.26.25.14 # via types-requests -typing-extensions==4.13.0 
+typing-extensions==4.13.2 # via # -c ./base.txt # black # mypy # pydantic # pydantic-core + # typing-inspection +typing-inspection==0.4.0 + # via pydantic diff --git a/test_unstructured_ingest/expected-structured-output/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.json b/test_unstructured_ingest/expected-structured-output/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.json index 06e6a90097..24c362f451 100644 --- a/test_unstructured_ingest/expected-structured-output/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.json @@ -200,7 +200,7 @@ { "type": "ListItem", "element_id": "36eb8f3c3778fbb71dc056571e71175d", - "text": "4. Team science and scientific communication: \u201csoft\u201d skills, like the ability to work well on teams and communicate effectively in both verbal and written venues, may be as important as the more technical skills typically associated with data science.", + "text": "4. Team science and scientific communication: “soft” skills, like the ability to work well on teams and communicate effectively in both verbal and written venues, may be as important as the more technical skills typically associated with data science.", "metadata": { "filetype": "application/pdf", "languages": [ @@ -288,7 +288,7 @@ { "type": "NarrativeText", "element_id": "f250e86931949c66fe99d742fd9be29c", - "text": "Training a biomedical data science (BDS) workforce is a central theme in NLM\u2019s Strategic Plan for the coming decade. That commitment is echoed in the NIH-wide Big Data to Knowledge (BD2K) initiative, which invested $61 million between FY2014 and FY2017 in training programs for the development and use of biomedical big data science methods and tools. In line with", + "text": "Training a biomedical data science (BDS) workforce is a central theme in NLM’s Strategic Plan for the coming decade. 
That commitment is echoed in the NIH-wide Big Data to Knowledge (BD2K) initiative, which invested $61 million between FY2014 and FY2017 in training programs for the development and use of biomedical big data science methods and tools. In line with", "metadata": { "filetype": "application/pdf", "languages": [ diff --git a/test_unstructured_ingest/expected-structured-output/azure/IRS-form-1987.pdf.json b/test_unstructured_ingest/expected-structured-output/azure/IRS-form-1987.pdf.json index cca8a4dd1c..12255b00e7 100644 --- a/test_unstructured_ingest/expected-structured-output/azure/IRS-form-1987.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/azure/IRS-form-1987.pdf.json @@ -178,7 +178,7 @@ { "type": "NarrativeText", "element_id": "0fb8eb24db1b27f6f8b69213e3dd9b41", - "text": "Long-term contracts. \u2014If you are required to change your method of accounting for long-term contracts under section 460, see Notice 87-61 (9/21/87), 1987-38 IRB 40, for the notification procedures that must be followed.", + "text": "Long-term contracts. —If you are required to change your method of accounting for long-term contracts under section 460, see Notice 87-61 (9/21/87), 1987-38 IRB 40, for the notification procedures that must be followed.", "metadata": { "filetype": "application/pdf", "languages": [ @@ -200,7 +200,7 @@ { "type": "NarrativeText", "element_id": "7282f497b067ed1e34176cc85d46ea8e", - "text": "Other methods.\u2014Unless the Service has published a regulation or procedure to the contrary, all other changes !n accounting methods required by the Act are automatically considered to be approved by the Commissioner. 
Examples of method changes automatically approved by the Commissioner are those changes required to effect: (1) the repeal of the reserve method for bad debts of taxpayers other than financial institutions (Act section 805); (2) the repeal of the installment method for sales under a revolving credit plan (Act section 812); (3) the Inclusion of income attributable to the sale or furnishing of utility services no later than the year In which the services were provided to customers (Act section 821); and (4) the repeal of the deduction for qualified discount coupons (Act section 823). Do not file Form 3115 for these changes.", + "text": "Other methods.—Unless the Service has published a regulation or procedure to the contrary, all other changes !n accounting methods required by the Act are automatically considered to be approved by the Commissioner. Examples of method changes automatically approved by the Commissioner are those changes required to effect: (1) the repeal of the reserve method for bad debts of taxpayers other than financial institutions (Act section 805); (2) the repeal of the installment method for sales under a revolving credit plan (Act section 812); (3) the Inclusion of income attributable to the sale or furnishing of utility services no later than the year In which the services were provided to customers (Act section 821); and (4) the repeal of the deduction for qualified discount coupons (Act section 823). 
Do not file Form 3115 for these changes.", "metadata": { "filetype": "application/pdf", "languages": [ @@ -310,7 +310,7 @@ { "type": "NarrativeText", "element_id": "9256e7591256b6799035172da259b839", - "text": "Uniform capitalization rules and limitation on cash method.\u2014If you are required to change your method of accounting under section,263A (relating to the capitalization and inclusion in inventory costs of certain expenses) or 448 (limiting the use of the cash method of accounting by certain taxpayers) as added by the Tax Reform Act of 1986 (\u201cAct\u201d), the change 1s treated as initiated by the taxpayer, approved by the Commissioner, and the period for taking the adjustments under section 481(a) into account will not exceed 4 years. (Hospitals required to change from the cash method under section 448 have 10 years to take the adjustrnents into account.) Complete Section A and the appropriate sections (B-1 or C and D) for which the change is required.", + "text": "Uniform capitalization rules and limitation on cash method.—If you are required to change your method of accounting under section,263A (relating to the capitalization and inclusion in inventory costs of certain expenses) or 448 (limiting the use of the cash method of accounting by certain taxpayers) as added by the Tax Reform Act of 1986 (“Act”), the change 1s treated as initiated by the taxpayer, approved by the Commissioner, and the period for taking the adjustments under section 481(a) into account will not exceed 4 years. (Hospitals required to change from the cash method under section 448 have 10 years to take the adjustrnents into account.) Complete Section A and the appropriate sections (B-1 or C and D) for which the change is required.", "metadata": { "filetype": "application/pdf", "languages": [ @@ -332,7 +332,7 @@ { "type": "NarrativeText", "element_id": "9951e8eac8f909df08655f3bc100a586", - "text": "Disregard the instructions under Time and Place for Filing and Late Applications. 
Instead, attach Form 3115 to your income tax return for the year of change; do not file it separately. Also include on a separate statement accompanying the Form 3115 the period over which the section 481(a) adjustment will be taken into account and the basis for that conclusion. Identify the automatic change being made at the top of page 1 of Form 3115 (e.g., \u201cAutomatic Change to Accrual Method\u2014Section 448\"). See Temporary Regulations sections 1.263A-1T and 1.448-1T for additional information.", + "text": "Disregard the instructions under Time and Place for Filing and Late Applications. Instead, attach Form 3115 to your income tax return for the year of change; do not file it separately. Also include on a separate statement accompanying the Form 3115 the period over which the section 481(a) adjustment will be taken into account and the basis for that conclusion. Identify the automatic change being made at the top of page 1 of Form 3115 (e.g., “Automatic Change to Accrual Method—Section 448\"). See Temporary Regulations sections 1.263A-1T and 1.448-1T for additional information.", "metadata": { "filetype": "application/pdf", "languages": [ @@ -508,7 +508,7 @@ { "type": "NarrativeText", "element_id": "c92c7f4def0263141b370bf307d6bcc0", - "text": "If your application is filed after the 180-day period, it is late. The application will be considered for processing only upon a showing of \u201cgood cause\u201d and if it can be shown to the satisfaction of the Commissioner that granting you an extension will not jeopardize the Government's interests. For further information, see Rev, Proc. 79-63.", + "text": "If your application is filed after the 180-day period, it is late. The application will be considered for processing only upon a showing of “good cause” and if it can be shown to the satisfaction of the Commissioner that granting you an extension will not jeopardize the Government's interests. For further information, see Rev, Proc. 
79-63.", "metadata": { "filetype": "application/pdf", "languages": [ @@ -552,7 +552,7 @@ { "type": "NarrativeText", "element_id": "2932b94008de341f867fe6cfa1c95969", - "text": "Individuals.\u2014An individual should enter his or her social security number in this block. If the application is made on behalf of a husband and wife who file their income tax return jointly, enter the social security numbers of both.", + "text": "Individuals.—An individual should enter his or her social security number in this block. If the application is made on behalf of a husband and wife who file their income tax return jointly, enter the social security numbers of both.", "metadata": { "filetype": "application/pdf", "languages": [ @@ -574,7 +574,7 @@ { "type": "NarrativeText", "element_id": "4a9b9ec8ba60e739f49cfd240aa4439f", - "text": "Others.-\u2014The employer identification number of an applicant other than an individual should be entered in this block.", + "text": "Others.-—The employer identification number of an applicant other than an individual should be entered in this block.", "metadata": { "filetype": "application/pdf", "languages": [ @@ -618,7 +618,7 @@ { "type": "NarrativeText", "element_id": "3c683355b205b83c4c0d3437e6cfa7e1", - "text": "Individuals. \u2014An individual desiring the change should sign the application. If the application pertains to a husband and wife filing a joint income tax return, the names of both should appear in the heading and both should sign.", + "text": "Individuals. —An individual desiring the change should sign the application. 
If the application pertains to a husband and wife filing a joint income tax return, the names of both should appear in the heading and both should sign.", "metadata": { "filetype": "application/pdf", "languages": [ @@ -640,7 +640,7 @@ { "type": "NarrativeText", "element_id": "e8cfd8f6db0442ba89ebad7a26a61fe9", - "text": "Partnerships.\u2014The form should be signed with the partnership name followed by the signature of one of the general partners and the words \u201cGeneral Partner.\u201d", + "text": "Partnerships.—The form should be signed with the partnership name followed by the signature of one of the general partners and the words “General Partner.”", "metadata": { "filetype": "application/pdf", "languages": [ @@ -662,7 +662,7 @@ { "type": "NarrativeText", "element_id": "28ac207401b182955c7f456e4ed569e7", - "text": "Corporations, cooperatives, and insurance companies.\u2014The form should show the name of the corporation, cooperative, or insurance company and the signature of the president, vice president, treasurer, assistant treasurer, or chief accounting officer (such as tax officer) authorized to sign, and his or her official title. Receivers, trustees, or assignees must sign any application they are required to file. For a subsidiary corporation filing a consolidated return with its parent, the form should be signed by an officer of the parent corporation.", + "text": "Corporations, cooperatives, and insurance companies.—The form should show the name of the corporation, cooperative, or insurance company and the signature of the president, vice president, treasurer, assistant treasurer, or chief accounting officer (such as tax officer) authorized to sign, and his or her official title. Receivers, trustees, or assignees must sign any application they are required to file. 
For a subsidiary corporation filing a consolidated return with its parent, the form should be signed by an officer of the parent corporation.", "metadata": { "filetype": "application/pdf", "languages": [ @@ -684,7 +684,7 @@ { "type": "NarrativeText", "element_id": "0bb2ae65d2e8e2d6deafb8a0b8ca959e", - "text": "Fiduciaries.\u2014The-form should show the name of the estate or trust and be signed by the fiduciary, personal representative, executor, executrix, administrator, administratrix, etc., having legal authority to sign, and his or her title.", + "text": "Fiduciaries.—The-form should show the name of the estate or trust and be signed by the fiduciary, personal representative, executor, executrix, administrator, administratrix, etc., having legal authority to sign, and his or her title.", "metadata": { "filetype": "application/pdf", "languages": [ @@ -706,7 +706,7 @@ { "type": "NarrativeText", "element_id": "76d50bc1b5843d10ec33f0dd669e0158", - "text": "Preparer other than partner, officer, etc.\u2014The signature of the individual preparing the application should appear in the space provided on page 6.", + "text": "Preparer other than partner, officer, etc.—The signature of the individual preparing the application should appear in the space provided on page 6.", "metadata": { "filetype": "application/pdf", "languages": [ @@ -838,7 +838,7 @@ { "type": "NarrativeText", "element_id": "43c45bb43eaf69131bf2392df1239ef2", - "text": "Item 5a, page 1.\u2014\u201cTaxable income or (loss) from operations\u201d is to be entered before application of any net operating loss deduction under section 172(a).", + "text": "Item 5a, page 1.—“Taxable income or (loss) from operations” is to be entered before application of any net operating loss deduction under section 172(a).", "metadata": { "filetype": "application/pdf", "languages": [ @@ -860,7 +860,7 @@ { "type": "NarrativeText", "element_id": "be68edc9cf1c170006855414e15dcb72", - "text": "Item 6, page 2.\u2014The term 
\u201cgross receipts\u201d includes total sales (net of returns and allowances) and all amounts received for services. In addition, gross receipts include any income from investments and from incidental or outside sources (e.g., interest, dividends, rents, royalties, and annuities). However, if you are a resaler of personal property, exclude from gross receipts any amounts not derived in the ordinary course of a trade or business. Gross receipts do not include amounts received for sales taxes if, under the applicable state or local law, the tax is legally imposed on the purchaser of the good or service, and the taxpayer merely collects and remits the tax to the taxing authority.", + "text": "Item 6, page 2.—The term “gross receipts” includes total sales (net of returns and allowances) and all amounts received for services. In addition, gross receipts include any income from investments and from incidental or outside sources (e.g., interest, dividends, rents, royalties, and annuities). However, if you are a resaler of personal property, exclude from gross receipts any amounts not derived in the ordinary course of a trade or business. 
Gross receipts do not include amounts received for sales taxes if, under the applicable state or local law, the tax is legally imposed on the purchaser of the good or service, and the taxpayer merely collects and remits the tax to the taxing authority.", "metadata": { "filetype": "application/pdf", "languages": [ @@ -882,7 +882,7 @@ { "type": "NarrativeText", "element_id": "31b52f9f7ca8d75190858bf0d55805db", - "text": "Item 7b, page 2.\u2014If item 7b 1s \u201cYes,\u201d indicate on a separate sheet the following for each separate trade or business: Nature of business", + "text": "Item 7b, page 2.—If item 7b 1s “Yes,” indicate on a separate sheet the following for each separate trade or business: Nature of business", "metadata": { "filetype": "application/pdf", "languages": [ @@ -926,7 +926,7 @@ { "type": "NarrativeText", "element_id": "28d8006c1f48ce2aec42391c8318fc8a", - "text": "Item 11, page 2.\u2014If you cannot provide the requested information, you may sign a statement under penalties of perjury that:", + "text": "Item 11, page 2.—If you cannot provide the requested information, you may sign a statement under penalties of perjury that:", "metadata": { "filetype": "application/pdf", "languages": [ @@ -1036,7 +1036,7 @@ { "type": "NarrativeText", "element_id": "e5f591cf708bf2cae8df5018db1f3b1e", - "text": "Item 13, page 2.\u2014Insert the actual number of tax years. Use of the term \u201csince inception\u201d 1s not acceptable. However, \u201cmore than 6 years\u201d Is acceptable.", + "text": "Item 13, page 2.—Insert the actual number of tax years. Use of the term “since inception” 1s not acceptable. 
However, “more than 6 years” Is acceptable.", "metadata": { "filetype": "application/pdf", "languages": [ @@ -1080,7 +1080,7 @@ { "type": "NarrativeText", "element_id": "be4be7fe105304e8063250e9e8933b50", - "text": "Item 1b, page 2.\u2014Include any amounts reported as income ina prior year although the income had not been accrued (earned) or received in the prior year; for example, discount on installment loans reported as income for the year in which the loans were made instead of for the year or years in which the income was received or earned. Advance payments under Rev. Proc. 71-21 or Regulations section 1.451-5 must be fully explained and all pertinent information must be submitted with this application.", + "text": "Item 1b, page 2.—Include any amounts reported as income ina prior year although the income had not been accrued (earned) or received in the prior year; for example, discount on installment loans reported as income for the year in which the loans were made instead of for the year or years in which the income was received or earned. Advance payments under Rev. Proc. 71-21 or Regulations section 1.451-5 must be fully explained and all pertinent information must be submitted with this application.", "metadata": { "filetype": "application/pdf", "languages": [ @@ -1124,7 +1124,7 @@ { "type": "NarrativeText", "element_id": "ff41c26e5658894786749ca6449cff67", - "text": "Limitation on the Use of the Cash Method of Accounting. \u2014Except as provided below, C corporations, partnerships with a C corporation as a partner, and tax shelters may not use the cash method of accounting. For purposes of this limitation, a trust subject to the tax on unrelated business income under section 511 1s treated as aC corporation with respect to its unrelated trade or business activities.", + "text": "Limitation on the Use of the Cash Method of Accounting. 
—Except as provided below, C corporations, partnerships with a C corporation as a partner, and tax shelters may not use the cash method of accounting. For purposes of this limitation, a trust subject to the tax on unrelated business income under section 511 1s treated as aC corporation with respect to its unrelated trade or business activities.", "metadata": { "filetype": "application/pdf", "languages": [ @@ -1146,7 +1146,7 @@ { "type": "NarrativeText", "element_id": "454de5bfbdcba4385a21dd6261c57d53", - "text": "The limitation on the use of the cash method (except for tax shelters) does not apply to\u2014", + "text": "The limitation on the use of the cash method (except for tax shelters) does not apply to—", "metadata": { "filetype": "application/pdf", "languages": [ @@ -1168,7 +1168,7 @@ { "type": "NarrativeText", "element_id": "fc1f0d4d56acd27a18ba80ab0acfb9e9", - "text": "(1) Farming businesses.\u2014F or this purpose, the term \u201cfarming business\u201d 1s defined in section 263A(e)(4), but it also includes the raising, harvesting, or growing of trees to which section 263A(c)(5) applies. Notwithstanding this exception, section 447 requires certain C corporations and partnerships with a C corporation as a partner to use the accrual method.", + "text": "(1) Farming businesses.—F or this purpose, the term “farming business” 1s defined in section 263A(e)(4), but it also includes the raising, harvesting, or growing of trees to which section 263A(c)(5) applies. Notwithstanding this exception, section 447 requires certain C corporations and partnerships with a C corporation as a partner to use the accrual method.", "metadata": { "filetype": "application/pdf", "languages": [ @@ -1190,7 +1190,7 @@ { "type": "NarrativeText", "element_id": "51dcb59cd362d0003f609fdb43fbdfdc", - "text": "(2) Qualified personal service corporations. 
\u2014 A \u201cqualified personal service corporation\u201d is any corporation: (a) substantially all of the activities of which involve the performance of services in the fields of health, law, engineering, architecture, accounting, actuarial science, performing arts, or consulting, and (b)", + "text": "(2) Qualified personal service corporations. — A “qualified personal service corporation” is any corporation: (a) substantially all of the activities of which involve the performance of services in the fields of health, law, engineering, architecture, accounting, actuarial science, performing arts, or consulting, and (b)", "metadata": { "filetype": "application/pdf", "languages": [ @@ -1256,7 +1256,7 @@ { "type": "NarrativeText", "element_id": "5f5c402f9ebefef3ba8eabf1b5f628b2", - "text": "(3) Entities with gross receipts of $5,000,000 or less. \u2014To qualify for this exception, the C corporation's or partnership\u2019s annual average gross receipts for the three years ending with the prior tax year may not exceed $5,000,000. If the corporation or partnership was not in existence for the entire 3-year period, the period of existence is used to determine whether the corporation or partnership qualifies. If any tax year in the 3-year period is a short tax year, the corporation or partnership must annualize the gross receipts by multiplying the gross receipts by 12 and dividing the result by the number of months in the short period.", + "text": "(3) Entities with gross receipts of $5,000,000 or less. —To qualify for this exception, the C corporation's or partnership’s annual average gross receipts for the three years ending with the prior tax year may not exceed $5,000,000. If the corporation or partnership was not in existence for the entire 3-year period, the period of existence is used to determine whether the corporation or partnership qualifies. 
If any tax year in the 3-year period is a short tax year, the corporation or partnership must annualize the gross receipts by multiplying the gross receipts by 12 and dividing the result by the number of months in the short period.", "metadata": { "filetype": "application/pdf", "languages": [ @@ -1344,7 +1344,7 @@ { "type": "NarrativeText", "element_id": "70c06cbb13920b0a14d56b49c3e596eb", - "text": "Inventories of retail merchants.\u2014The retail method of pricing inventories does not contemplate valuation of goods at the retail selling price. The retail selling price of goods on hand must be reduced to approximate cost or cost or market, whichever Is lower, by the adjustments required in Regulations section 1.471-8.", + "text": "Inventories of retail merchants.—The retail method of pricing inventories does not contemplate valuation of goods at the retail selling price. The retail selling price of goods on hand must be reduced to approximate cost or cost or market, whichever Is lower, by the adjustments required in Regulations section 1.471-8.", "metadata": { "filetype": "application/pdf", "languages": [ @@ -1366,7 +1366,7 @@ { "type": "NarrativeText", "element_id": "73d59612ec830432b4de6df54516bd9c", - "text": "LIFO inventory changes.\u2014Attach a schedule with all the required computations when changing the method of figuring LIFO inventories. If you are changing from LIFO to a non-LIFO method, attach a schedule with the following additional information:", + "text": "LIFO inventory changes.—Attach a schedule with all the required computations when changing the method of figuring LIFO inventories. If you are changing from LIFO to a non-LIFO method, attach a schedule with the following additional information:", "metadata": { "filetype": "application/pdf", "languages": [ @@ -1498,7 +1498,7 @@ { "type": "NarrativeText", "element_id": "0ea0c5159902dffd24b032afc223d32a", - "text": "% U.S. Government Printing Office: 1987\u2014201-993/60166", + "text": "% U.S. 
Government Printing Office: 1987—201-993/60166", "metadata": { "filetype": "application/pdf", "languages": [ @@ -1542,7 +1542,7 @@ { "type": "NarrativeText", "element_id": "b4575fdaff52c4def8f166ed0e2c4b39", - "text": "Section 460(f) provides that the term \u201clong-term contract\u201d means any contract for the manufacturing, building, installation, or construction of property that is not completed within the tax year in which it 1s entered into. However, a manufacturing contract will not qualify as a long-term contract unless the contract involves the manufacture of: (1) a unique item not normally included in your finished goods inventory, or (2) any item that normally requires more than 12 calendar months to complete.", + "text": "Section 460(f) provides that the term “long-term contract” means any contract for the manufacturing, building, installation, or construction of property that is not completed within the tax year in which it 1s entered into. However, a manufacturing contract will not qualify as a long-term contract unless the contract involves the manufacture of: (1) a unique item not normally included in your finished goods inventory, or (2) any item that normally requires more than 12 calendar months to complete.", "metadata": { "filetype": "application/pdf", "languages": [ diff --git a/test_unstructured_ingest/expected-structured-output/azure/IRS-form-1987.png.json b/test_unstructured_ingest/expected-structured-output/azure/IRS-form-1987.png.json index 8709788128..d361b431ef 100644 --- a/test_unstructured_ingest/expected-structured-output/azure/IRS-form-1987.png.json +++ b/test_unstructured_ingest/expected-structured-output/azure/IRS-form-1987.png.json @@ -178,7 +178,7 @@ { "type": "NarrativeText", "element_id": "4af565181db0676202636585f9abb438", - "text": "Long-term contracts. 
\u2014If you are required to change your method of accounting for long-term contracts under section 460, see Notice 87-61 (9/21/87), 1987-38 IRB 40, for the notification procedures that must be followed.", + "text": "Long-term contracts. —If you are required to change your method of accounting for long-term contracts under section 460, see Notice 87-61 (9/21/87), 1987-38 IRB 40, for the notification procedures that must be followed.", "metadata": { "filetype": "image/png", "languages": [ @@ -200,7 +200,7 @@ { "type": "NarrativeText", "element_id": "8dc3e4d18b3936db176790654f8823e1", - "text": "Other methods.\u2014Unless the Service has published a regulation or procedure to the contrary, all other changes 1n accounting methods required by the Act are automatically considered to be approved by the Commissioner. Examples of method changes automatically approved by the Commissioner are those changes required to effect: (1) the repeal of the reserve method for bad debts of taxpayers other than financial institutions (Act section 805); (2) the repeal of the installment method for sales under a revolving credit plan (Act section 812); (3) the inclusion of income attributable to the sale or furnishing of utility services no later than the year in which the services were provided to customers (Act section 821); and (4) the repeal of the deduction for qualified discount coupons (Act section 823). Do not file Form 3115 for these changes.", + "text": "Other methods.—Unless the Service has published a regulation or procedure to the contrary, all other changes 1n accounting methods required by the Act are automatically considered to be approved by the Commissioner. 
Examples of method changes automatically approved by the Commissioner are those changes required to effect: (1) the repeal of the reserve method for bad debts of taxpayers other than financial institutions (Act section 805); (2) the repeal of the installment method for sales under a revolving credit plan (Act section 812); (3) the inclusion of income attributable to the sale or furnishing of utility services no later than the year in which the services were provided to customers (Act section 821); and (4) the repeal of the deduction for qualified discount coupons (Act section 823). Do not file Form 3115 for these changes.", "metadata": { "filetype": "image/png", "languages": [ @@ -310,7 +310,7 @@ { "type": "NarrativeText", "element_id": "5b2139cd0640cd4eceddbce416a17f6f", - "text": "Uniform capitalization rules and limitation on cash method.\u2014If you are required to change your method of accounting under sectior,263A (relating to the capitalization and inclusion in inventory costs of certain expenses) or 448 (limiting the use of the cash method of accounting by certain taxpayers) as added by the Tax Reform Act of 1986 (\u201cAct\u201d), the change is treated as initiated by the taxpayer, approved by the Commissioner, and the period for taking the adjustments under section 481(a) into account will not exceed 4 years. (Hospitals required to change from the cash method under section 448 have 10 years to take the adjustments into account.) 
Complete Section A and the appropriate sections (B-1 or C and D) for which the change is required.", + "text": "Uniform capitalization rules and limitation on cash method.—If you are required to change your method of accounting under sectior,263A (relating to the capitalization and inclusion in inventory costs of certain expenses) or 448 (limiting the use of the cash method of accounting by certain taxpayers) as added by the Tax Reform Act of 1986 (“Act”), the change is treated as initiated by the taxpayer, approved by the Commissioner, and the period for taking the adjustments under section 481(a) into account will not exceed 4 years. (Hospitals required to change from the cash method under section 448 have 10 years to take the adjustments into account.) Complete Section A and the appropriate sections (B-1 or C and D) for which the change is required.", "metadata": { "filetype": "image/png", "languages": [ @@ -332,7 +332,7 @@ { "type": "NarrativeText", "element_id": "525b9d3bf3ae575f8e86f62af6068ebd", - "text": "Disregard the instructions under Time and Place for Filing and Late Applications. Instead, attach Form 3115 to your income tax return for the year of change; do not file it separately. Also include on a separate statement accompanying the Form 3115 the period over which the section 481(a) adjustment will be taken into account and the basis for that conclusion. Identify the automatic change being made at the top of page 1 of Form 3115 (e.g., \u201cAutomatic Change to Accrual Method Section 448\"). See Temporary Regulations sections 1.263A-1T and 1.448-1T for additional information.", + "text": "Disregard the instructions under Time and Place for Filing and Late Applications. Instead, attach Form 3115 to your income tax return for the year of change; do not file it separately. Also include on a separate statement accompanying the Form 3115 the period over which the section 481(a) adjustment will be taken into account and the basis for that conclusion. 
Identify the automatic change being made at the top of page 1 of Form 3115 (e.g., “Automatic Change to Accrual Method Section 448\"). See Temporary Regulations sections 1.263A-1T and 1.448-1T for additional information.", "metadata": { "filetype": "image/png", "languages": [ @@ -508,7 +508,7 @@ { "type": "NarrativeText", "element_id": "53204b2c819131895da7dba7fe978047", - "text": "If your application is filed after the 180-day period, it is late. The application will be considered for processing only upon a showing of \u201cgood cause\" and if it can be shown to the satisfaction of the Commissioner that granting you an extension will not jeopardize the Government's interests. For further information, see Rev. Proc. 79-63.", + "text": "If your application is filed after the 180-day period, it is late. The application will be considered for processing only upon a showing of “good cause\" and if it can be shown to the satisfaction of the Commissioner that granting you an extension will not jeopardize the Government's interests. For further information, see Rev. Proc. 79-63.", "metadata": { "filetype": "image/png", "languages": [ @@ -552,7 +552,7 @@ { "type": "NarrativeText", "element_id": "a41365af6ab3185637e8f3891b27fcba", - "text": "Individuals.\u2014An individual should enter his or her social security number in this block. If the application is made on behalf of a husband and wife who file their income tax return jointly, enter the social security numbers of both.", + "text": "Individuals.—An individual should enter his or her social security number in this block. 
If the application is made on behalf of a husband and wife who file their income tax return jointly, enter the social security numbers of both.", "metadata": { "filetype": "image/png", "languages": [ @@ -574,7 +574,7 @@ { "type": "NarrativeText", "element_id": "803549fa9207cd4111ed9e5d7389a027", - "text": "Others.-\u2014The employer identification number of an applicant other than an individual should be entered in this block.", + "text": "Others.-—The employer identification number of an applicant other than an individual should be entered in this block.", "metadata": { "filetype": "image/png", "languages": [ @@ -618,7 +618,7 @@ { "type": "NarrativeText", "element_id": "f49752a38f790a75872b43214d7b8e0c", - "text": "Individuals. \u2014An individual desiring the change should sign the application. If the application pertains to a husband and wife filing a joint income tax return, the names of both should appear in the heading and both should sign.", + "text": "Individuals. —An individual desiring the change should sign the application. 
If the application pertains to a husband and wife filing a joint income tax return, the names of both should appear in the heading and both should sign.", "metadata": { "filetype": "image/png", "languages": [ @@ -640,7 +640,7 @@ { "type": "NarrativeText", "element_id": "162bb7ebc5019059dc8341f5c44da7ec", - "text": "Partnerships.\u2014The form should be signed with the partnership name followed by the signature of one of the general partners and the words \u201cGeneral Partner.\u201d", + "text": "Partnerships.—The form should be signed with the partnership name followed by the signature of one of the general partners and the words “General Partner.”", "metadata": { "filetype": "image/png", "languages": [ @@ -662,7 +662,7 @@ { "type": "NarrativeText", "element_id": "ba5311e456328d16efd5d2f5a8500388", - "text": "Corporations, cooperatives, and insurance companies.\u2014The form should show the name of the corporation, cooperative, or insurance company and the signature of the president, vice president, treasurer, assistant treasurer, or chief accounting officer (such as tax officer) authorized to sign, and his or her official title. Receivers, trustees, or assignees must sign any application they are required to file. For a subsidiary corporation filing a consolidated return with its parent, the form should be signed by an officer of the parent corporation.", + "text": "Corporations, cooperatives, and insurance companies.—The form should show the name of the corporation, cooperative, or insurance company and the signature of the president, vice president, treasurer, assistant treasurer, or chief accounting officer (such as tax officer) authorized to sign, and his or her official title. Receivers, trustees, or assignees must sign any application they are required to file. 
For a subsidiary corporation filing a consolidated return with its parent, the form should be signed by an officer of the parent corporation.", "metadata": { "filetype": "image/png", "languages": [ @@ -684,7 +684,7 @@ { "type": "NarrativeText", "element_id": "6fe312aeeb0d718a776c177b27265353", - "text": "Fiduciaries.\u2014The-form should show the name of the estate or trust and be signed by the fiduciary, personal representative, executor, executrix, administrator, administratrix, etc., having legal authority to sign, and his or her title.", + "text": "Fiduciaries.—The-form should show the name of the estate or trust and be signed by the fiduciary, personal representative, executor, executrix, administrator, administratrix, etc., having legal authority to sign, and his or her title.", "metadata": { "filetype": "image/png", "languages": [ @@ -706,7 +706,7 @@ { "type": "NarrativeText", "element_id": "152f56dcf3866eaa539ba72ac8d75fb9", - "text": "Preparer other than partner, officer, etc.\u2014The signature of the individual preparing the application should appear in the space provided on page 6.", + "text": "Preparer other than partner, officer, etc.—The signature of the individual preparing the application should appear in the space provided on page 6.", "metadata": { "filetype": "image/png", "languages": [ @@ -838,7 +838,7 @@ { "type": "NarrativeText", "element_id": "ce36a381c0fb31df90d3d701b9b5ee2a", - "text": "Item 5a, page 1.\u2014\u201cTaxable income or (loss) from operations\u201d is to be entered before application of any net operating loss deduction under section 172(a).", + "text": "Item 5a, page 1.—“Taxable income or (loss) from operations” is to be entered before application of any net operating loss deduction under section 172(a).", "metadata": { "filetype": "image/png", "languages": [ @@ -860,7 +860,7 @@ { "type": "NarrativeText", "element_id": "f7876eba5d8a77571828d215aab6bf34", - "text": "Item 6, page 2.\u2014The term \u201cgross receipts\u201d 
includes total sales (net of returns and allowances) and all amounts received for services. In addition, gross receipts include any income from investments and from incidental or outside sources (e.g., interest, dividends, rents, royalties, and annuities). However, if you area resaler of personal property, exclude from gross receipts any amounts not derived in the ordinary course of a trade or business. Gross receipts do not include amounts received for sales taxes if, under the applicable state or local law, the tax is legatly imposed on the purchaser of the good or service, and the taxpayer merely collects and remits the tax to the taxing authority.", + "text": "Item 6, page 2.—The term “gross receipts” includes total sales (net of returns and allowances) and all amounts received for services. In addition, gross receipts include any income from investments and from incidental or outside sources (e.g., interest, dividends, rents, royalties, and annuities). However, if you area resaler of personal property, exclude from gross receipts any amounts not derived in the ordinary course of a trade or business. 
Gross receipts do not include amounts received for sales taxes if, under the applicable state or local law, the tax is legatly imposed on the purchaser of the good or service, and the taxpayer merely collects and remits the tax to the taxing authority.", "metadata": { "filetype": "image/png", "languages": [ @@ -882,7 +882,7 @@ { "type": "NarrativeText", "element_id": "baf5040c1ebd03c23f1210ec383970db", - "text": "Item 7b, page 2.\u2014If item 7b 1s \u201cYes,\u201d indicate ona separate sheet the following for each separate trade or business: Nature of business", + "text": "Item 7b, page 2.—If item 7b 1s “Yes,” indicate ona separate sheet the following for each separate trade or business: Nature of business", "metadata": { "filetype": "image/png", "languages": [ diff --git a/test_unstructured_ingest/expected-structured-output/azure/spring-weather.html.json b/test_unstructured_ingest/expected-structured-output/azure/spring-weather.html.json index 494e24e546..b891a7af79 100644 --- a/test_unstructured_ingest/expected-structured-output/azure/spring-weather.html.json +++ b/test_unstructured_ingest/expected-structured-output/azure/spring-weather.html.json @@ -779,7 +779,7 @@ { "type": "NarrativeText", "element_id": "c86708b570205221afc715f7f6a4ca3f", - "text": "

    News Around NOAA

    National Program

    Are You Weather-Ready for the Spring?

    Weather.gov > News Around NOAA > Are You Weather-Ready for the Spring?
    ", + "text": "

    News Around NOAA

    National Program

    Are You Weather-Ready for the Spring?

    Weather.gov > News Around NOAA > Are You Weather-Ready for the Spring?
    ", "metadata": { "languages": [ "eng" diff --git a/test_unstructured_ingest/expected-structured-output/local-single-file-basic-chunking/handbook-1p.docx.json b/test_unstructured_ingest/expected-structured-output/local-single-file-basic-chunking/handbook-1p.docx.json index a3e498de8c..cc6ecebd11 100644 --- a/test_unstructured_ingest/expected-structured-output/local-single-file-basic-chunking/handbook-1p.docx.json +++ b/test_unstructured_ingest/expected-structured-output/local-single-file-basic-chunking/handbook-1p.docx.json @@ -2,7 +2,7 @@ { "type": "CompositeElement", "element_id": "85002882dd396da0b1b82c925b002be5", - "text": "US Trustee Handbook\n\nCHAPTER 1\n\nINTRODUCTION\n\nCHAPTER 1 \u2013 INTRODUCTION\n\nA. PURPOSE", + "text": "US Trustee Handbook\n\nCHAPTER 1\n\nINTRODUCTION\n\nCHAPTER 1 – INTRODUCTION\n\nA. PURPOSE", "metadata": { "data_source": { "record_locator": { @@ -56,7 +56,7 @@ { "type": "CompositeElement", "element_id": "1abe685eb8dfed0f2266d6cf793d7e6b", - "text": "le 11 of the United States Code. 28 U.S.C. \u00a7 586(b). The Handbook, issued as part of our duties under 28 U.S.C. \u00a7 586, establishes or clarifies the", + "text": "le 11 of the United States Code. 28 U.S.C. § 586(b). The Handbook, issued as part of our duties under 28 U.S.C. § 586, establishes or clarifies the", "metadata": { "data_source": { "record_locator": { @@ -152,7 +152,7 @@ { "type": "CompositeElement", "element_id": "b7d1b42646393ca0f41af0e8ec48f9a9", - "text": "relevant provisions of the Bankruptcy Code, Federal Rules of Bankruptcy Procedure (Rules), any local bankruptcy rules, and case law. 11 U.S.C. \u00a7 321,", + "text": "relevant provisions of the Bankruptcy Code, Federal Rules of Bankruptcy Procedure (Rules), any local bankruptcy rules, and case law. 11 U.S.C. § 321,", "metadata": { "data_source": { "record_locator": { @@ -176,7 +176,7 @@ { "type": "CompositeElement", "element_id": "9ee33f4141eca1f98ca4299d0fdfba31", - "text": "w. 11 U.S.C. 
\u00a7 321, 28 U.S.C. \u00a7 586, 28 C.F.R. \u00a7 58.6(a)(3). Standing trustees are encouraged to follow Practice Tips identified in this Handbook but", + "text": "w. 11 U.S.C. § 321, 28 U.S.C. § 586, 28 C.F.R. § 58.6(a)(3). Standing trustees are encouraged to follow Practice Tips identified in this Handbook but", "metadata": { "data_source": { "record_locator": { @@ -319,7 +319,7 @@ { "type": "CompositeElement", "element_id": "f4412be8c7b2624c729af85c85b3a0e4", - "text": "es in this Handbook refer to the Bankruptcy Code, 11 U.S.C. \u00a7 101 et seq., unless otherwise indicated.", + "text": "es in this Handbook refer to the Bankruptcy Code, 11 U.S.C. § 101 et seq., unless otherwise indicated.", "metadata": { "data_source": { "record_locator": { @@ -531,7 +531,7 @@ { "type": "CompositeElement", "element_id": "24e1076110b431b248b43b1fdaae5282", - "text": "apter 13 trustees./ This Handbook is issued under the authority of the Program\u2019s enabling statutes.", + "text": "apter 13 trustees./ This Handbook is issued under the authority of the Program’s enabling statutes.", "metadata": { "data_source": { "record_locator": { @@ -625,7 +625,7 @@ { "type": "CompositeElement", "element_id": "db297530e558410b89acd93c6b452b84", - "text": "perform the duties and responsibilities of a standing trustee, the standing trustee must immediately advise the United States Trustee. 28 U.S.C. \u00a7", + "text": "perform the duties and responsibilities of a standing trustee, the standing trustee must immediately advise the United States Trustee. 28 U.S.C. §", "metadata": { "data_source": { "record_locator": { @@ -649,7 +649,7 @@ { "type": "CompositeElement", "element_id": "201bfacc211f0eb640e2830b8c29ae41", - "text": "rustee. 28 U.S.C. \u00a7 586(b), 28 C.F.R. \u00a7 58.4(b) referencing 28 C.F.R. \u00a7 58.3(b).", + "text": "rustee. 28 U.S.C. § 586(b), 28 C.F.R. § 58.4(b) referencing 28 C.F.R. 
§ 58.3(b).", "metadata": { "data_source": { "record_locator": { @@ -673,7 +673,7 @@ { "type": "CompositeElement", "element_id": "eff9d6f3a0cdb968b7715e2e417e12ea", - "text": "Although this Handbook is not intended to be a complete statutory reference, the standing trustee\u2019s primary statutory duties are set forth in 11", + "text": "Although this Handbook is not intended to be a complete statutory reference, the standing trustee’s primary statutory duties are set forth in 11", "metadata": { "data_source": { "record_locator": { @@ -696,7 +696,7 @@ { "type": "CompositeElement", "element_id": "fd4c45036e8f17c27271f75944389724", - "text": "are set forth in 11 U.S.C. \u00a7 1302, which incorporates by reference some of the duties of chapter 7 trustees found in 11 U.S.C. \u00a7 704. These duties", + "text": "are set forth in 11 U.S.C. § 1302, which incorporates by reference some of the duties of chapter 7 trustees found in 11 U.S.C. § 704. These duties", "metadata": { "data_source": { "record_locator": { @@ -720,7 +720,7 @@ { "type": "CompositeElement", "element_id": "a968d741409111b777fc123ef01f5407", - "text": "\u00a7 704. These duties include, but are not limited to, the following:", + "text": "§ 704. 
These duties include, but are not limited to, the following:", "metadata": { "data_source": { "record_locator": { diff --git a/test_unstructured_ingest/expected-structured-output/local-single-file-chunk-no-orig-elements/multi-column-2p.pdf.json b/test_unstructured_ingest/expected-structured-output/local-single-file-chunk-no-orig-elements/multi-column-2p.pdf.json index b6516f791c..829b9b7a7e 100644 --- a/test_unstructured_ingest/expected-structured-output/local-single-file-chunk-no-orig-elements/multi-column-2p.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/local-single-file-chunk-no-orig-elements/multi-column-2p.pdf.json @@ -2,7 +2,7 @@ { "type": "CompositeElement", "element_id": "06c85506db46c8d0e4f014e75146bcfc", - "text": "0 2 0 2\n\np e S 0 3\n\n] L C . s c [\n\n3 v 6 0 9 4 0 . 4 0 0 2 : v i X r a\n\nDense Passage Retrieval for Open-Domain Question Answering\n\nVladimir Karpukhin\u2217, Barlas O\u02d8guz\u2217, Sewon Min\u2020, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen\u2021, Wen-tau Yih\n\nFacebook AI\n\n\u2020University of Washington\n\n\u2021Princeton University\n\n{vladk, barlaso, plewis, ledell, edunov, scottyih}@fb.com sewon@cs.washington.edu danqic@cs.princeton.edu\n\nAbstract\n\nOpen-domain question answering relies on ef- \ufb01cient passage retrieval to select candidate contexts, where traditional sparse vector space models, such as TF-IDF or BM25, are the de facto method. In this work, we show that retrieval can be practically implemented us- ing dense representations alone, where em- beddings are learned from a small number of questions and passages by a simple dual- encoder framework. 
When evaluated on a wide range of open-domain QA datasets, our dense retriever outperforms a strong Lucene- BM25 system greatly by 9%-19% absolute in terms of top-20 passage retrieval accuracy, and helps our end-to-end QA system establish new state-of-the-art on multiple open-domain QA benchmarks.1\n\n1", + "text": "0 2 0 2\n\np e S 0 3\n\n] L C . s c [\n\n3 v 6 0 9 4 0 . 4 0 0 2 : v i X r a\n\nDense Passage Retrieval for Open-Domain Question Answering\n\nVladimir Karpukhin∗, Barlas O˘guz∗, Sewon Min†, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen‡, Wen-tau Yih\n\nFacebook AI\n\n†University of Washington\n\n‡Princeton University\n\n{vladk, barlaso, plewis, ledell, edunov, scottyih}@fb.com sewon@cs.washington.edu danqic@cs.princeton.edu\n\nAbstract\n\nOpen-domain question answering relies on ef- ficient passage retrieval to select candidate contexts, where traditional sparse vector space models, such as TF-IDF or BM25, are the de facto method. In this work, we show that retrieval can be practically implemented us- ing dense representations alone, where em- beddings are learned from a small number of questions and passages by a simple dual- encoder framework. When evaluated on a wide range of open-domain QA datasets, our dense retriever outperforms a strong Lucene- BM25 system greatly by 9%-19% absolute in terms of top-20 passage retrieval accuracy, and helps our end-to-end QA system establish new state-of-the-art on multiple open-domain QA benchmarks.1\n\n1", "metadata": { "data_source": { "record_locator": { @@ -24,7 +24,7 @@ { "type": "CompositeElement", "element_id": "3ef998ac1d905d8ff1016f96a243295c", - "text": "Introduction\n\nOpen-domain question answering (QA) (Voorhees, 1999) is a task that answers factoid questions us- ing a large collection of documents. While early QA systems are often complicated and consist of multiple components (Ferrucci (2012); Moldovan et al. 
(2003), inter alia), the advances of reading comprehension models suggest a much simpli\ufb01ed two-stage framework: (1) a context retriever \ufb01rst selects a small subset of passages where some of them contain the answer to the question, and then (2) a machine reader can thoroughly exam- ine the retrieved contexts and identify the correct answer (Chen et al., 2017). Although reducing open-domain QA to machine reading is a very rea- sonable strategy, a huge performance degradation is often observed in practice2, indicating the needs of improving retrieval.\n\n\u2217Equal contribution 1The code and trained models have been released at\n\nhttps://github.com/facebookresearch/DPR.", + "text": "Introduction\n\nOpen-domain question answering (QA) (Voorhees, 1999) is a task that answers factoid questions us- ing a large collection of documents. While early QA systems are often complicated and consist of multiple components (Ferrucci (2012); Moldovan et al. (2003), inter alia), the advances of reading comprehension models suggest a much simplified two-stage framework: (1) a context retriever first selects a small subset of passages where some of them contain the answer to the question, and then (2) a machine reader can thoroughly exam- ine the retrieved contexts and identify the correct answer (Chen et al., 2017). 
Although reducing open-domain QA to machine reading is a very rea- sonable strategy, a huge performance degradation is often observed in practice2, indicating the needs of improving retrieval.\n\n∗Equal contribution 1The code and trained models have been released at\n\nhttps://github.com/facebookresearch/DPR.", "metadata": { "data_source": { "record_locator": { @@ -46,7 +46,7 @@ { "type": "CompositeElement", "element_id": "71b12f58c99f6097b17f4d5b6147201b", - "text": "2For instance, the exact match score on SQuAD v1.1 drops\n\nRetrieval in open-domain QA is usually imple- mented using TF-IDF or BM25 (Robertson and Zaragoza, 2009), which matches keywords ef\ufb01- ciently with an inverted index and can be seen as representing the question and context in high- dimensional, sparse vectors (with weighting). Con- versely, the dense, latent semantic encoding is com- plementary to sparse representations by design. For example, synonyms or paraphrases that consist of completely different tokens may still be mapped to vectors close to each other. Consider the question \u201cWho is the bad guy in lord of the rings?\u201d, which can be answered from the context \u201cSala Baker is best known for portraying the villain Sauron in the Lord of the Rings trilogy.\u201d A term-based system would have dif\ufb01culty retrieving such a context, while a dense retrieval system would be able to better match \u201cbad guy\u201d with \u201cvillain\u201d and fetch the cor- rect context. Dense encodings are also learnable by adjusting the embedding functions, which pro- vides additional \ufb02exibility to have a task-speci\ufb01c representation. With special in-memory data struc- tures and indexing schemes, retrieval can be done ef\ufb01ciently using maximum inner product search (MIPS) algorithms (e.g., Shrivastava and Li (2014); Guo et al. 
(2016)).\n\nHowever, it is generally believed that learn- ing a good dense vector representation needs a large number of labeled pairs of question and con- texts. Dense retrieval methods have thus never be shown to outperform TF-IDF/BM25 for open- domain QA before ORQA (Lee et al., 2019), which proposes a sophisticated inverse cloze task (ICT) objective, predicting the blocks that contain the masked sentence, for additional pretraining. The question encoder and the reader model are then \ufb01ne- tuned using pairs of questions and answers jointly. Although ORQA successfully demonstrates that dense retrieval can outperform BM25, setting new state-of-the-art results on multiple open-domain", + "text": "2For instance, the exact match score on SQuAD v1.1 drops\n\nRetrieval in open-domain QA is usually imple- mented using TF-IDF or BM25 (Robertson and Zaragoza, 2009), which matches keywords effi- ciently with an inverted index and can be seen as representing the question and context in high- dimensional, sparse vectors (with weighting). Con- versely, the dense, latent semantic encoding is com- plementary to sparse representations by design. For example, synonyms or paraphrases that consist of completely different tokens may still be mapped to vectors close to each other. Consider the question “Who is the bad guy in lord of the rings?”, which can be answered from the context “Sala Baker is best known for portraying the villain Sauron in the Lord of the Rings trilogy.” A term-based system would have difficulty retrieving such a context, while a dense retrieval system would be able to better match “bad guy” with “villain” and fetch the cor- rect context. Dense encodings are also learnable by adjusting the embedding functions, which pro- vides additional flexibility to have a task-specific representation. 
With special in-memory data struc- tures and indexing schemes, retrieval can be done efficiently using maximum inner product search (MIPS) algorithms (e.g., Shrivastava and Li (2014); Guo et al. (2016)).\n\nHowever, it is generally believed that learn- ing a good dense vector representation needs a large number of labeled pairs of question and con- texts. Dense retrieval methods have thus never be shown to outperform TF-IDF/BM25 for open- domain QA before ORQA (Lee et al., 2019), which proposes a sophisticated inverse cloze task (ICT) objective, predicting the blocks that contain the masked sentence, for additional pretraining. The question encoder and the reader model are then fine- tuned using pairs of questions and answers jointly. Although ORQA successfully demonstrates that dense retrieval can outperform BM25, setting new state-of-the-art results on multiple open-domain", "metadata": { "data_source": { "record_locator": { @@ -68,7 +68,7 @@ { "type": "CompositeElement", "element_id": "ef458b0b4659bfd57b11fbfb571c38d1", - "text": "from above 80% to less than 40% (Yang et al., 2019a).\n\nQA datasets, it also suffers from two weaknesses. First, ICT pretraining is computationally intensive and it is not completely clear that regular sentences are good surrogates of questions in the objective function. Second, because the context encoder is not \ufb01ne-tuned using pairs of questions and answers, the corresponding representations could be subop- timal.", + "text": "from above 80% to less than 40% (Yang et al., 2019a).\n\nQA datasets, it also suffers from two weaknesses. First, ICT pretraining is computationally intensive and it is not completely clear that regular sentences are good surrogates of questions in the objective function. 
Second, because the context encoder is not fine-tuned using pairs of questions and answers, the corresponding representations could be subop- timal.", "metadata": { "data_source": { "record_locator": { @@ -90,7 +90,7 @@ { "type": "CompositeElement", "element_id": "4204154eefaa843f79edc96dcc208054", - "text": "In this paper, we address the question: can we train a better dense embedding model using only pairs of questions and passages (or answers), with- out additional pretraining? By leveraging the now standard BERT pretrained model (Devlin et al., 2019) and a dual-encoder architecture (Bromley et al., 1994), we focus on developing the right training scheme using a relatively small number of question and passage pairs. Through a series of careful ablation studies, our \ufb01nal solution is surprisingly simple: the embedding is optimized for maximizing inner products of the question and relevant passage vectors, with an objective compar- ing all pairs of questions and passages in a batch. Our Dense Passage Retriever (DPR) is exception- ally strong. It not only outperforms BM25 by a large margin (65.2% vs. 42.9% in Top-5 accuracy), but also results in a substantial improvement on the end-to-end QA accuracy compared to ORQA (41.5% vs. 33.3%) in the open Natural Questions setting (Lee et al., 2019; Kwiatkowski et al., 2019). Our contributions are twofold. First, we demon- strate that with the proper training setup, sim- ply \ufb01ne-tuning the question and passage encoders on existing question-passage pairs is suf\ufb01cient to greatly outperform BM25. Our empirical results also suggest that additional pretraining may not be needed. Second, we verify that, in the context of open-domain question answering, a higher retrieval precision indeed translates to a higher end-to-end QA accuracy. 
By applying a modern reader model to the top retrieved passages, we achieve compara- ble or better results on multiple QA datasets in the open-retrieval setting, compared to several, much complicated systems.", + "text": "In this paper, we address the question: can we train a better dense embedding model using only pairs of questions and passages (or answers), with- out additional pretraining? By leveraging the now standard BERT pretrained model (Devlin et al., 2019) and a dual-encoder architecture (Bromley et al., 1994), we focus on developing the right training scheme using a relatively small number of question and passage pairs. Through a series of careful ablation studies, our final solution is surprisingly simple: the embedding is optimized for maximizing inner products of the question and relevant passage vectors, with an objective compar- ing all pairs of questions and passages in a batch. Our Dense Passage Retriever (DPR) is exception- ally strong. It not only outperforms BM25 by a large margin (65.2% vs. 42.9% in Top-5 accuracy), but also results in a substantial improvement on the end-to-end QA accuracy compared to ORQA (41.5% vs. 33.3%) in the open Natural Questions setting (Lee et al., 2019; Kwiatkowski et al., 2019). Our contributions are twofold. First, we demon- strate that with the proper training setup, sim- ply fine-tuning the question and passage encoders on existing question-passage pairs is sufficient to greatly outperform BM25. Our empirical results also suggest that additional pretraining may not be needed. Second, we verify that, in the context of open-domain question answering, a higher retrieval precision indeed translates to a higher end-to-end QA accuracy. 
By applying a modern reader model to the top retrieved passages, we achieve compara- ble or better results on multiple QA datasets in the open-retrieval setting, compared to several, much complicated systems.", "metadata": { "data_source": { "record_locator": { @@ -112,7 +112,7 @@ { "type": "CompositeElement", "element_id": "e6dee1abec28f8ff365ab6275b3e5f0e", - "text": "2 Background\n\nThe problem of open-domain QA studied in this paper can be described as follows. Given a factoid question, such as \u201cWho \ufb01rst voiced Meg on Family Guy?\u201d or \u201cWhere was the 8th Dalai Lama born?\u201d, a system is required to answer it using a large corpus of diversi\ufb01ed topics. More speci\ufb01cally, we assume\n\nthe extractive QA setting, in which the answer is restricted to a span appearing in one or more pas- sages in the corpus. Assume that our collection contains D documents, d1,d2,\u00b7\u00b7\u00b7 ,dD. We \ufb01rst split each of the documents into text passages of equal lengths as the basic retrieval units3 and get M total passages in our corpus C = {p1,p2,...,pM}, where each passage pi can be viewed as a sequence 2 ,\u00b7\u00b7\u00b7 ,w(i) 1 ,w(i) of tokens w(i) |pi|. Given a question q, the task is to \ufb01nd a span w(i) s+1,\u00b7\u00b7\u00b7 ,w(i) s ,w(i) from one of the passages pi that can answer the question. Notice that to cover a wide variety of domains, the corpus size can easily range from millions of docu- ments (e.g., Wikipedia) to billions (e.g., the Web). As a result, any open-domain QA system needs to include an ef\ufb01cient retriever component that can se- lect a small set of relevant texts, before applying the reader to extract the answer (Chen et al., 2017).4 Formally speaking, a retriever R : (q,C) \u2192 CF is a function that takes as input a question q and a corpus C and returns a much smaller \ufb01lter set of texts CF \u2282 C, where |CF| = k (cid:28) |C|. 
For a \ufb01xed k, a retriever can be evaluated in isolation on top-k retrieval accuracy, which is the fraction of ques- tions for which CF contains a span that answers the question.\n\ne", + "text": "2 Background\n\nThe problem of open-domain QA studied in this paper can be described as follows. Given a factoid question, such as “Who first voiced Meg on Family Guy?” or “Where was the 8th Dalai Lama born?”, a system is required to answer it using a large corpus of diversified topics. More specifically, we assume\n\nthe extractive QA setting, in which the answer is restricted to a span appearing in one or more pas- sages in the corpus. Assume that our collection contains D documents, d1,d2,··· ,dD. We first split each of the documents into text passages of equal lengths as the basic retrieval units3 and get M total passages in our corpus C = {p1,p2,...,pM}, where each passage pi can be viewed as a sequence 2 ,··· ,w(i) 1 ,w(i) of tokens w(i) |pi|. Given a question q, the task is to find a span w(i) s+1,··· ,w(i) s ,w(i) from one of the passages pi that can answer the question. Notice that to cover a wide variety of domains, the corpus size can easily range from millions of docu- ments (e.g., Wikipedia) to billions (e.g., the Web). As a result, any open-domain QA system needs to include an efficient retriever component that can se- lect a small set of relevant texts, before applying the reader to extract the answer (Chen et al., 2017).4 Formally speaking, a retriever R : (q,C) → CF is a function that takes as input a question q and a corpus C and returns a much smaller filter set of texts CF ⊂ C, where |CF| = k (cid:28) |C|. 
For a fixed k, a retriever can be evaluated in isolation on top-k retrieval accuracy, which is the fraction of ques- tions for which CF contains a span that answers the question.\n\ne", "metadata": { "data_source": { "record_locator": { @@ -134,7 +134,7 @@ { "type": "CompositeElement", "element_id": "ac6733a570cbdd5c8d48f8252b345b17", - "text": "3 Dense Passage Retriever (DPR)\n\nWe focus our research in this work on improv- ing the retrieval component in open-domain QA. Given a collection of M text passages, the goal of our dense passage retriever (DPR) is to index all the passages in a low-dimensional and continuous space, such that it can retrieve ef\ufb01ciently the top k passages relevant to the input question for the reader at run-time. Note that M can be very large (e.g., 21 million passages in our experiments, de- scribed in Section 4.1) and k is usually small, such as 20\u2013100.\n\n3.1 Overview\n\nOur dense passage retriever (DPR) uses a dense encoder EP(\u00b7) which maps any text passage to a d- dimensional real-valued vectors and builds an index for all the M passages that we will use for retrieval.\n\n3The ideal size and boundary of a text passage are func- tions of both the retriever and reader. We also experimented with natural paragraphs in our preliminary trials and found that using \ufb01xed-length passages performs better in both retrieval and \ufb01nal QA accuracy, as observed by Wang et al. (2019).\n\n4Exceptions include (Seo et al., 2019) and (Roberts et al., 2020), which retrieves and generates the answers, respectively.", + "text": "3 Dense Passage Retriever (DPR)\n\nWe focus our research in this work on improv- ing the retrieval component in open-domain QA. Given a collection of M text passages, the goal of our dense passage retriever (DPR) is to index all the passages in a low-dimensional and continuous space, such that it can retrieve efficiently the top k passages relevant to the input question for the reader at run-time. 
Note that M can be very large (e.g., 21 million passages in our experiments, de- scribed in Section 4.1) and k is usually small, such as 20–100.\n\n3.1 Overview\n\nOur dense passage retriever (DPR) uses a dense encoder EP(·) which maps any text passage to a d- dimensional real-valued vectors and builds an index for all the M passages that we will use for retrieval.\n\n3The ideal size and boundary of a text passage are func- tions of both the retriever and reader. We also experimented with natural paragraphs in our preliminary trials and found that using fixed-length passages performs better in both retrieval and final QA accuracy, as observed by Wang et al. (2019).\n\n4Exceptions include (Seo et al., 2019) and (Roberts et al., 2020), which retrieves and generates the answers, respectively.", "metadata": { "data_source": { "record_locator": { diff --git a/test_unstructured_ingest/expected-structured-output/local-single-file-with-encoding/fake-html-cp1252.html.json b/test_unstructured_ingest/expected-structured-output/local-single-file-with-encoding/fake-html-cp1252.html.json index 93a7b96213..27105cb789 100644 --- a/test_unstructured_ingest/expected-structured-output/local-single-file-with-encoding/fake-html-cp1252.html.json +++ b/test_unstructured_ingest/expected-structured-output/local-single-file-with-encoding/fake-html-cp1252.html.json @@ -74,7 +74,7 @@ { "type": "NarrativeText", "element_id": "c96f2c02e05225ffa09b7b93c303c323", - "text": " 
\u00a1\t\u00a2\t\u00a3\t\u00a4\t\u00a5\t\u00a6\t\u00a7\t\u00a8\t\u00a9\t\u00aa\t\u00ab\t\u00ac\tSHY\t\u00ae\t\u00af\n\u00b0\t\u00b1\t\u00b2\t\u00b3\t\u00b4\t\u00b5\t\u00b6\t\u00b7\t\u00b8\t\u00b9\t\u00ba\t\u00bb\t\u00bc\t\u00bd\t\u00be\t\u00bf\n\u00c0\t\u00c1\t\u00c2\t\u00c3\t\u00c4\t\u00c5\t\u00c6\t\u00c7\t\u00c8\t\u00c9\t\u00ca\t\u00cb\t\u00cc\t\u00cd\t\u00ce\t\u00cf\n\u00d0\t\u00d1\t\u00d2\t\u00d3\t\u00d4\t\u00d5\t\u00d6\t\u00d7\t\u00d8\t\u00d9\t\u00da\t\u00db\t\u00dc\t\u00dd\t\u00de\t\u00df\n\u00e0\t\u00e1\t\u00e2\t\u00e3\t\u00e4\t\u00e5\t\u00e6\t\u00e7\t\u00e8\t\u00e9\t\u00ea\t\u00eb\t\u00ec\t\u00ed\t\u00ee\t\u00ef\n\u00f0\t\u00f1\t\u00f2\t\u00f3\t\u00f4\t\u00f5\t\u00f6\t\u00f7\t\u00f8\t\u00f9\t\u00fa\t\u00fb\t\u00fc\t\u00fd\t\u00fe\t\u00ff", + "text": " ¡\t¢\t£\t¤\t¥\t¦\t§\t¨\t©\tª\t«\t¬\tSHY\t®\t¯\n°\t±\t²\t³\t´\tµ\t¶\t·\t¸\t¹\tº\t»\t¼\t½\t¾\t¿\nÀ\tÁ\tÂ\tÃ\tÄ\tÅ\tÆ\tÇ\tÈ\tÉ\tÊ\tË\tÌ\tÍ\tÎ\tÏ\nÐ\tÑ\tÒ\tÓ\tÔ\tÕ\tÖ\t×\tØ\tÙ\tÚ\tÛ\tÜ\tÝ\tÞ\tß\nà\tá\tâ\tã\tä\tå\tæ\tç\tè\té\tê\të\tì\tí\tî\tï\nð\tñ\tò\tó\tô\tõ\tö\t÷\tø\tù\tú\tû\tü\tý\tþ\tÿ", "metadata": { "languages": [ "por", diff --git a/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper-with-table.jpg.json b/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper-with-table.jpg.json index c71cf50967..b0354dcb4a 100644 --- a/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper-with-table.jpg.json +++ b/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper-with-table.jpg.json @@ -46,7 +46,7 @@ { "type": "Table", "element_id": "dddac446da6c93dc1449ecb5d997c423", - "text": "Dataset | Base Model\" Large Model | Notes PubLayNet [38] P/M M Layouts of modern scientific documents PRImA [3) M - Layouts of scanned modern magazines and scientific reports 
Newspaper [17] P - Layouts of scanned US newspapers from the 20th century \u2018TableBank (18) P P Table region on modern scientific and business document HJDataset (31) | F/M - Layouts of history Japanese documents", + "text": "Dataset | Base Model\" Large Model | Notes PubLayNet [38] P/M M Layouts of modern scientific documents PRImA [3) M - Layouts of scanned modern magazines and scientific reports Newspaper [17] P - Layouts of scanned US newspapers from the 20th century ‘TableBank (18) P P Table region on modern scientific and business document HJDataset (31) | F/M - Layouts of history Japanese documents", "metadata": { "text_as_html": "
    Dataset| Base Model!|Large Model| Notes
    PubLayNet [33]P/MMLayouts of modern scientific documents
    PRImA [3]MLayouts of scanned modern magazines and scientific reports
    Newspaper [17]PLayouts of scanned US newspapers from the 20th century
    TableBank [18]PTable region on modern scientific and business document
    HIDataset [31]P/MLayouts of history Japanese documents
    ", "filetype": "image/jpeg", @@ -69,7 +69,7 @@ { "type": "FigureCaption", "element_id": "a0c3c6b7e1e8c95016b989ef43c5ea2e", - "text": "2 For each dataset, we train several models of different sizes for different needs (the trade-off between accuracy vs. computational cost). For \u201cbase model\u201d and \u201clarge model\u201d, we refer to using the ResNet 50 or ResNet 101 backbones [13], respectively. One can train models of different architectures, like Faster R-CNN [28] (P) and Mask R-CNN [12] (M). For example, an F in the Large Model column indicates it has m Faster R-CNN model trained using the ResNet 101 backbone. The platform is maintained and a number of additions will be made to the model zoo in coming months.", + "text": "2 For each dataset, we train several models of different sizes for different needs (the trade-off between accuracy vs. computational cost). For “base model” and “large model”, we refer to using the ResNet 50 or ResNet 101 backbones [13], respectively. One can train models of different architectures, like Faster R-CNN [28] (P) and Mask R-CNN [12] (M). For example, an F in the Large Model column indicates it has m Faster R-CNN model trained using the ResNet 101 backbone. 
The platform is maintained and a number of additions will be made to the model zoo in coming months.", "metadata": { "filetype": "image/jpeg", "languages": [ diff --git a/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json b/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json index 3f42ca335d..7c0e7324d2 100644 --- a/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json @@ -244,7 +244,7 @@ { "type": "Title", "element_id": "d3be9e3d661e2a79f37257caa5b54d8c", - "text": "LayoutParser: A Uni\ufb01ed Toolkit for Deep Learning Based Document Image Analysis", + "text": "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis", "metadata": { "filetype": "application/pdf", "languages": [ @@ -266,7 +266,7 @@ { "type": "NarrativeText", "element_id": "7cf062c1ba64938cc68c4fae61506d84", - "text": "Zejiang Shen! (4), Ruochen Zhang\u201d, Melissa Dell?, Benjamin Charles Germain Lee*, Jacob Carlson\u2019, and Weining Li>", + "text": "Zejiang Shen! (4), Ruochen Zhang”, Melissa Dell?, Benjamin Charles Germain Lee*, Jacob Carlson’, and Weining Li>", "metadata": { "filetype": "application/pdf", "languages": [ @@ -310,7 +310,7 @@ { "type": "NarrativeText", "element_id": "f1169388c7749db52e388e2fe4feaec6", - "text": "Abstract. Recent advances in document image analysis (DIA) have been primarily driven by the application of neural networks. Ideally, research outcomes could be easily deployed in production and extended for further investigation. 
However, various factors like loosely organized codebases and sophisticated model con\ufb01gurations complicate the easy reuse of im- portant innovations by a wide audience. Though there have been on-going e\ufb00orts to improve reusability and simplify deep learning (DL) model development in disciplines like natural language processing and computer vision, none of them are optimized for challenges in the domain of DIA. This represents a major gap in the existing toolkit, as DIA is central to academic research across a wide range of disciplines in the social sciences and humanities. This paper introduces LayoutParser, an open-source library for streamlining the usage of DL in DIA research and applica- tions. The core LayoutParser library comes with a set of simple and intuitive interfaces for applying and customizing DL models for layout de- tection, character recognition, and many other document processing tasks. To promote extensibility, LayoutParser also incorporates a community platform for sharing both pre-trained models and full document digiti- zation pipelines. We demonstrate that LayoutParser is helpful for both lightweight and large-scale digitization pipelines in real-word use cases. The library is publicly available at https://layout-parser.github.io.", + "text": "Abstract. Recent advances in document image analysis (DIA) have been primarily driven by the application of neural networks. Ideally, research outcomes could be easily deployed in production and extended for further investigation. However, various factors like loosely organized codebases and sophisticated model configurations complicate the easy reuse of im- portant innovations by a wide audience. Though there have been on-going efforts to improve reusability and simplify deep learning (DL) model development in disciplines like natural language processing and computer vision, none of them are optimized for challenges in the domain of DIA. 
This represents a major gap in the existing toolkit, as DIA is central to academic research across a wide range of disciplines in the social sciences and humanities. This paper introduces LayoutParser, an open-source library for streamlining the usage of DL in DIA research and applica- tions. The core LayoutParser library comes with a set of simple and intuitive interfaces for applying and customizing DL models for layout de- tection, character recognition, and many other document processing tasks. To promote extensibility, LayoutParser also incorporates a community platform for sharing both pre-trained models and full document digiti- zation pipelines. We demonstrate that LayoutParser is helpful for both lightweight and large-scale digitization pipelines in real-word use cases. The library is publicly available at https://layout-parser.github.io.", "metadata": { "links": [ { @@ -339,7 +339,7 @@ { "type": "NarrativeText", "element_id": "caffc7480fdd82a089ae387e01aabdb9", - "text": "Keywords: Document Image Analysis \u00b7 Deep Learning \u00b7 Layout Analysis \u00b7 Character Recognition \u00b7 Open Source library \u00b7 Toolkit.", + "text": "Keywords: Document Image Analysis · Deep Learning · Layout Analysis · Character Recognition · Open Source library · Toolkit.", "metadata": { "filetype": "application/pdf", "languages": [ @@ -383,7 +383,7 @@ { "type": "NarrativeText", "element_id": "8de96d1e80af35f9b6954252e14c2caf", - "text": "Deep Learning(DL)-based approaches are the state-of-the-art for a wide range of document image analysis (DIA) tasks including document image classi\ufb01cation [11,", + "text": "Deep Learning(DL)-based approaches are the state-of-the-art for a wide range of document image analysis (DIA) tasks including document image classification [11,", "metadata": { "links": [ { @@ -434,7 +434,7 @@ { "type": "NarrativeText", "element_id": "4b097cc42d7d30e720512dbce0cb4905", - "text": "37], layout detection [38, 22], table detection [26], and scene text 
detection [4]. A generalized learning-based framework dramatically reduces the need for the manual speci\ufb01cation of complicated rules, which is the status quo with traditional methods. DL has the potential to transform DIA pipelines and bene\ufb01t a broad spectrum of large-scale document digitization projects.", + "text": "37], layout detection [38, 22], table detection [26], and scene text detection [4]. A generalized learning-based framework dramatically reduces the need for the manual specification of complicated rules, which is the status quo with traditional methods. DL has the potential to transform DIA pipelines and benefit a broad spectrum of large-scale document digitization projects.", "metadata": { "links": [ { @@ -483,7 +483,7 @@ { "type": "NarrativeText", "element_id": "45844a4901777afaf6de9a0994e017eb", - "text": "However, there are several practical di\ufb03culties for taking advantages of re- cent advances in DL-based methods: 1) DL models are notoriously convoluted for reuse and extension. Existing models are developed using distinct frame- works like TensorFlow [1] or PyTorch [24], and the high-level parameters can be obfuscated by implementation details [8]. It can be a time-consuming and frustrating experience to debug, reproduce, and adapt existing models for DIA, and many researchers who would bene\ufb01t the most from using these methods lack the technical background to implement them from scratch. 2) Document images contain diverse and disparate patterns across domains, and customized training is often required to achieve a desirable detection accuracy. Currently there is no full-\ufb02edged infrastructure for easily curating the target document image datasets and \ufb01ne-tuning or re-training the models. 3) DIA usually requires a sequence of models and other processing to obtain the \ufb01nal outputs. 
Often research teams use DL models and then perform further document analyses in separate processes, and these pipelines are not documented in any central location (and often not documented at all). This makes it di\ufb03cult for research teams to learn about how full pipelines are implemented and leads them to invest signi\ufb01cant resources in reinventing the DIA wheel.", + "text": "However, there are several practical difficulties for taking advantages of re- cent advances in DL-based methods: 1) DL models are notoriously convoluted for reuse and extension. Existing models are developed using distinct frame- works like TensorFlow [1] or PyTorch [24], and the high-level parameters can be obfuscated by implementation details [8]. It can be a time-consuming and frustrating experience to debug, reproduce, and adapt existing models for DIA, and many researchers who would benefit the most from using these methods lack the technical background to implement them from scratch. 2) Document images contain diverse and disparate patterns across domains, and customized training is often required to achieve a desirable detection accuracy. Currently there is no full-fledged infrastructure for easily curating the target document image datasets and fine-tuning or re-training the models. 3) DIA usually requires a sequence of models and other processing to obtain the final outputs. Often research teams use DL models and then perform further document analyses in separate processes, and these pipelines are not documented in any central location (and often not documented at all). This makes it difficult for research teams to learn about how full pipelines are implemented and leads them to invest significant resources in reinventing the DIA wheel.", "metadata": { "links": [ { @@ -522,7 +522,7 @@ { "type": "NarrativeText", "element_id": "6f3c8d55dd5a4f95d8a59d146ca9ffa7", - "text": "LayoutParser provides a uni\ufb01ed toolkit to support DL-based document image analysis and processing. 
To address the aforementioned challenges, LayoutParser is built with the following components:", + "text": "LayoutParser provides a unified toolkit to support DL-based document image analysis and processing. To address the aforementioned challenges, LayoutParser is built with the following components:", "metadata": { "filetype": "application/pdf", "languages": [ @@ -544,7 +544,7 @@ { "type": "ListItem", "element_id": "9ce12a49c1a9972b4cd2c3f66595b2b6", - "text": "1. An o\ufb00-the-shelf toolkit for applying DL models for layout detection, character recognition, and other DIA tasks (Section 3)", + "text": "1. An off-the-shelf toolkit for applying DL models for layout detection, character recognition, and other DIA tasks (Section 3)", "metadata": { "filetype": "application/pdf", "languages": [ @@ -566,7 +566,7 @@ { "type": "ListItem", "element_id": "40f42a96bdd1559e09d74090c0fe9df3", - "text": "2. A rich repository of pre-trained neural network models (Model Zoo) that underlies the o\ufb00-the-shelf usage", + "text": "2. A rich repository of pre-trained neural network models (Model Zoo) that underlies the off-the-shelf usage", "metadata": { "filetype": "application/pdf", "languages": [ @@ -588,7 +588,7 @@ { "type": "ListItem", "element_id": "0ca448d3ae0c4ee73bf46e8edfcd417d", - "text": "3. Comprehensive tools for e\ufb03cient document image data annotation and model tuning to support di\ufb00erent levels of customization", + "text": "3. Comprehensive tools for efficient document image data annotation and model tuning to support different levels of customization", "metadata": { "filetype": "application/pdf", "languages": [ @@ -632,7 +632,7 @@ { "type": "NarrativeText", "element_id": "8e216e91ff3471241858f1df445cdf0a", - "text": "The library implements simple and intuitive Python APIs without sacri\ufb01cing generalizability and versatility, and can be easily installed via pip. 
Its convenient functions for handling document image data can be seamlessly integrated with existing DIA pipelines. With detailed documentations and carefully curated tutorials, we hope this tool will bene\ufb01t a variety of end-users, and will lead to advances in applications in both industry and academic research.", + "text": "The library implements simple and intuitive Python APIs without sacrificing generalizability and versatility, and can be easily installed via pip. Its convenient functions for handling document image data can be seamlessly integrated with existing DIA pipelines. With detailed documentations and carefully curated tutorials, we hope this tool will benefit a variety of end-users, and will lead to advances in applications in both industry and academic research.", "metadata": { "filetype": "application/pdf", "languages": [ @@ -654,7 +654,7 @@ { "type": "NarrativeText", "element_id": "583775f22c8080098beebbef960e2fbf", - "text": "LayoutParser is well aligned with recent e\ufb00orts for improving DL model reusability in other disciplines like natural language processing [8, 34] and com- puter vision [35], but with a focus on unique challenges in DIA. We show LayoutParser can be applied in sophisticated and large-scale digitization projects", + "text": "LayoutParser is well aligned with recent efforts for improving DL model reusability in other disciplines like natural language processing [8, 34] and com- puter vision [35], but with a focus on unique challenges in DIA. 
We show LayoutParser can be applied in sophisticated and large-scale digitization projects", "metadata": { "links": [ { @@ -693,7 +693,7 @@ { "type": "Header", "element_id": "f5a6697190c20bf6030d8e4ae8f6861a", - "text": "LayoutParser: A Uni\ufb01ed Toolkit for DL-Based DIA", + "text": "LayoutParser: A Unified Toolkit for DL-Based DIA", "metadata": { "filetype": "application/pdf", "languages": [ @@ -715,7 +715,7 @@ { "type": "NarrativeText", "element_id": "50846086f4d9ece02052735686278699", - "text": "that require precision, e\ufb03ciency, and robustness, as well as simple and light- weight document processing tasks focusing on e\ufb03cacy and \ufb02exibility (Section 5). LayoutParser is being actively maintained, and support for more deep learning models and novel methods in text-based layout analysis methods [37, 34] is planned.", + "text": "that require precision, efficiency, and robustness, as well as simple and light- weight document processing tasks focusing on efficacy and flexibility (Section 5). LayoutParser is being actively maintained, and support for more deep learning models and novel methods in text-based layout analysis methods [37, 34] is planned.", "metadata": { "links": [ { @@ -825,7 +825,7 @@ { "type": "NarrativeText", "element_id": "8153390c1bb8652313be64034531449e", - "text": "Recently, various DL models and datasets have been developed for layout analysis tasks. The dhSegment [22] utilizes fully convolutional networks [20] for segmen- tation tasks on historical documents. Object detection-based methods like Faster R-CNN [28] and Mask R-CNN [12] are used for identifying document elements [38] and detecting tables [30, 26]. Most recently, Graph Neural Networks [29] have also been used in table detection [27]. However, these models are usually implemented individually and there is no uni\ufb01ed framework to load and use such models.", + "text": "Recently, various DL models and datasets have been developed for layout analysis tasks. 
The dhSegment [22] utilizes fully convolutional networks [20] for segmen- tation tasks on historical documents. Object detection-based methods like Faster R-CNN [28] and Mask R-CNN [12] are used for identifying document elements [38] and detecting tables [30, 26]. Most recently, Graph Neural Networks [29] have also been used in table detection [27]. However, these models are usually implemented individually and there is no unified framework to load and use such models.", "metadata": { "links": [ { @@ -973,7 +973,7 @@ { "type": "NarrativeText", "element_id": "73feaff827cbc7089d3f95d1e5aac6aa", - "text": "Recent years have also seen numerous e\ufb00orts to create libraries for promoting reproducibility and reusability in the \ufb01eld of DL. Libraries like Dectectron2 [35],", + "text": "Recent years have also seen numerous efforts to create libraries for promoting reproducibility and reusability in the field of DL. Libraries like Dectectron2 [35],", "metadata": { "links": [ { @@ -1002,7 +1002,7 @@ { "type": "Footer", "element_id": "b1fa4bbd1bdda08489faab5bf3adf5cc", - "text": "6 The number shown is obtained by specifying the search type as \u2018code\u2019.", + "text": "6 The number shown is obtained by specifying the search type as ‘code’.", "metadata": { "filetype": "application/pdf", "languages": [ @@ -1200,7 +1200,7 @@ { "type": "Image", "element_id": "642416e5d6c99219b16dbba6f72392c5", - "text": "Efficient Data Annotation Model Customization Document Images Community Platform \u2018a >) \u00a5 DIA Model Hub i .) Customized Model Training] == | Layout Detection Models | \u2014\u2014= DIA Pipeline Sharing ~ OCR Module = { Layout Data stuctue ) = (storage Visualization VY", + "text": "Efficient Data Annotation Model Customization Document Images Community Platform ‘a >) ¥ DIA Model Hub i .) 
Customized Model Training] == | Layout Detection Models | ——= DIA Pipeline Sharing ~ OCR Module = { Layout Data stuctue ) = (storage Visualization VY", "metadata": { "filetype": "application/pdf", "languages": [ @@ -1222,7 +1222,7 @@ { "type": "NarrativeText", "element_id": "466f0bc21599ccf0fa27c021cb023f90", - "text": "Fig.1: The overall architecture of LayoutParser. For an input document image, the core LayoutParser library provides a set of o\ufb00-the-shelf tools for layout detection, OCR, visualization, and storage, backed by a carefully designed layout data structure. LayoutParser also supports high level customization via e\ufb03cient layout annotation and model training functions. These improve model accuracy on the target samples. The community platform enables the easy sharing of DIA models and whole digitization pipelines to promote reusability and reproducibility. A collection of detailed documentation, tutorials and exemplar projects make LayoutParser easy to learn and use.", + "text": "Fig.1: The overall architecture of LayoutParser. For an input document image, the core LayoutParser library provides a set of off-the-shelf tools for layout detection, OCR, visualization, and storage, backed by a carefully designed layout data structure. LayoutParser also supports high level customization via efficient layout annotation and model training functions. These improve model accuracy on the target samples. The community platform enables the easy sharing of DIA models and whole digitization pipelines to promote reusability and reproducibility. 
A collection of detailed documentation, tutorials and exemplar projects make LayoutParser easy to learn and use.", "metadata": { "filetype": "application/pdf", "languages": [ @@ -1244,7 +1244,7 @@ { "type": "NarrativeText", "element_id": "b4948db85ca791e99aa92589fc41734f", - "text": "AllenNLP [8] and transformers [34] have provided the community with complete DL-based support for developing and deploying models for general computer vision and natural language processing problems. LayoutParser, on the other hand, specializes speci\ufb01cally in DIA tasks. LayoutParser is also equipped with a community platform inspired by established model hubs such as Torch Hub [23] and TensorFlow Hub [1]. It enables the sharing of pretrained models as well as full document processing pipelines that are unique to DIA tasks.", + "text": "AllenNLP [8] and transformers [34] have provided the community with complete DL-based support for developing and deploying models for general computer vision and natural language processing problems. LayoutParser, on the other hand, specializes specifically in DIA tasks. LayoutParser is also equipped with a community platform inspired by established model hubs such as Torch Hub [23] and TensorFlow Hub [1]. It enables the sharing of pretrained models as well as full document processing pipelines that are unique to DIA tasks.", "metadata": { "links": [ { @@ -1288,7 +1288,7 @@ { "type": "NarrativeText", "element_id": "7651db80014a85ab253367d3bd3e4f88", - "text": "There have been a variety of document data collections to facilitate the development of DL models. Some examples include PRImA [3](magazine layouts), PubLayNet [38](academic paper layouts), Table Bank [18](tables in academic papers), Newspaper Navigator Dataset [16, 17](newspaper \ufb01gure layouts) and HJDataset [31](historical Japanese document layouts). 
A spectrum of models trained on these datasets are currently available in the LayoutParser model zoo to support di\ufb00erent use cases.", + "text": "There have been a variety of document data collections to facilitate the development of DL models. Some examples include PRImA [3](magazine layouts), PubLayNet [38](academic paper layouts), Table Bank [18](tables in academic papers), Newspaper Navigator Dataset [16, 17](newspaper figure layouts) and HJDataset [31](historical Japanese document layouts). A spectrum of models trained on these datasets are currently available in the LayoutParser model zoo to support different use cases.", "metadata": { "links": [ { @@ -1364,7 +1364,7 @@ { "type": "NarrativeText", "element_id": "47e45d28d96fc14ddc709835de35ece5", - "text": "At the core of LayoutParser is an o\ufb00-the-shelf toolkit that streamlines DL- based document image analysis. Five components support a simple interface with comprehensive functionalities: 1) The layout detection models enable using pre-trained or self-trained DL models for layout detection with just four lines of code. 2) The detected layout information is stored in carefully engineered", + "text": "At the core of LayoutParser is an off-the-shelf toolkit that streamlines DL- based document image analysis. Five components support a simple interface with comprehensive functionalities: 1) The layout detection models enable using pre-trained or self-trained DL models for layout detection with just four lines of code. 
2) The detected layout information is stored in carefully engineered", "metadata": { "filetype": "application/pdf", "languages": [ @@ -1386,7 +1386,7 @@ { "type": "ListItem", "element_id": "cd1112d2b15a0d27a29b1c83b2afd0dd", - "text": "LayoutParser: A Uni\ufb01ed Toolkit for DL-Based DIA", + "text": "LayoutParser: A Unified Toolkit for DL-Based DIA", "metadata": { "filetype": "application/pdf", "languages": [ @@ -1430,7 +1430,7 @@ { "type": "Table", "element_id": "cb534ba64da736dc53d60b660f5e1153", - "text": "Dataset Base Model1 Large Model Notes PubLayNet [38] F / M M Layouts of modern scienti\ufb01c documents PRImA [3] M - Layouts of scanned modern magazines and scienti\ufb01c reports Newspaper [17] F - Layouts of scanned US newspapers from the 20th century TableBank [18] F F Table region on modern scienti\ufb01c and business document HJDataset [31] F / M - Layouts of history Japanese documents", + "text": "Dataset Base Model1 Large Model Notes PubLayNet [38] F / M M Layouts of modern scientific documents PRImA [3] M - Layouts of scanned modern magazines and scientific reports Newspaper [17] F - Layouts of scanned US newspapers from the 20th century TableBank [18] F F Table region on modern scientific and business document HJDataset [31] F / M - Layouts of history Japanese documents", "metadata": { "links": [ { @@ -1480,7 +1480,7 @@ { "type": "FigureCaption", "element_id": "f978160527177fa39c13774ec8dfa9cb", - "text": "1 For each dataset, we train several models of di\ufb00erent sizes for di\ufb00erent needs (the trade-o\ufb00 between accuracy vs. computational cost). For \u201cbase model\u201d and \u201clarge model\u201d, we refer to using the ResNet 50 or ResNet 101 backbones [13], respectively. One can train models of di\ufb00erent architectures, like Faster R-CNN [28] (F) and Mask R-CNN [12] (M). For example, an F in the Large Model column indicates it has a Faster R-CNN model trained using the ResNet 101 backbone. 
The platform is maintained and a number of additions will be made to the model zoo in coming months.", + "text": "1 For each dataset, we train several models of different sizes for different needs (the trade-off between accuracy vs. computational cost). For “base model” and “large model”, we refer to using the ResNet 50 or ResNet 101 backbones [13], respectively. One can train models of different architectures, like Faster R-CNN [28] (F) and Mask R-CNN [12] (M). For example, an F in the Large Model column indicates it has a Faster R-CNN model trained using the ResNet 101 backbone. The platform is maintained and a number of additions will be made to the model zoo in coming months.", "metadata": { "links": [ { @@ -1519,7 +1519,7 @@ { "type": "NarrativeText", "element_id": "55b33df7609960c3552a0b7bc1a5a9c6", - "text": "layout data structures, which are optimized for e\ufb03ciency and versatility. 3) When necessary, users can employ existing or customized OCR models via the uni\ufb01ed API provided in the OCR module. 4) LayoutParser comes with a set of utility functions for the visualization and storage of the layout data. 5) LayoutParser is also highly customizable, via its integration with functions for layout data annotation and model training. We now provide detailed descriptions for each component.", + "text": "layout data structures, which are optimized for efficiency and versatility. 3) When necessary, users can employ existing or customized OCR models via the unified API provided in the OCR module. 4) LayoutParser comes with a set of utility functions for the visualization and storage of the layout data. 5) LayoutParser is also highly customizable, via its integration with functions for layout data annotation and model training. 
We now provide detailed descriptions for each component.", "metadata": { "filetype": "application/pdf", "languages": [ @@ -1563,7 +1563,7 @@ { "type": "NarrativeText", "element_id": "bbcc10c2b92de0cbdce8629f18b0d7ad", - "text": "In LayoutParser, a layout model takes a document image as an input and generates a list of rectangular boxes for the target content regions. Di\ufb00erent from traditional methods, it relies on deep convolutional neural networks rather than manually curated rules to identify content regions. It is formulated as an object detection problem and state-of-the-art models like Faster R-CNN [28] and Mask R-CNN [12] are used. This yields prediction results of high accuracy and makes it possible to build a concise, generalized interface for layout detection. LayoutParser, built upon Detectron2 [35], provides a minimal API that can perform layout detection with only four lines of code in Python:", + "text": "In LayoutParser, a layout model takes a document image as an input and generates a list of rectangular boxes for the target content regions. Different from traditional methods, it relies on deep convolutional neural networks rather than manually curated rules to identify content regions. It is formulated as an object detection problem and state-of-the-art models like Faster R-CNN [28] and Mask R-CNN [12] are used. This yields prediction results of high accuracy and makes it possible to build a concise, generalized interface for layout detection. LayoutParser, built upon Detectron2 [35], provides a minimal API that can perform layout detection with only four lines of code in Python:", "metadata": { "links": [ { @@ -1690,7 +1690,7 @@ { "type": "NarrativeText", "element_id": "f888c5e8f5b1339f2af75612ea13c719", - "text": "LayoutParser provides a wealth of pre-trained model weights using various datasets covering di\ufb00erent languages, time periods, and document types. 
Due to domain shift [7], the prediction performance can notably drop when models are ap- plied to target samples that are signi\ufb01cantly di\ufb00erent from the training dataset. As document structures and layouts vary greatly in di\ufb00erent domains, it is important to select models trained on a dataset similar to the test samples. A semantic syntax is used for initializing the model weights in LayoutParser, using both the dataset name and model name lp:///.", + "text": "LayoutParser provides a wealth of pre-trained model weights using various datasets covering different languages, time periods, and document types. Due to domain shift [7], the prediction performance can notably drop when models are ap- plied to target samples that are significantly different from the training dataset. As document structures and layouts vary greatly in different domains, it is important to select models trained on a dataset similar to the test samples. A semantic syntax is used for initializing the model weights in LayoutParser, using both the dataset name and model name lp:///.", "metadata": { "links": [ { @@ -1763,7 +1763,7 @@ { "type": "Image", "element_id": "6eb2bb6ca50b3be177565f9ff546bce8", - "text": "- \u00b0 . 3 a a 4 a 3 oo er \u2018 2 \u00a7 8 a 8 3 3 \u2018 \u00a3 4 A g a 9 \u2018 3 \u00a5 Coordinate g 4 5 3 + \u00a7 3 H Extra Features [O=\") [Bo] eaing i Text | | Type | | ower \u00b0 & a \u00a2 o [ coordinatel textblock1, 3 3 \u2019 g Q 3 , textblock2 , layoutl ] 4 q \u00ae A list of the layout elements Ff", + "text": "- ° . 
3 a a 4 a 3 oo er ‘ 2 § 8 a 8 3 3 ‘ £ 4 A g a 9 ‘ 3 ¥ Coordinate g 4 5 3 + § 3 H Extra Features [O=\") [Bo] eaing i Text | | Type | | ower ° & a ¢ o [ coordinatel textblock1, 3 3 ’ g Q 3 , textblock2 , layoutl ] 4 q ® A list of the layout elements Ff", "metadata": { "filetype": "application/pdf", "languages": [ @@ -1785,7 +1785,7 @@ { "type": "FigureCaption", "element_id": "9f11aa6b22dea1bba7eb0d122c0c5562", - "text": "Fig.2: The relationship between the three types of layout data structures. Coordinate supports three kinds of variation; TextBlock consists of the co- ordinate information and extra features like block text, types, and reading orders; a Layout object is a list of all possible layout elements, including other Layout objects. They all support the same set of transformation and operation APIs for maximum \ufb02exibility.", + "text": "Fig.2: The relationship between the three types of layout data structures. Coordinate supports three kinds of variation; TextBlock consists of the co- ordinate information and extra features like block text, types, and reading orders; a Layout object is a list of all possible layout elements, including other Layout objects. They all support the same set of transformation and operation APIs for maximum flexibility.", "metadata": { "filetype": "application/pdf", "languages": [ @@ -1807,7 +1807,7 @@ { "type": "NarrativeText", "element_id": "d997f63fd79c7e03050ca01b58dfdf0a", - "text": "Shown in Table 1, LayoutParser currently hosts 9 pre-trained models trained on 5 di\ufb00erent datasets. Description of the training dataset is provided alongside with the trained models such that users can quickly identify the most suitable models for their tasks. 
Additionally, when such a model is not readily available, LayoutParser also supports training customized layout models and community sharing of the models (detailed in Section 3.5).", + "text": "Shown in Table 1, LayoutParser currently hosts 9 pre-trained models trained on 5 different datasets. Description of the training dataset is provided alongside with the trained models such that users can quickly identify the most suitable models for their tasks. Additionally, when such a model is not readily available, LayoutParser also supports training customized layout models and community sharing of the models (detailed in Section 3.5).", "metadata": { "links": [ { @@ -1858,7 +1858,7 @@ { "type": "NarrativeText", "element_id": "601f7d95172984c75de081023ca64c15", - "text": "A critical feature of LayoutParser is the implementation of a series of data structures and operations that can be used to e\ufb03ciently process and manipulate the layout elements. In document image analysis pipelines, various post-processing on the layout analysis model outputs is usually required to obtain the \ufb01nal outputs. Traditionally, this requires exporting DL model outputs and then loading the results into other pipelines. All model outputs from LayoutParser will be stored in carefully engineered data types optimized for further processing, which makes it possible to build an end-to-end document digitization pipeline within LayoutParser. There are three key components in the data structure, namely the Coordinate system, the TextBlock, and the Layout. They provide di\ufb00erent levels of abstraction for the layout data, and a set of APIs are supported for transformations or operations on these classes.", + "text": "A critical feature of LayoutParser is the implementation of a series of data structures and operations that can be used to efficiently process and manipulate the layout elements. 
In document image analysis pipelines, various post-processing on the layout analysis model outputs is usually required to obtain the final outputs. Traditionally, this requires exporting DL model outputs and then loading the results into other pipelines. All model outputs from LayoutParser will be stored in carefully engineered data types optimized for further processing, which makes it possible to build an end-to-end document digitization pipeline within LayoutParser. There are three key components in the data structure, namely the Coordinate system, the TextBlock, and the Layout. They provide different levels of abstraction for the layout data, and a set of APIs are supported for transformations or operations on these classes.", "metadata": { "filetype": "application/pdf", "languages": [ @@ -1880,7 +1880,7 @@ { "type": "ListItem", "element_id": "48d58ed9a3d95637df68c8b810147ba1", - "text": "LayoutParser: A Uni\ufb01ed Toolkit for DL-Based DIA", + "text": "LayoutParser: A Unified Toolkit for DL-Based DIA", "metadata": { "filetype": "application/pdf", "languages": [ @@ -1902,7 +1902,7 @@ { "type": "NarrativeText", "element_id": "dcdc0dc4759bd20c04026973cbe386e2", - "text": "Coordinates are the cornerstones for storing layout information. Currently, three types of Coordinate data structures are provided in LayoutParser, shown in Figure 2. Interval and Rectangle are the most common data types and support specifying 1D or 2D regions within a document. They are parameterized with 2 and 4 parameters. A Quadrilateral class is also implemented to support a more generalized representation of rectangular regions when the document is skewed or distorted, where the 4 corner points can be speci\ufb01ed and a total of 8 degrees of freedom are supported. A wide collection of transformations like shift, pad, and scale, and operations like intersect, union, and is_in, are supported for these classes. 
Notably, it is common to separate a segment of the image and analyze it individually. LayoutParser provides full support for this scenario via image cropping operations crop_image and coordinate transformations like relative_to and condition_on that transform coordinates to and from their relative representations. We refer readers to Table 2 for a more detailed description of these operations13.", + "text": "Coordinates are the cornerstones for storing layout information. Currently, three types of Coordinate data structures are provided in LayoutParser, shown in Figure 2. Interval and Rectangle are the most common data types and support specifying 1D or 2D regions within a document. They are parameterized with 2 and 4 parameters. A Quadrilateral class is also implemented to support a more generalized representation of rectangular regions when the document is skewed or distorted, where the 4 corner points can be specified and a total of 8 degrees of freedom are supported. A wide collection of transformations like shift, pad, and scale, and operations like intersect, union, and is_in, are supported for these classes. Notably, it is common to separate a segment of the image and analyze it individually. LayoutParser provides full support for this scenario via image cropping operations crop_image and coordinate transformations like relative_to and condition_on that transform coordinates to and from their relative representations. We refer readers to Table 2 for a more detailed description of these operations13.", "metadata": { "links": [ { @@ -1941,7 +1941,7 @@ { "type": "NarrativeText", "element_id": "3f620e1ad95cd446170613ed9d780853", - "text": "Based on Coordinates, we implement the TextBlock class that stores both the positional and extra features of individual layout elements. It also supports specifying the reading orders via setting the parent \ufb01eld to the index of the parent object. 
A Layout class is built that takes in a list of TextBlocks and supports processing the elements in batch. Layout can also be nested to support hierarchical layout structures. They support the same operations and transformations as the Coordinate classes, minimizing both learning and deployment e\ufb00ort.", + "text": "Based on Coordinates, we implement the TextBlock class that stores both the positional and extra features of individual layout elements. It also supports specifying the reading orders via setting the parent field to the index of the parent object. A Layout class is built that takes in a list of TextBlocks and supports processing the elements in batch. Layout can also be nested to support hierarchical layout structures. They support the same operations and transformations as the Coordinate classes, minimizing both learning and deployment effort.", "metadata": { "filetype": "application/pdf", "languages": [ @@ -1985,7 +1985,7 @@ { "type": "NarrativeText", "element_id": "16565416942e53cf65f75a8a845df211", - "text": "LayoutParser provides a uni\ufb01ed interface for existing OCR tools. Though there are many OCR tools available, they are usually con\ufb01gured di\ufb00erently with distinct APIs or protocols for using them. It can be ine\ufb03cient to add new OCR tools into an existing pipeline, and di\ufb03cult to make direct comparisons among the available tools to \ufb01nd the best option for a particular project. To this end, LayoutParser builds a series of wrappers among existing OCR engines, and provides nearly the same syntax for using them. It supports a plug-and-play style of using OCR engines, making it e\ufb00ortless to switch, evaluate, and compare di\ufb00erent OCR modules:", + "text": "LayoutParser provides a unified interface for existing OCR tools. Though there are many OCR tools available, they are usually configured differently with distinct APIs or protocols for using them. 
It can be inefficient to add new OCR tools into an existing pipeline, and difficult to make direct comparisons among the available tools to find the best option for a particular project. To this end, LayoutParser builds a series of wrappers among existing OCR engines, and provides nearly the same syntax for using them. It supports a plug-and-play style of using OCR engines, making it effortless to switch, evaluate, and compare different OCR modules:", "metadata": { "filetype": "application/pdf", "languages": [ @@ -2051,7 +2051,7 @@ { "type": "NarrativeText", "element_id": "fa023ccf2ac1042ef254ecf47cc592ca", - "text": "LayoutParser also comes with a DL-based CNN-RNN OCR model [6] trained with the Connectionist Temporal Classi\ufb01cation (CTC) loss [10]. It can be used like the other OCR modules, and can be easily trained on customized datasets.", + "text": "LayoutParser also comes with a DL-based CNN-RNN OCR model [6] trained with the Connectionist Temporal Classification (CTC) loss [10]. It can be used like the other OCR modules, and can be easily trained on customized datasets.", "metadata": { "links": [ { @@ -2129,7 +2129,7 @@ { "type": "NarrativeText", "element_id": "a5ce184b53898a543bca90a5b0acd156", - "text": "Table 2: All operations supported by the layout elements. The same APIs are supported across di\ufb00erent layout element classes including Coordinate types, TextBlock and Layout.", + "text": "Table 2: All operations supported by the layout elements. 
The same APIs are supported across different layout element classes including Coordinate types, TextBlock and Layout.", "metadata": { "filetype": "application/pdf", "languages": [ @@ -2151,9 +2151,9 @@ { "type": "Table", "element_id": "64bc79d1132a89c71837f420d6e4e2dc", - "text": "Operation Name Description block.pad(top, bottom, right, left) Enlarge the current block according to the input block.scale(fx, fy) Scale the current block given the ratio in x and y direction block.shift(dx, dy) Move the current block with the shift distances in x and y direction block1.is in(block2) Whether block1 is inside of block2 block1.intersect(block2) Return the intersection region of block1 and block2. Coordinate type to be determined based on the inputs. block1.union(block2) Return the union region of block1 and block2. Coordinate type to be determined based on the inputs. block1.relative to(block2) Convert the absolute coordinates of block1 to relative coordinates to block2 block1.condition on(block2) Calculate the absolute coordinates of block1 given the canvas block2\u2019s absolute coordinates block.crop image(image) Obtain the image segments in the block region", + "text": "Operation Name Description block.pad(top, bottom, right, left) Enlarge the current block according to the input block.scale(fx, fy) Scale the current block given the ratio in x and y direction block.shift(dx, dy) Move the current block with the shift distances in x and y direction block1.is in(block2) Whether block1 is inside of block2 block1.intersect(block2) Return the intersection region of block1 and block2. Coordinate type to be determined based on the inputs. block1.union(block2) Return the union region of block1 and block2. Coordinate type to be determined based on the inputs. 
block1.relative to(block2) Convert the absolute coordinates of block1 to relative coordinates to block2 block1.condition on(block2) Calculate the absolute coordinates of block1 given the canvas block2’s absolute coordinates block.crop image(image) Obtain the image segments in the block region", "metadata": { - "text_as_html": "
    block.pad(top, bottom,right,left)Enlarge the current block according to the input
    block.scale(fx, fy)Scale the current block given the ratio in x and y direction
    block.shift(dx, dy)Move the current block with the shift distances in x and y direction
    block1.is_in(block2)Whether block] is inside of block2
    block1. intersect (block2)Return the intersection region of blockl and block2. Coordinate type to be determined based on the inputs
    block1.union(block2)Return the union region of blockl and block2. Coordinate type to be determined based on the inputs
    block1.relative_to(block2)Convert the absolute coordinates of block to relative coordinates to block2
    block1.condition_on(block2)Calculate the absolute coordinates of blockl given the canvas block2\u2019s absolute coordinates
    block. crop_image (image)Obtain the image segments in the block region
    ", + "text_as_html": "
    block.pad(top, bottom,right,left)Enlarge the current block according to the input
    block.scale(fx, fy)Scale the current block given the ratio in x and y direction
    block.shift(dx, dy)Move the current block with the shift distances in x and y direction
    block1.is_in(block2)Whether block] is inside of block2
    block1. intersect (block2)Return the intersection region of blockl and block2. Coordinate type to be determined based on the inputs
    block1.union(block2)Return the union region of blockl and block2. Coordinate type to be determined based on the inputs
    block1.relative_to(block2)Convert the absolute coordinates of block to relative coordinates to block2
    block1.condition_on(block2)Calculate the absolute coordinates of blockl given the canvas block2’s absolute coordinates
    block. crop_image (image)Obtain the image segments in the block region
    ", "filetype": "application/pdf", "languages": [ "eng" @@ -2196,7 +2196,7 @@ { "type": "NarrativeText", "element_id": "afa5f1dc8b4ce5598f278992d818eaa9", - "text": "The end goal of DIA is to transform the image-based document data into a structured database. LayoutParser supports exporting layout data into di\ufb00erent formats like JSON, csv, and will add the support for the METS/ALTO XML format 14 . It can also load datasets from layout analysis-speci\ufb01c formats like COCO [38] and the Page Format [25] for training layout models (Section 3.5).", + "text": "The end goal of DIA is to transform the image-based document data into a structured database. LayoutParser supports exporting layout data into different formats like JSON, csv, and will add the support for the METS/ALTO XML format 14 . It can also load datasets from layout analysis-specific formats like COCO [38] and the Page Format [25] for training layout models (Section 3.5).", "metadata": { "links": [ { @@ -2240,7 +2240,7 @@ { "type": "NarrativeText", "element_id": "28aeb996f497c9d01d06e564483d0854", - "text": "Visualization of the layout detection results is critical for both presentation and debugging. LayoutParser is built with an integrated API for displaying the layout information along with the original document image. Shown in Figure 3, it enables presenting layout data with rich meta information and features in di\ufb00erent modes. More detailed information can be found in the online LayoutParser documentation page.", + "text": "Visualization of the layout detection results is critical for both presentation and debugging. LayoutParser is built with an integrated API for displaying the layout information along with the original document image. Shown in Figure 3, it enables presenting layout data with rich meta information and features in different modes. 
More detailed information can be found in the online LayoutParser documentation page.", "metadata": { "links": [ { @@ -2291,7 +2291,7 @@ { "type": "NarrativeText", "element_id": "05e5f4e2a196db34263541d1ecebe297", - "text": "Besides the o\ufb00-the-shelf library, LayoutParser is also highly customizable with supports for highly unique and challenging document analysis tasks. Target document images can be vastly di\ufb00erent from the existing datasets for train- ing layout models, which leads to low layout detection accuracy. Training data", + "text": "Besides the off-the-shelf library, LayoutParser is also highly customizable with supports for highly unique and challenging document analysis tasks. Target document images can be vastly different from the existing datasets for train- ing layout models, which leads to low layout detection accuracy. Training data", "metadata": { "filetype": "application/pdf", "languages": [ @@ -2335,7 +2335,7 @@ { "type": "ListItem", "element_id": "c069937e6c2bfc0f856835f3af4d6181", - "text": "LayoutParser: A Uni\ufb01ed Toolkit for DL-Based DIA", + "text": "LayoutParser: A Unified Toolkit for DL-Based DIA", "metadata": { "filetype": "application/pdf", "languages": [ @@ -2379,7 +2379,7 @@ { "type": "NarrativeText", "element_id": "4d1b9566e792683b9559b778be4f4046", - "text": "Fig.3: Layout detection and OCR results visualization generated by the LayoutParser APIs. Mode I directly overlays the layout region bounding boxes and categories over the original image. Mode II recreates the original document via drawing the OCR\u2019d texts at their corresponding positions on the image canvas. In this \ufb01gure, tokens in textual regions are \ufb01ltered using the API and then displayed.", + "text": "Fig.3: Layout detection and OCR results visualization generated by the LayoutParser APIs. Mode I directly overlays the layout region bounding boxes and categories over the original image. 
Mode II recreates the original document via drawing the OCR’d texts at their corresponding positions on the image canvas. In this figure, tokens in textual regions are filtered using the API and then displayed.", "metadata": { "filetype": "application/pdf", "languages": [ @@ -2401,7 +2401,7 @@ { "type": "NarrativeText", "element_id": "625c9e1d41a9740f094041595f79953d", - "text": "can also be highly sensitive and not sharable publicly. To overcome these chal- lenges, LayoutParser is built with rich features for e\ufb03cient data annotation and customized model training.", + "text": "can also be highly sensitive and not sharable publicly. To overcome these chal- lenges, LayoutParser is built with rich features for efficient data annotation and customized model training.", "metadata": { "filetype": "application/pdf", "languages": [ @@ -2423,7 +2423,7 @@ { "type": "NarrativeText", "element_id": "a3498730b5cd3fe9405fad69bcf37882", - "text": "LayoutParser incorporates a toolkit optimized for annotating document lay- outs using object-level active learning [32]. With the help from a layout detection model trained along with labeling, only the most important layout objects within each image, rather than the whole image, are required for labeling. The rest of the regions are automatically annotated with high con\ufb01dence predictions from the layout detection model. This allows a layout dataset to be created more e\ufb03ciently with only around 60% of the labeling budget.", + "text": "LayoutParser incorporates a toolkit optimized for annotating document lay- outs using object-level active learning [32]. With the help from a layout detection model trained along with labeling, only the most important layout objects within each image, rather than the whole image, are required for labeling. The rest of the regions are automatically annotated with high confidence predictions from the layout detection model. 
This allows a layout dataset to be created more efficiently with only around 60% of the labeling budget.", "metadata": { "links": [ { @@ -2452,7 +2452,7 @@ { "type": "NarrativeText", "element_id": "c4ccf2cf2e7495668221cbe51534f90b", - "text": "After the training dataset is curated, LayoutParser supports di\ufb00erent modes for training the layout models. Fine-tuning can be used for training models on a small newly-labeled dataset by initializing the model with existing pre-trained weights. Training from scratch can be helpful when the source dataset and target are signi\ufb01cantly di\ufb00erent and a large training set is available. However, as suggested in Studer et al.\u2019s work[33], loading pre-trained weights on large-scale datasets like ImageNet [5], even from totally di\ufb00erent domains, can still boost model performance. Through the integrated API provided by LayoutParser, users can easily compare model performances on the benchmark datasets.", + "text": "After the training dataset is curated, LayoutParser supports different modes for training the layout models. Fine-tuning can be used for training models on a small newly-labeled dataset by initializing the model with existing pre-trained weights. Training from scratch can be helpful when the source dataset and target are significantly different and a large training set is available. However, as suggested in Studer et al.’s work[33], loading pre-trained weights on large-scale datasets like ImageNet [5], even from totally different domains, can still boost model performance. 
Through the integrated API provided by LayoutParser, users can easily compare model performances on the benchmark datasets.", "metadata": { "links": [ { @@ -2669,7 +2669,7 @@ { "type": "ListItem", "element_id": "ab543398222da25b3a9231929162d3a0", - "text": "LayoutParser: A Uni\ufb01ed Toolkit for DL-Based DIA", + "text": "LayoutParser: A Unified Toolkit for DL-Based DIA", "metadata": { "filetype": "application/pdf", "languages": [ @@ -2691,7 +2691,7 @@ { "type": "NarrativeText", "element_id": "4b9eddb71426681f2828832312457b67", - "text": "focuses on precision, e\ufb03ciency, and robustness. The target documents may have complicated structures, and may require training multiple layout detection models to achieve the optimal accuracy. Light-weight pipelines are built for relatively simple documents, with an emphasis on development ease, speed and \ufb02exibility. Ideally one only needs to use existing resources, and model training should be avoided. Through two exemplar projects, we show how practitioners in both academia and industry can easily build such pipelines using LayoutParser and extract high-quality structured document data for their downstream tasks. The source code for these projects will be publicly available in the LayoutParser community hub.", + "text": "focuses on precision, efficiency, and robustness. The target documents may have complicated structures, and may require training multiple layout detection models to achieve the optimal accuracy. Light-weight pipelines are built for relatively simple documents, with an emphasis on development ease, speed and flexibility. Ideally one only needs to use existing resources, and model training should be avoided. Through two exemplar projects, we show how practitioners in both academia and industry can easily build such pipelines using LayoutParser and extract high-quality structured document data for their downstream tasks. 
The source code for these projects will be publicly available in the LayoutParser community hub.", "metadata": { "filetype": "application/pdf", "languages": [ @@ -2757,7 +2757,7 @@ { "type": "NarrativeText", "element_id": "76dd07abeb9f4bbcb77152deb52c9dc0", - "text": "In this example, LayoutParser was used to develop a comprehensive pipeline, shown in Figure 5, to gener- ate high-quality structured data from historical Japanese \ufb01rm \ufb01nancial ta- bles with complicated layouts. The pipeline applies two layout models to identify di\ufb00erent levels of document structures and two customized OCR engines for optimized character recog- nition accuracy.", + "text": "In this example, LayoutParser was used to develop a comprehensive pipeline, shown in Figure 5, to gener- ate high-quality structured data from historical Japanese firm financial ta- bles with complicated layouts. The pipeline applies two layout models to identify different levels of document structures and two customized OCR engines for optimized character recog- nition accuracy.", "metadata": { "links": [ { @@ -2786,7 +2786,7 @@ { "type": "NarrativeText", "element_id": "42551c9b40827dcdc52055b4d25c6fc3", - "text": "As shown in Figure 4 (a), the document contains columns of text written vertically 15, a common style in Japanese. Due to scanning noise and archaic printing technology, the columns can be skewed or have vari- able widths, and hence cannot be eas- ily identi\ufb01ed via rule-based methods. Within each column, words are sepa- rated by white spaces of variable size, and the vertical positions of objects can be an indicator of their layout type.", + "text": "As shown in Figure 4 (a), the document contains columns of text written vertically 15, a common style in Japanese. Due to scanning noise and archaic printing technology, the columns can be skewed or have vari- able widths, and hence cannot be eas- ily identified via rule-based methods. 
Within each column, words are sepa- rated by white spaces of variable size, and the vertical positions of objects can be an indicator of their layout type.", "metadata": { "links": [ { @@ -2820,7 +2820,7 @@ { "type": "Image", "element_id": "f48a844114951222f6c96331efc683fb", - "text": "(spe peepee, \u2018Active Learning Layout Annotate Layout Dataset | + \u2018Annotation Toolkit \u00a5 a Deep Leaming Layout Model Training & Inference, \u00a5 ; Handy Data Structures & Post-processing El Apis for Layout Det a LAR ror tye eats) 4 Text Recognition | <\u2014\u2014 Default ane Customized \u00a5 ee Layout Structure Visualization & Export | <\u2014\u2014 | visualization & Storage The Japanese Document Helpful LayoutParser Digitization Pipeline Modules", + "text": "(spe peepee, ‘Active Learning Layout Annotate Layout Dataset | + ‘Annotation Toolkit ¥ a Deep Leaming Layout Model Training & Inference, ¥ ; Handy Data Structures & Post-processing El Apis for Layout Det a LAR ror tye eats) 4 Text Recognition | <—— Default ane Customized ¥ ee Layout Structure Visualization & Export | <—— | visualization & Storage The Japanese Document Helpful LayoutParser Digitization Pipeline Modules", "metadata": { "filetype": "application/pdf", "languages": [ @@ -2930,7 +2930,7 @@ { "type": "NarrativeText", "element_id": "7e1f7b138c864ed8b40cf0f3d38801ec", - "text": "structure, two object detection models have been trained to recognize individual columns and tokens, respectively. A small training set (400 images with approxi- mately 100 annotations each) is curated via the active learning based annotation tool [32] in LayoutParser. The models learn to identify both the categories and regions for each token or column via their distinct visual features. The layout data structure enables easy grouping of the tokens within each column, and rearranging columns to achieve the correct reading orders based on the horizontal position. 
Errors are identi\ufb01ed and recti\ufb01ed via checking the consistency of the model predictions. Therefore, though trained on a small dataset, the pipeline achieves a high level of layout detection accuracy: it achieves a 96.97 AP [19] score across 5 categories for the column detection model, and a 89.23 AP across 4 categories for the token detection model.", + "text": "structure, two object detection models have been trained to recognize individual columns and tokens, respectively. A small training set (400 images with approxi- mately 100 annotations each) is curated via the active learning based annotation tool [32] in LayoutParser. The models learn to identify both the categories and regions for each token or column via their distinct visual features. The layout data structure enables easy grouping of the tokens within each column, and rearranging columns to achieve the correct reading orders based on the horizontal position. Errors are identified and rectified via checking the consistency of the model predictions. Therefore, though trained on a small dataset, the pipeline achieves a high level of layout detection accuracy: it achieves a 96.97 AP [19] score across 5 categories for the column detection model, and a 89.23 AP across 4 categories for the token detection model.", "metadata": { "links": [ { @@ -2964,7 +2964,7 @@ { "type": "NarrativeText", "element_id": "dccaa93e7bae24dedf523dd39575dfbe", - "text": "A combination of character recognition methods is developed to tackle the unique challenges in this document. In our experiments, we found that irregular spacing between the tokens led to a low character recognition recall rate, whereas existing OCR models tend to perform better on densely-arranged texts. To overcome this challenge, we create a document reorganization algorithm that rearranges the text based on the token bounding boxes detected in the layout analysis step. 
Figure 4 (b) illustrates the generated image of dense text, which is sent to the OCR APIs as a whole to reduce the transaction costs. The \ufb02exible coordinate system in LayoutParser is used to transform the OCR results relative to their original positions on the page.", + "text": "A combination of character recognition methods is developed to tackle the unique challenges in this document. In our experiments, we found that irregular spacing between the tokens led to a low character recognition recall rate, whereas existing OCR models tend to perform better on densely-arranged texts. To overcome this challenge, we create a document reorganization algorithm that rearranges the text based on the token bounding boxes detected in the layout analysis step. Figure 4 (b) illustrates the generated image of dense text, which is sent to the OCR APIs as a whole to reduce the transaction costs. The flexible coordinate system in LayoutParser is used to transform the OCR results relative to their original positions on the page.", "metadata": { "links": [ { @@ -2993,7 +2993,7 @@ { "type": "NarrativeText", "element_id": "60c2e2147d0b0dbd576d51b71a95a2ef", - "text": "Additionally, it is common for historical documents to use unique fonts with di\ufb00erent glyphs, which signi\ufb01cantly degrades the accuracy of OCR models trained on modern texts. In this document, a special \ufb02at font is used for printing numbers and could not be detected by o\ufb00-the-shelf OCR engines. Using the highly \ufb02exible functionalities from LayoutParser, a pipeline approach is constructed that achieves a high recognition accuracy with minimal e\ufb00ort. As the characters have unique visual structures and are usually clustered together, we train the layout model to identify number regions with a dedicated category. Subsequently, LayoutParser crops images within these regions, and identi\ufb01es characters within them using a self-trained OCR model based on a CNN-RNN [6]. 
The model detects a total of 15 possible categories, and achieves a 0.98 Jaccard score16 and a 0.17 average Levinstein distances17 for token prediction on the test set.", + "text": "Additionally, it is common for historical documents to use unique fonts with different glyphs, which significantly degrades the accuracy of OCR models trained on modern texts. In this document, a special flat font is used for printing numbers and could not be detected by off-the-shelf OCR engines. Using the highly flexible functionalities from LayoutParser, a pipeline approach is constructed that achieves a high recognition accuracy with minimal effort. As the characters have unique visual structures and are usually clustered together, we train the layout model to identify number regions with a dedicated category. Subsequently, LayoutParser crops images within these regions, and identifies characters within them using a self-trained OCR model based on a CNN-RNN [6]. The model detects a total of 15 possible categories, and achieves a 0.98 Jaccard score16 and a 0.17 average Levinstein distances17 for token prediction on the test set.", "metadata": { "links": [ { @@ -3032,7 +3032,7 @@ { "type": "NarrativeText", "element_id": "de9e855638523c5f77ed4070813e37a3", - "text": "Overall, it is possible to create an intricate and highly accurate digitization pipeline for large-scale digitization using LayoutParser. The pipeline avoids specifying the complicated rules used in traditional methods, is straightforward to develop, and is robust to outliers. The DL models also generate \ufb01ne-grained results that enable creative approaches like page reorganization for OCR.", + "text": "Overall, it is possible to create an intricate and highly accurate digitization pipeline for large-scale digitization using LayoutParser. The pipeline avoids specifying the complicated rules used in traditional methods, is straightforward to develop, and is robust to outliers. 
The DL models also generate fine-grained results that enable creative approaches like page reorganization for OCR.", "metadata": { "filetype": "application/pdf", "languages": [ @@ -3098,7 +3098,7 @@ { "type": "ListItem", "element_id": "2b7101f39954d5301166b82906202ea9", - "text": "LayoutParser: A Uni\ufb01ed Toolkit for DL-Based DIA", + "text": "LayoutParser: A Unified Toolkit for DL-Based DIA", "metadata": { "filetype": "application/pdf", "languages": [ @@ -3142,7 +3142,7 @@ { "type": "FigureCaption", "element_id": "d35d253341e8b8d837f384ecd6ac410a", - "text": "Fig.6: This lightweight table detector can identify tables (outlined in red) and cells (shaded in blue) in di\ufb00erent locations on a page. In very few cases (d), it might generate minor error predictions, e.g, failing to capture the top text line of a table.", + "text": "Fig.6: This lightweight table detector can identify tables (outlined in red) and cells (shaded in blue) in different locations on a page. In very few cases (d), it might generate minor error predictions, e.g, failing to capture the top text line of a table.", "metadata": { "filetype": "application/pdf", "languages": [ @@ -3186,7 +3186,7 @@ { "type": "NarrativeText", "element_id": "445ad333fa3f7f85d2be634fbdeeb72a", - "text": "Detecting tables and parsing their structures (table extraction) are of central im- portance for many document digitization tasks. Many previous works [26, 30, 27] and tools 18 have been developed to identify and parse table structures. Yet they might require training complicated models from scratch, or are only applicable for born-digital PDF documents. In this section, we show how LayoutParser can help build a light-weight accurate visual table extractor for legal docket tables using the existing resources with minimal e\ufb00ort.", + "text": "Detecting tables and parsing their structures (table extraction) are of central im- portance for many document digitization tasks. 
Many previous works [26, 30, 27] and tools 18 have been developed to identify and parse table structures. Yet they might require training complicated models from scratch, or are only applicable for born-digital PDF documents. In this section, we show how LayoutParser can help build a light-weight accurate visual table extractor for legal docket tables using the existing resources with minimal effort.", "metadata": { "links": [ { @@ -3230,7 +3230,7 @@ { "type": "NarrativeText", "element_id": "923b62eb8550ec49cf6d3f2e6bac7ec8", - "text": "The extractor uses a pre-trained layout detection model for identifying the table regions and some simple rules for pairing the rows and the columns in the PDF image. Mask R-CNN [12] trained on the PubLayNet dataset [38] from the LayoutParser Model Zoo can be used for detecting table regions. By \ufb01ltering out model predictions of low con\ufb01dence and removing overlapping predictions, LayoutParser can identify the tabular regions on each page, which signi\ufb01cantly simpli\ufb01es the subsequent steps. By applying the line detection functions within the tabular segments, provided in the utility module from LayoutParser, the pipeline can identify the three distinct columns in the tables. A row clustering method is then applied via analyzing the y coordinates of token bounding boxes in the left-most column, which are obtained from the OCR engines. A non-maximal suppression algorithm is used to remove duplicated rows with extremely small gaps. Shown in Figure 6, the built pipeline can detect tables at di\ufb00erent positions on a page accurately. Continued tables from di\ufb00erent pages are concatenated, and a structured table representation has been easily created.", + "text": "The extractor uses a pre-trained layout detection model for identifying the table regions and some simple rules for pairing the rows and the columns in the PDF image. 
Mask R-CNN [12] trained on the PubLayNet dataset [38] from the LayoutParser Model Zoo can be used for detecting table regions. By filtering out model predictions of low confidence and removing overlapping predictions, LayoutParser can identify the tabular regions on each page, which significantly simplifies the subsequent steps. By applying the line detection functions within the tabular segments, provided in the utility module from LayoutParser, the pipeline can identify the three distinct columns in the tables. A row clustering method is then applied via analyzing the y coordinates of token bounding boxes in the left-most column, which are obtained from the OCR engines. A non-maximal suppression algorithm is used to remove duplicated rows with extremely small gaps. Shown in Figure 6, the built pipeline can detect tables at different positions on a page accurately. Continued tables from different pages are concatenated, and a structured table representation has been easily created.", "metadata": { "links": [ { @@ -3335,7 +3335,7 @@ { "type": "NarrativeText", "element_id": "e79cef57c86050aa5fc74e5cd3923197", - "text": "LayoutParser provides a comprehensive toolkit for deep learning-based document image analysis. The o\ufb00-the-shelf library is easy to install, and can be used to build \ufb02exible and accurate pipelines for processing documents with complicated structures. It also supports high-level customization and enables easy labeling and training of DL models on unique document image datasets. The LayoutParser community platform facilitates sharing DL models and DIA pipelines, inviting discussion and promoting code reproducibility and reusability. 
The LayoutParser team is committed to keeping the library updated continuously and bringing the most recent advances in DL-based DIA, such as multi-modal document modeling [37, 36, 9] (an upcoming priority), to a diverse audience of end-users.", + "text": "LayoutParser provides a comprehensive toolkit for deep learning-based document image analysis. The off-the-shelf library is easy to install, and can be used to build flexible and accurate pipelines for processing documents with complicated structures. It also supports high-level customization and enables easy labeling and training of DL models on unique document image datasets. The LayoutParser community platform facilitates sharing DL models and DIA pipelines, inviting discussion and promoting code reproducibility and reusability. The LayoutParser team is committed to keeping the library updated continuously and bringing the most recent advances in DL-based DIA, such as multi-modal document modeling [37, 36, 9] (an upcoming priority), to a diverse audience of end-users.", "metadata": { "links": [ { @@ -3418,7 +3418,7 @@ { "type": "ListItem", "element_id": "85e09a5617e58a3a78b22fd12eb29eaf", - "text": "[1] Abadi, M., Agarwal, A., Barham, P., Brevdo, E., Chen, Z., Citro, C., Corrado, G.S., Davis, A., Dean, J., Devin, M., Ghemawat, S., Goodfellow, I., Harp, A., Irving, G., Isard, M., Jia, Y., Jozefowicz, R., Kaiser, L., Kudlur, M., Levenberg, J., Man\u00b4e, D., Monga, R., Moore, S., Murray, D., Olah, C., Schuster, M., Shlens, J., Steiner, B., Sutskever, I., Talwar, K., Tucker, P., Vanhoucke, V., Vasudevan, V., Vi\u00b4egas, F., Vinyals, O., Warden, P., Wattenberg, M., Wicke, M., Yu, Y., Zheng, X.: TensorFlow: Large-scale machine learning on heterogeneous systems (2015), https://www.tensorflow.org/, software available from tensor\ufb02ow.org", + "text": "[1] Abadi, M., Agarwal, A., Barham, P., Brevdo, E., Chen, Z., Citro, C., Corrado, G.S., Davis, A., Dean, J., Devin, M., Ghemawat, S., Goodfellow, I., Harp, A., 
Irving, G., Isard, M., Jia, Y., Jozefowicz, R., Kaiser, L., Kudlur, M., Levenberg, J., Man´e, D., Monga, R., Moore, S., Murray, D., Olah, C., Schuster, M., Shlens, J., Steiner, B., Sutskever, I., Talwar, K., Tucker, P., Vanhoucke, V., Vasudevan, V., Vi´egas, F., Vinyals, O., Warden, P., Wattenberg, M., Wicke, M., Yu, Y., Zheng, X.: TensorFlow: Large-scale machine learning on heterogeneous systems (2015), https://www.tensorflow.org/, software available from tensorflow.org", "metadata": { "links": [ { @@ -3447,7 +3447,7 @@ { "type": "ListItem", "element_id": "ad466edc2a12c9be4bf951fd8b5bf818", - "text": "[2] Alberti, M., Pondenkandath, V., W\u00a8ursch, M., Ingold, R., Liwicki, M.: Deepdiva: a highly-functional python framework for reproducible experiments. In: 2018 16th International Conference on Frontiers in Handwriting Recognition (ICFHR). pp. 423\u2013428. IEEE (2018)", + "text": "[2] Alberti, M., Pondenkandath, V., W¨ursch, M., Ingold, R., Liwicki, M.: Deepdiva: a highly-functional python framework for reproducible experiments. In: 2018 16th International Conference on Frontiers in Handwriting Recognition (ICFHR). pp. 423–428. IEEE (2018)", "metadata": { "filetype": "application/pdf", "languages": [ @@ -3469,7 +3469,7 @@ { "type": "ListItem", "element_id": "217777f3d44620afddc1e27553e81a66", - "text": "[3] Antonacopoulos, A., Bridson, D., Papadopoulos, C., Pletschacher, S.: A realistic dataset for performance evaluation of document layout analysis. In: 2009 10th International Conference on Document Analysis and Recognition. pp. 296\u2013300. IEEE (2009)", + "text": "[3] Antonacopoulos, A., Bridson, D., Papadopoulos, C., Pletschacher, S.: A realistic dataset for performance evaluation of document layout analysis. In: 2009 10th International Conference on Document Analysis and Recognition. pp. 296–300. 
IEEE (2009)", "metadata": { "filetype": "application/pdf", "languages": [ @@ -3491,7 +3491,7 @@ { "type": "ListItem", "element_id": "292dd088dc6a174159395e31be7755d7", - "text": "[4] Baek, Y., Lee, B., Han, D., Yun, S., Lee, H.: Character region awareness for text detection. In: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition. pp. 9365\u20139374 (2019)", + "text": "[4] Baek, Y., Lee, B., Han, D., Yun, S., Lee, H.: Character region awareness for text detection. In: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition. pp. 9365–9374 (2019)", "metadata": { "filetype": "application/pdf", "languages": [ @@ -3535,7 +3535,7 @@ { "type": "ListItem", "element_id": "4e93c51c89970349aa9e0a42cb330c4b", - "text": "[6] Deng, Y., Kanervisto, A., Ling, J., Rush, A.M.: Image-to-markup generation with coarse-to-\ufb01ne attention. In: International Conference on Machine Learning. pp. 980\u2013989. PMLR (2017)", + "text": "[6] Deng, Y., Kanervisto, A., Ling, J., Rush, A.M.: Image-to-markup generation with coarse-to-fine attention. In: International Conference on Machine Learning. pp. 980–989. PMLR (2017)", "metadata": { "filetype": "application/pdf", "languages": [ @@ -3557,7 +3557,7 @@ { "type": "ListItem", "element_id": "8cfd166d282469f765423faae44271e2", - "text": "[7] Ganin, Y., Lempitsky, V.: Unsupervised domain adaptation by backpropagation. In: International conference on machine learning. pp. 1180\u20131189. PMLR (2015)", + "text": "[7] Ganin, Y., Lempitsky, V.: Unsupervised domain adaptation by backpropagation. In: International conference on machine learning. pp. 1180–1189. 
PMLR (2015)", "metadata": { "filetype": "application/pdf", "languages": [ @@ -3579,7 +3579,7 @@ { "type": "ListItem", "element_id": "8bce49aab693aad97676011688f3f6f3", - "text": "LayoutParser: A Uni\ufb01ed Toolkit for DL-Based DIA", + "text": "LayoutParser: A Unified Toolkit for DL-Based DIA", "metadata": { "filetype": "application/pdf", "languages": [ @@ -3645,7 +3645,7 @@ { "type": "ListItem", "element_id": "95bc71fb3542f420dfa50e22eb8c734f", - "text": "[10] Graves, A., Fern\u00b4andez, S., Gomez, F., Schmidhuber, J.: Connectionist temporal classi\ufb01cation: labelling unsegmented sequence data with recurrent neural networks. In: Proceedings of the 23rd international conference on Machine learning. pp. 369\u2013376 (2006)", + "text": "[10] Graves, A., Fern´andez, S., Gomez, F., Schmidhuber, J.: Connectionist temporal classification: labelling unsegmented sequence data with recurrent neural networks. In: Proceedings of the 23rd international conference on Machine learning. pp. 369–376 (2006)", "metadata": { "filetype": "application/pdf", "languages": [ @@ -3667,7 +3667,7 @@ { "type": "ListItem", "element_id": "3fab75481d8e6d389ea6034e18f54e00", - "text": "[11] Harley, A.W., Ufkes, A., Derpanis, K.G.: Evaluation of deep convolutional nets for document image classi\ufb01cation and retrieval. In: 2015 13th International Conference on Document Analysis and Recognition (ICDAR). pp. 991\u2013995. IEEE (2015)", + "text": "[11] Harley, A.W., Ufkes, A., Derpanis, K.G.: Evaluation of deep convolutional nets for document image classification and retrieval. In: 2015 13th International Conference on Document Analysis and Recognition (ICDAR). pp. 991–995. IEEE (2015)", "metadata": { "filetype": "application/pdf", "languages": [ @@ -3689,7 +3689,7 @@ { "type": "ListItem", "element_id": "8cd8821b71e4bda1a77f6a114ff54f50", - "text": "[12] He, K., Gkioxari, G., Doll\u00b4ar, P., Girshick, R.: Mask r-cnn. In: Proceedings of the IEEE international conference on computer vision. pp. 
2961\u20132969 (2017)", + "text": "[12] He, K., Gkioxari, G., Doll´ar, P., Girshick, R.: Mask r-cnn. In: Proceedings of the IEEE international conference on computer vision. pp. 2961–2969 (2017)", "metadata": { "filetype": "application/pdf", "languages": [ @@ -3711,7 +3711,7 @@ { "type": "ListItem", "element_id": "02c0a0c6c60503798f3894fe244c237d", - "text": "[13] He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: Proceedings of the IEEE conference on computer vision and pattern recognition. pp. 770\u2013778 (2016)", + "text": "[13] He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: Proceedings of the IEEE conference on computer vision and pattern recognition. pp. 770–778 (2016)", "metadata": { "filetype": "application/pdf", "languages": [ @@ -3755,7 +3755,7 @@ { "type": "ListItem", "element_id": "bd2e9f3795d8492cadde716193f62aba", - "text": "[15] Lamiroy, B., Lopresti, D.: An open architecture for end-to-end document analysis benchmarking. In: 2011 International Conference on Document Analysis and Recognition. pp. 42\u201347. IEEE (2011)", + "text": "[15] Lamiroy, B., Lopresti, D.: An open architecture for end-to-end document analysis benchmarking. In: 2011 International Conference on Document Analysis and Recognition. pp. 42–47. IEEE (2011)", "metadata": { "filetype": "application/pdf", "languages": [ @@ -3777,7 +3777,7 @@ { "type": "ListItem", "element_id": "07cef8a161dd1c3f0895c605844d678e", - "text": "[16] Lee, B.C., Weld, D.S.: Newspaper navigator: Open faceted search for 1.5 million images. In: Adjunct Publication of the 33rd Annual ACM Sym- posium on User Interface Software and Technology. p. 120\u2013122. UIST \u201920 Adjunct, Association for Computing Machinery, New York, NY, USA (2020). https://doi.org/10.1145/3379350.3416143, https://doi-org.offcampus. 
lib.washington.edu/10.1145/3379350.3416143", + "text": "[16] Lee, B.C., Weld, D.S.: Newspaper navigator: Open faceted search for 1.5 million images. In: Adjunct Publication of the 33rd Annual ACM Sym- posium on User Interface Software and Technology. p. 120–122. UIST ’20 Adjunct, Association for Computing Machinery, New York, NY, USA (2020). https://doi.org/10.1145/3379350.3416143, https://doi-org.offcampus. lib.washington.edu/10.1145/3379350.3416143", "metadata": { "links": [ { @@ -3816,7 +3816,7 @@ { "type": "ListItem", "element_id": "90ad04faa055039bfd37c1a851878048", - "text": "[17] Lee, B.C.G., Mears, J., Jakeway, E., Ferriter, M., Adams, C., Yarasavage, N., Thomas, D., Zwaard, K., Weld, D.S.: The Newspaper Navigator Dataset: Extracting Headlines and Visual Content from 16 Million Historic Newspaper Pages in Chronicling America, p. 3055\u20133062. Association for Computing Machinery, New York, NY, USA (2020), https://doi.org/10.1145/3340531.3412767", + "text": "[17] Lee, B.C.G., Mears, J., Jakeway, E., Ferriter, M., Adams, C., Yarasavage, N., Thomas, D., Zwaard, K., Weld, D.S.: The Newspaper Navigator Dataset: Extracting Headlines and Visual Content from 16 Million Historic Newspaper Pages in Chronicling America, p. 3055–3062. Association for Computing Machinery, New York, NY, USA (2020), https://doi.org/10.1145/3340531.3412767", "metadata": { "filetype": "application/pdf", "languages": [ @@ -3860,7 +3860,7 @@ { "type": "ListItem", "element_id": "b5e16aae3d43919bb5899fade72c0550", - "text": "[19] Lin, T.Y., Maire, M., Belongie, S., Hays, J., Perona, P., Ramanan, D., Doll\u00b4ar, P., Zitnick, C.L.: Microsoft coco: Common objects in context. In: European conference on computer vision. pp. 740\u2013755. Springer (2014)", + "text": "[19] Lin, T.Y., Maire, M., Belongie, S., Hays, J., Perona, P., Ramanan, D., Doll´ar, P., Zitnick, C.L.: Microsoft coco: Common objects in context. In: European conference on computer vision. pp. 740–755. 
Springer (2014)", "metadata": { "filetype": "application/pdf", "languages": [ @@ -3882,7 +3882,7 @@ { "type": "ListItem", "element_id": "8344e54a6acb25643c83b5ea96c5c593", - "text": "[20] Long, J., Shelhamer, E., Darrell, T.: Fully convolutional networks for semantic segmentation. In: Proceedings of the IEEE conference on computer vision and pattern recognition. pp. 3431\u20133440 (2015)", + "text": "[20] Long, J., Shelhamer, E., Darrell, T.: Fully convolutional networks for semantic segmentation. In: Proceedings of the IEEE conference on computer vision and pattern recognition. pp. 3431–3440 (2015)", "metadata": { "filetype": "application/pdf", "languages": [ @@ -3904,7 +3904,7 @@ { "type": "ListItem", "element_id": "9476b030857c32e55a638928df6d01e8", - "text": "[21] Neudecker, C., Schlarb, S., Dogan, Z.M., Missier, P., Su\ufb01, S., Williams, A., Wolsten- croft, K.: An experimental work\ufb02ow development platform for historical document digitisation and analysis. In: Proceedings of the 2011 workshop on historical document imaging and processing. pp. 161\u2013168 (2011)", + "text": "[21] Neudecker, C., Schlarb, S., Dogan, Z.M., Missier, P., Sufi, S., Williams, A., Wolsten- croft, K.: An experimental workflow development platform for historical document digitisation and analysis. In: Proceedings of the 2011 workshop on historical document imaging and processing. pp. 161–168 (2011)", "metadata": { "filetype": "application/pdf", "languages": [ @@ -3926,7 +3926,7 @@ { "type": "ListItem", "element_id": "4640c3f33351b994165071b6d872ef56", - "text": "[22] Oliveira, S.A., Seguin, B., Kaplan, F.: dhsegment: A generic deep-learning approach for document segmentation. In: 2018 16th International Conference on Frontiers in Handwriting Recognition (ICFHR). pp. 7\u201312. IEEE (2018)", + "text": "[22] Oliveira, S.A., Seguin, B., Kaplan, F.: dhsegment: A generic deep-learning approach for document segmentation. 
In: 2018 16th International Conference on Frontiers in Handwriting Recognition (ICFHR). pp. 7–12. IEEE (2018)", "metadata": { "filetype": "application/pdf", "languages": [ @@ -3970,7 +3970,7 @@ { "type": "ListItem", "element_id": "048415c6e5fc7bdd5466bf9c877b4a14", - "text": "[23] Paszke, A., Gross, S., Chintala, S., Chanan, G., Yang, E., DeVito, Z., Lin, Z., Desmaison, A., Antiga, L., Lerer, A.: Automatic di\ufb00erentiation in pytorch (2017)", + "text": "[23] Paszke, A., Gross, S., Chintala, S., Chanan, G., Yang, E., DeVito, Z., Lin, Z., Desmaison, A., Antiga, L., Lerer, A.: Automatic differentiation in pytorch (2017)", "metadata": { "filetype": "application/pdf", "languages": [ @@ -4014,7 +4014,7 @@ { "type": "ListItem", "element_id": "a2f34eceb4f6036f105c6319de5450d1", - "text": "[25] Pletschacher, S., Antonacopoulos, A.: The page (page analysis and ground-truth elements) format framework. In: 2010 20th International Conference on Pattern Recognition. pp. 257\u2013260. IEEE (2010)", + "text": "[25] Pletschacher, S., Antonacopoulos, A.: The page (page analysis and ground-truth elements) format framework. In: 2010 20th International Conference on Pattern Recognition. pp. 257–260. IEEE (2010)", "metadata": { "filetype": "application/pdf", "languages": [ @@ -4036,7 +4036,7 @@ { "type": "ListItem", "element_id": "c81432ac5c76b82c1ccd93d0a3ee15b1", - "text": "[26] Prasad, D., Gadpal, A., Kapadni, K., Visave, M., Sultanpure, K.: Cascadetabnet: An approach for end to end table detection and structure recognition from image- based documents. In: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition Workshops. pp. 572\u2013573 (2020)", + "text": "[26] Prasad, D., Gadpal, A., Kapadni, K., Visave, M., Sultanpure, K.: Cascadetabnet: An approach for end to end table detection and structure recognition from image- based documents. In: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition Workshops. pp. 
572–573 (2020)", "metadata": { "filetype": "application/pdf", "languages": [ @@ -4058,7 +4058,7 @@ { "type": "ListItem", "element_id": "0f5cebf6a7661981062a59f24e0b2a3a", - "text": "[27] Qasim, S.R., Mahmood, H., Shafait, F.: Rethinking table recognition using graph neural networks. In: 2019 International Conference on Document Analysis and Recognition (ICDAR). pp. 142\u2013147. IEEE (2019)", + "text": "[27] Qasim, S.R., Mahmood, H., Shafait, F.: Rethinking table recognition using graph neural networks. In: 2019 International Conference on Document Analysis and Recognition (ICDAR). pp. 142–147. IEEE (2019)", "metadata": { "filetype": "application/pdf", "languages": [ @@ -4080,7 +4080,7 @@ { "type": "ListItem", "element_id": "d02327f415141694d5853b57ac0f9e3f", - "text": "[28] Ren, S., He, K., Girshick, R., Sun, J.: Faster r-cnn: Towards real-time object detection with region proposal networks. In: Advances in neural information processing systems. pp. 91\u201399 (2015)", + "text": "[28] Ren, S., He, K., Girshick, R., Sun, J.: Faster r-cnn: Towards real-time object detection with region proposal networks. In: Advances in neural information processing systems. pp. 91–99 (2015)", "metadata": { "filetype": "application/pdf", "languages": [ @@ -4102,7 +4102,7 @@ { "type": "ListItem", "element_id": "d0529ef231eeac2e8ae2083dee416210", - "text": "[29] Scarselli, F., Gori, M., Tsoi, A.C., Hagenbuchner, M., Monfardini, G.: The graph neural network model. IEEE transactions on neural networks 20(1), 61\u201380 (2008)", + "text": "[29] Scarselli, F., Gori, M., Tsoi, A.C., Hagenbuchner, M., Monfardini, G.: The graph neural network model. 
IEEE transactions on neural networks 20(1), 61–80 (2008)", "metadata": { "filetype": "application/pdf", "languages": [ @@ -4124,7 +4124,7 @@ { "type": "ListItem", "element_id": "98fce7a2720ed7eda87a02659071b121", - "text": "[30] Schreiber, S., Agne, S., Wolf, I., Dengel, A., Ahmed, S.: Deepdesrt: Deep learning for detection and structure recognition of tables in document images. In: 2017 14th IAPR international conference on document analysis and recognition (ICDAR). vol. 1, pp. 1162\u20131167. IEEE (2017)", + "text": "[30] Schreiber, S., Agne, S., Wolf, I., Dengel, A., Ahmed, S.: Deepdesrt: Deep learning for detection and structure recognition of tables in document images. In: 2017 14th IAPR international conference on document analysis and recognition (ICDAR). vol. 1, pp. 1162–1167. IEEE (2017)", "metadata": { "filetype": "application/pdf", "languages": [ @@ -4146,7 +4146,7 @@ { "type": "ListItem", "element_id": "e3146a202c282ecab0d87f59d3307983", - "text": "[31] Shen, Z., Zhang, K., Dell, M.: A large dataset of historical japanese documents with complex layouts. In: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition Workshops. pp. 548\u2013549 (2020)", + "text": "[31] Shen, Z., Zhang, K., Dell, M.: A large dataset of historical japanese documents with complex layouts. In: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition Workshops. pp. 548–549 (2020)", "metadata": { "filetype": "application/pdf", "languages": [ @@ -4190,7 +4190,7 @@ { "type": "ListItem", "element_id": "7937fc115bcbbc8c08640587fa5ed827", - "text": "[33] Studer, L., Alberti, M., Pondenkandath, V., Goktepe, P., Kolonko, T., Fischer, A., Liwicki, M., Ingold, R.: A comprehensive study of imagenet pre-training for historical document image analysis. In: 2019 International Conference on Document Analysis and Recognition (ICDAR). pp. 720\u2013725. 
IEEE (2019)", + "text": "[33] Studer, L., Alberti, M., Pondenkandath, V., Goktepe, P., Kolonko, T., Fischer, A., Liwicki, M., Ingold, R.: A comprehensive study of imagenet pre-training for historical document image analysis. In: 2019 International Conference on Document Analysis and Recognition (ICDAR). pp. 720–725. IEEE (2019)", "metadata": { "filetype": "application/pdf", "languages": [ @@ -4212,7 +4212,7 @@ { "type": "ListItem", "element_id": "881f67b82dccc13eaf96e912750c0318", - "text": "[34] Wolf, T., Debut, L., Sanh, V., Chaumond, J., Delangue, C., Moi, A., Cistac, P., Rault, T., Louf, R., Funtowicz, M., et al.: Huggingface\u2019s transformers: State-of- the-art natural language processing. arXiv preprint arXiv:1910.03771 (2019)", + "text": "[34] Wolf, T., Debut, L., Sanh, V., Chaumond, J., Delangue, C., Moi, A., Cistac, P., Rault, T., Louf, R., Funtowicz, M., et al.: Huggingface’s transformers: State-of- the-art natural language processing. arXiv preprint arXiv:1910.03771 (2019)", "metadata": { "filetype": "application/pdf", "languages": [ @@ -4300,7 +4300,7 @@ { "type": "ListItem", "element_id": "3ac304a6df305ec0a0bb9079795b6c2e", - "text": "[38] Zhong, X., Tang, J., Yepes, A.J.: Publaynet: largest dataset ever for doc- ument layout analysis. In: 2019 International Conference on Document Analysis and Recognition (ICDAR). pp. 1015\u20131022. IEEE (Sep 2019). https://doi.org/10.1109/ICDAR.2019.00166", + "text": "[38] Zhong, X., Tang, J., Yepes, A.J.: Publaynet: largest dataset ever for doc- ument layout analysis. In: 2019 International Conference on Document Analysis and Recognition (ICDAR). pp. 1015–1022. IEEE (Sep 2019). 
https://doi.org/10.1109/ICDAR.2019.00166", "metadata": { "filetype": "application/pdf", "languages": [ diff --git a/test_unstructured_ingest/expected-structured-output/local-single-file/UDHR_first_article_all.txt.json b/test_unstructured_ingest/expected-structured-output/local-single-file/UDHR_first_article_all.txt.json index 6ef9f4eb4b..df9b68e769 100644 --- a/test_unstructured_ingest/expected-structured-output/local-single-file/UDHR_first_article_all.txt.json +++ b/test_unstructured_ingest/expected-structured-output/local-single-file/UDHR_first_article_all.txt.json @@ -23,7 +23,7 @@ { "type": "UncategorizedText", "element_id": "f84bbc479d5bebf6b98c016e14d666d1", - "text": "\u00a9 1996 \u2013 2009 The Office of the High Commissioner for Human Rights", + "text": "© 1996 – 2009 The Office of the High Commissioner for Human Rights", "metadata": { "languages": [ "eng" @@ -108,7 +108,7 @@ { "type": "Title", "element_id": "84ce1bd66b09ce990ee385a04144822e", - "text": "\u662f\u4eba\u90fd\u751f\u800c\u81ea\u7531\uff0c\u5728\u5c0a\u4e25\u548c\u6743\u5229\u4e0a\u4e00\u5f8b\u5e73\u7b49\u3002\u4ed6\u4eec\u8d4b\u6709\u7406\u6027\u548c\u826f\u5fc3\uff0c\u5e76\u5e94\u4ee5\u5f1f\u5144\u5173\u7cfb\u7684\u7cbe\u795e\u76f8\u5bf9\u5f85\u3002", + "text": "是人都生而自由,在尊严和权利上一律平等。他们赋有理性和良心,并应以弟兄关系的精神相对待。", "metadata": { "languages": [ "zho" @@ -266,7 +266,7 @@ { "type": "NarrativeText", "element_id": "dfabb35b82a82e16d7cb50d4de138e6f", - "text": "(Yeonbyeon) \uc0ac\ub78c\ub4e4\uc774 \uc774 \uc138\uacc4\ub85c \uc624\ub2e4\uac00 \ubaa8\ub450 \uc790\uc720\ud558\uace0, \uc874\uc5c4\uacfc \uad8c\ub9ac\uc774 \ud3c9\ub3d9\uc73c\ub85c \uc788\ub294\ub2e4, \uadf8\ub4e4 \ub9ac\uc131\uacfc \uc591\uc2ec\uc774 \uc788\ub208\uace0, \ud615\uc81c\uc758 \uc815\uc2e0\uc73c\ub85c \uc0c1\ud638\ub85c \uce58\ub8cc\ud558 \uc18c.", + "text": "(Yeonbyeon) 사람들이 이 세계로 오다가 모두 자유하고, 존엄과 권리이 평동으로 있는다, 그들 리성과 양심이 있눈고, 형제의 정신으로 상호로 치료하 소.", "metadata": { "languages": [ "kor" @@ -287,7 +287,7 @@ { "type": "NarrativeText", 
"element_id": "1f41b7646ca8aebc36e8f5ec392481fb", - "text": "Abkhaz \u0414\u0430\u0440\u0431\u0430\u043d\u0437\u0430\u0430\u043b\u0430\u043a \u0430\u0443\u0430\u04a9\u044b \u0434\u0448\u043e\u0443\u043f \u0438\u0445\u044b \u0434\u0430\u049b\u04d9\u0438\u04ad\u043d\u044b. \u0410\u0443\u0430\u0430 \u0437\u0435\u0433\u044c \u0437\u0438\u043d\u043b\u0435\u0438 \u043f\u0430\u0442\u0443\u043b\u0435\u0438 \u0435\u0438\u049f\u0430\u0440\u043e\u0443\u043f. \u0423\u0440\u04ad \u0438\u0440\u044b\u043c\u043e\u0443\u043f \u0430\u0445\u0448\u044b\u04a9\u0438 \u0430\u043b\u0430\u043c\u044b\u0441\u0438, \u0434\u0430\u0440\u0430 \u0434\u0430\u0440\u0430\u0433\u044c \u0430\u0435\u0448\u044c\u0435\u0438 \u0430\u0435\u0448\u044c\u0435\u0438 \u0440\u0435\u0438\u04a7\u0448 \u0435\u0438\u0437\u044b\u049f\u0430\u0437\u0430\u0440\u043e\u0443\u043f.", + "text": "Abkhaz Дарбанзаалак ауаҩы дшоуп ихы дақәиҭны. Ауаа зегь зинлеи патулеи еиҟароуп. Урҭ ирымоуп ахшыҩи аламыси, дара дарагь аешьеи аешьеи реиҧш еизыҟазароуп.", "metadata": { "languages": [ "rus" @@ -372,7 +372,7 @@ { "type": "NarrativeText", "element_id": "7691e5f9dd37d6bc38044534196c1e9f", - "text": "Adyghe \u0426\u04cf\u044b\u0444 \u043f\u0441\u0442\u044d\u0443\u0440\u0438 \u0448\u044a\u0445\u044c\u044d\u0444\u0438\u0442\u044d\u0443, \u044f\u043b\u044a\u044b\u0442\u044d\u043d\u044b\u0433\u044a\u044d\u0440\u044d \u044f\u0444\u044d\u0448\u044a\u0443\u0430\u0448\u044d\u0445\u044d\u043c\u0440\u044d\u043a\u04cf\u044d \u0437\u044d\u0444\u044d\u0434\u044d\u0443 \u043a\u044a\u0430\u043b\u044a\u0444\u044b. 
\u0410\u043a\u044a\u044b\u043b\u0440\u044d \u0437\u044d\u0445\u044d\u0448\u04cf\u044b\u043a\u04cf \u0433\u044a\u0443\u0430\u0437\u044d\u0440\u044d \u044f\u04cf\u044d\u0448\u044a\u044b, \u0437\u044b\u0440 \u0437\u044b\u043c \u0437\u044d\u043a\u044a\u043e\u0448 \u0437\u044d\u0445\u0430\u0448\u0406\u044d \u0430\u0437\u0444\u0430\u0433\u0443 \u0434\u044d\u043b\u044a\u044d\u0443 \u0437\u044d\u0444\u044b\u0449\u044b\u0442\u044b\u043d\u0445\u044d \u0444\u0430\u0435.", + "text": "Adyghe Цӏыф пстэури шъхьэфитэу, ялъытэныгъэрэ яфэшъуашэхэмрэкӏэ зэфэдэу къалъфы. Акъылрэ зэхэшӏыкӏ гъуазэрэ яӏэшъы, зыр зым зэкъош зэхашІэ азфагу дэлъэу зэфыщытынхэ фае.", "metadata": { "languages": [ "rus" @@ -460,7 +460,7 @@ { "type": "NarrativeText", "element_id": "20509f92f090bb4ecf694ea5b01d0921", - "text": "Aja Agbet\u0254wo ple\u014bu van\u0254 gb\u025bm\u025b ko vovo\u0256eka gbesw\u025bgbesw\u025b, s\u0254to am\u025bnyinyi ko ac\u025bwo gom\u025b; wo x\u0254n\u0254 susunywin ko jim\u025bnywi so esexwe. Wo \u0256o a w\u025b n\u0254vi \u0256a\u0256a wowo n\u0254n\u0254wo gb\u0254.", + "text": "Aja Agbetɔwo pleŋu vanɔ gbɛmɛ ko vovoɖeka gbeswɛgbeswɛ, sɔto amɛnyinyi ko acɛwo gomɛ; wo xɔnɔ susunywin ko jimɛnywi so esexwe. Wo ɖo a wɛ nɔvi ɖaɖa wowo nɔnɔwo gbɔ.", "metadata": { "languages": [ "afr", @@ -483,7 +483,7 @@ { "type": "NarrativeText", "element_id": "f6e32446c48b0755dfcf243a8142d613", - "text": "Albanian, Tosk T\u00eb gjith\u00eb njer\u00ebzit lindin t\u00eb lir\u00eb dhe t\u00eb barabart\u00eb n\u00eb dinjitet dhe n\u00eb t\u00eb drejta. Ata kan\u00eb arsye dhe nd\u00ebrgjegje dhe duhet t\u00eb sillen ndaj nj\u00ebri tjetrit me frym\u00eb v\u00ebllaz\u00ebrimi.", + "text": "Albanian, Tosk Të gjithë njerëzit lindin të lirë dhe të barabartë në dinjitet dhe në të drejta. 
Ata kanë arsye dhe ndërgjegje dhe duhet të sillen ndaj njëri tjetrit me frymë vëllazërimi.", "metadata": { "languages": [ "sqi" @@ -504,7 +504,7 @@ { "type": "NarrativeText", "element_id": "9a69378bfb3e4825a781de59826eff73", - "text": "Alemannisch (Elsassisch) \u00c0lli Mensche k\u00f9mme m\u00ect de gliche W\u00ecrde \u00f9n Rachte \u00f9ff d\u2019Walt. Sie h\u00e0n \u00e0lli Vern\u00f9nft \u00f9n Gew\u00ecsse \u00f9n selle m\u00ect Br\u00ecederlichkeit de \u00e0ndere gejjen\u00ecwwer h\u00e0ndle.", + "text": "Alemannisch (Elsassisch) Àlli Mensche kùmme mìt de gliche Wìrde ùn Rachte ùff d’Walt. Sie hàn àlli Vernùnft ùn Gewìsse ùn selle mìt Brìederlichkeit de àndere gejjenìwwer hàndle.", "metadata": { "languages": [ "deu" @@ -525,7 +525,7 @@ { "type": "NarrativeText", "element_id": "d5de29db1ca19f8ac33afb7049462513", - "text": "Altai, Southern \u041e\u043d\u0447\u043e \u0443\u043b\u0443\u0441 \u0430\u043a\u2010\u0458\u0430\u0440\u044b\u043a\u043a\u0430 \u0458\u0430\u0439\u044b\u043c \u043b\u0430 \u0442\u0435\u04a5\u2010\u0442\u0430\u0439 \u0442\u0430\u043f\u2010\u044d\u0440\u0438\u043a\u0442\u04f1 \u0442\u0443\u0443\u043b\u0430\u0442. \u041e\u043b\u043e\u0440 \u0441\u0430\u043d\u0430\u0430\u0443\u043a\u0430\u0430\u043b\u0443 \u043b\u0430 \u0447\u0435\u043a \u043a\u04f1\u04f1\u043d\u2010\u0442\u0430\u043f\u0442\u0443 \u0431\u043e\u043b\u0443\u043f \u0431\u04f1\u0442\u043a\u0435\u043d \u043b\u0435 \u0431\u043e\u0439\u2010\u0431\u043e\u0439\u044b\u043d \u043a\u0430\u0440\u044b\u043d\u0434\u0430\u0448 \u043a\u0438\u0440\u0435\u0437\u0438 \u043a\u04e7\u0440\u04e7\u0440 \u043b\u04e7 \u0458\u04f1\u0440\u0435\u0440 \u0443\u0447\u0443\u0440\u043b\u0443.", + "text": "Altai, Southern Ончо улус ак‐јарыкка јайым ла теҥ‐тай тап‐эриктӱ туулат. 
Олор санааукаалу ла чек кӱӱн‐тапту болуп бӱткен ле бой‐бойын карындаш кирези кӧрӧр лӧ јӱрер учурлу.", "metadata": { "languages": [ "rus", @@ -570,7 +570,7 @@ { "type": "NarrativeText", "element_id": "d0963c28613cf0e49ccc8378af7f29b7", - "text": "Amarakaeri Aya'da aratbut katepi' eka'ta' on'pakpo ka'dik o\u0331'ne. Nog aratbut huadak o\u0331'nepo ko\u0331nigti opudo\u0331mey huadak mo'e\u0331. Aya'da huadak eka' nopoe\u0331'dik o\u0331'ne kenpa'ti dakhuea' eka' nopoe\u0331'dik o\u0331'ne kenpa'ti ko\u0331nig huama'buytaj o 'tihuapokika' ko\u0331nigti nogo\u0331meytaj tihuapokika 'dik o\u0331'ne.", + "text": "Amarakaeri Aya'da aratbut katepi' eka'ta' on'pakpo ka'dik o̱'ne. Nog aratbut huadak o̱'nepo ko̱nigti opudo̱mey huadak mo'e̱. Aya'da huadak eka' nopoe̱'dik o̱'ne kenpa'ti dakhuea' eka' nopoe̱'dik o̱'ne kenpa'ti ko̱nig huama'buytaj o 'tihuapokika' ko̱nigti nogo̱meytaj tihuapokika 'dik o̱'ne.", "metadata": { "languages": [ "ind" @@ -612,7 +612,7 @@ { "type": "Title", "element_id": "8c8d0d9098a83b293045f03fbe07358d", - "text": "\u12e8\u1230\u12cd\u1361\u120d\u1305\u1361\u1201\u1209\u1361\u1232\u12c8\u1208\u12f5\u1361\u1290\u133b\u1293\u1361\u1260\u12ad\u1265\u122d\u1293\u1361\u1260\u1218\u1265\u1275\u121d\u1361\u12a5\u12a9\u120d\u1290\u1275\u1361\u12eb\u1208\u12cd\u1361\u1290\u12cd\u1362\u1361\u12e8\u1270\u1348\u1325\u122e\u1361\u121b\u1235\u1270\u12cb\u120d\u1293\u1361\u1215\u120a\u1293\u1361\u1235\u120b\u1208\u12cd\u1361\u12a0\u1295\u12f1\u1361\u120c\u120b\u12cd\u1295\u1361\u1260\u12c8\u1295\u12f5\u121b\u121b\u127d\u1290\u1275\u1361\u1218\u1295\u1348\u1235\u1361\u1218\u1218\u120d\u12a8\u1275\u1361\u12ed\u1308\u1263\u12cb\u120d\u1362", + "text": "የሰው፡ልጅ፡ሁሉ፡ሲወለድ፡ነጻና፡በክብርና፡በመብትም፡እኩልነት፡ያለው፡ነው።፡የተፈጥሮ፡ማስተዋልና፡ሕሊና፡ስላለው፡አንዱ፡ሌላውን፡በወንድማማችነት፡መንፈስ፡መመልከት፡ይገባዋል።", "metadata": { "filetype": "text/plain", "data_source": { @@ -675,7 +675,7 @@ { "type": "NarrativeText", "element_id": "e1a81a0e10a38df3526fc4432de66ad3", - "text": "Arabic, Standard \u064a\u0648\u0644\u062f 
\u062c\u0645\u064a\u0639 \u0627\u0644\u0646\u0627\u0633 \u0623\u062d\u0631\u0627\u0631\u064b\u0627 \u0645\u062a\u0633\u0627\u0648\u064a\u0646 \u0641\u064a \u0627\u0644\u0643\u0631\u0627\u0645\u0629 \u0648\u0627\u0644\u062d\u0642\u0648\u0642. \u0648\u0642\u062f \u0648\u0647\u0628\u0648\u0627 \u0639\u0642\u0644\u0627\u064b \u0648\u0636\u0645\u064a\u0631\u064b\u0627 \u0648\u0639\u0644\u064a\u0647\u0645 \u0623\u0646 \u064a\u0639\u0627\u0645\u0644 \u0628\u0639\u0636\u0647\u0645 \u0628\u0639\u0636\u064b\u0627 \u0628\u0631\u0648\u062d \u0627\u0644\u0625\u062e\u0627\u0621.", + "text": "Arabic, Standard يولد جميع الناس أحرارًا متساوين في الكرامة والحقوق. وقد وهبوا عقلاً وضميرًا وعليهم أن يعامل بعضهم بعضًا بروح الإخاء.", "metadata": { "languages": [ "ara" @@ -696,7 +696,7 @@ { "type": "UncategorizedText", "element_id": "72d099b2761f12d204f35cc85600f8dd", - "text": "Armenian \u0532\u0578\u056c\u0578\u0580 \u0574\u0561\u0580\u0564\u056b\u056f \u056e\u0576\u057e\u0578\u0582\u0574 \u0565\u0576 \u0561\u0566\u0561\u057f \u0578\u0582 \u0570\u0561\u057e\u0561\u057d\u0561\u0580 \u056b\u0580\u0565\u0576\u0581 \u0561\u0580\u056a\u0561\u0576\u0561\u057a\u0561\u057f\u057e\u0578\u0582\u0569\u0575\u0561\u0574\u0562 \u0578\u0582 \u056b\u0580\u0561\u057e\u0578\u0582\u0576\u0584\u0576\u0565\u0580\u0578\u057e\u0589 \u0546\u0580\u0561\u0576\u0584 \u0578\u0582\u0576\u0565\u0576 \u0562\u0561\u0576\u0561\u056f\u0561\u0576\u0578\u0582\u0569\u0575\u0578\u0582\u0576 \u0578\u0582 \u056d\u056b\u0572\u0573 \u0587 \u0574\u056b\u0574\u0575\u0561\u0576\u0581 \u057a\u0565\u057f\u0584 \u0567 \u0565\u0572\u0562\u0561\u0575\u0580\u0561\u0562\u0561\u0580 \u057e\u0565\u0580\u0561\u0562\u0565\u0580\u057e\u0565\u0576\u0589", + "text": "Armenian Բոլոր մարդիկ ծնվում են ազատ ու հավասար իրենց արժանապատվությամբ ու իրավունքներով։ Նրանք ունեն բանականություն ու խիղճ և միմյանց պետք է եղբայրաբար վերաբերվեն։", "metadata": { "languages": [ "est" @@ -717,7 +717,7 @@ { "type": "NarrativeText", "element_id": 
"38291b67d0eaef665797206e43651164", - "text": "Aromanian Tuti iats\u00e2li umineshts\u00e2 s-fac liberi shi egali la n\u00e2muzea shi-ndrepturli. Eali suntu h\u00e2rziti cu fichiri shi sinidisi shi lipseashti un cu alantu sh-si poart\u00e2 tu duhlu-a fr\u00e2ts\u00e2ljiljei.", + "text": "Aromanian Tuti iatsâli umineshtsâ s-fac liberi shi egali la nâmuzea shi-ndrepturli. Eali suntu hârziti cu fichiri shi sinidisi shi lipseashti un cu alantu sh-si poartâ tu duhlu-a frâtsâljiljei.", "metadata": { "languages": [ "ron", @@ -739,7 +739,7 @@ { "type": "NarrativeText", "element_id": "6bb51b6b82df3d4800c98e8415754489", - "text": "Ash\u00e1ninka Aquempetavacaajeita maaroni atiri. Timatsi aquenqueshirejeitantari maaroni, timatsi amejeitari, ayojeiti paitarica ocameetsati antajeitiri: te oncameetsateji intsaneapitsajeiteero itsipapee. Te oncameetsateji imperanajeitee, te oncameetsateji iroashinoncaajeitee, irointi ocameetsati aacameetsatavacaajeitea.", + "text": "Asháninka Aquempetavacaajeita maaroni atiri. Timatsi aquenqueshirejeitantari maaroni, timatsi amejeitari, ayojeiti paitarica ocameetsati antajeitiri: te oncameetsateji intsaneapitsajeiteero itsipapee. Te oncameetsateji imperanajeitee, te oncameetsateji iroashinoncaajeitee, irointi ocameetsati aacameetsatavacaajeitea.", "metadata": { "languages": [ "fin", @@ -762,7 +762,7 @@ { "type": "NarrativeText", "element_id": "ef818e559e5b9629b3da213d71f6d693", - "text": "Ash\u00e9ninka, Pichis Maaroni atziripayeeni, ovaquera intzimapaaque, eero ocantzi i\u00f1aashitacaavaitaityaari iromperanataityaari. Eejatzi oquemitari iro\u00f1aaca te apantyaaro amanitashireteri atziri ancanteri: \"Te pirjiperote eeroca, iriima irinta iriitaque \u00f1aaperori\". Eejatzi oquemitari te oncameethate intzime aparoni atziri antayetashityaarone caari ishinetaacairi pashine irantero. 
Tema maaroni ayotziro ampampithashirvaayeta, ayotziro tsicarica otzimayetzi cameethatatsiri anteri o tsicarica otzimi caariperotatsiri, irootaque ocovaperotantari iro\u00f1aaca entacotavacaayetya anquemitacaantanaquero arentzitavacaatyeeyaami ocaaquiini.", + "text": "Ashéninka, Pichis Maaroni atziripayeeni, ovaquera intzimapaaque, eero ocantzi iñaashitacaavaitaityaari iromperanataityaari. Eejatzi oquemitari iroñaaca te apantyaaro amanitashireteri atziri ancanteri: \"Te pirjiperote eeroca, iriima irinta iriitaque ñaaperori\". Eejatzi oquemitari te oncameethate intzime aparoni atziri antayetashityaarone caari ishinetaacairi pashine irantero. Tema maaroni ayotziro ampampithashirvaayeta, ayotziro tsicarica otzimayetzi cameethatatsiri anteri o tsicarica otzimi caariperotatsiri, irootaque ocovaperotantari iroñaaca entacotavacaayetya anquemitacaantanaquero arentzitavacaatyeeyaami ocaaquiini.", "metadata": { "languages": [ "ita", @@ -785,7 +785,7 @@ { "type": "NarrativeText", "element_id": "5cb0bb4fdc15b35295973bd4a2247bd1", - "text": "Assyrian Neo-Aramaic \u071f\u0720 \u0712\u072a\u0722\u072b\u0710 \u0712\u072a\u071d\u0720\u0717 \u071a\u0710\u072a\u0710 \u0718\u0712\u072a\u0712\u072a \u0713\u0718 \u0710\u071d\u0729\u072a\u0710 \u0718\u0719\u0715\u0729\u0710. \u0718\u0726\u071d\u072b\u071d\u0720\u0717 \u071d\u0717\u0712\u0710 \u0717\u0718\u0722\u0710 \u0718\u0710\u0722\u071d\u072c. \u0712\u0718\u0715 \u0715\u0710\u0717\u0710 \u0713\u072b\u0729\u072c\u071d \u0725\u0720 \u0710\u071a\u072a\u0722\u0710 \u0713\u072a\u0713 \u0717\u0718\u071d\u0710 \u0712\u071a\u0715 \u072a\u0718\u071a\u0710 \u0715\u0710\u071a\u0722\u0718\u072c\u0710.", + "text": "Assyrian Neo-Aramaic ܟܠ ܒܪܢܫܐ ܒܪܝܠܗ ܚܐܪܐ ܘܒܪܒܪ ܓܘ ܐܝܩܪܐ ܘܙܕܩܐ. ܘܦܝܫܝܠܗ ܝܗܒܐ ܗܘܢܐ ܘܐܢܝܬ. 
ܒܘܕ ܕܐܗܐ ܓܫܩܬܝ ܥܠ ܐܚܪܢܐ ܓܪܓ ܗܘܝܐ ܒܚܕ ܪܘܚܐ ܕܐܚܢܘܬܐ.", "metadata": { "filetype": "text/plain", "data_source": { @@ -803,7 +803,7 @@ { "type": "NarrativeText", "element_id": "fc37a0c903b4ad45223fa0a367de3b9b", - "text": "Asturian Tolos seres humanos nacen llibres y iguales en dignid\u00e1 y drechos y, pola mor de la raz\u00f3n y la conciencia de so, han comportase hermaniblemente los unos colos otros.", + "text": "Asturian Tolos seres humanos nacen llibres y iguales en dignidá y drechos y, pola mor de la razón y la conciencia de so, han comportase hermaniblemente los unos colos otros.", "metadata": { "languages": [ "spa" @@ -866,7 +866,7 @@ { "type": "UncategorizedText", "element_id": "4e13c433d775a93f0bb6c40cbb2d5a03", - "text": "Aymara, Central Taqpach jaqejh khuskat u\u00f1jatat\u00e4pjhewa muna\u00f1apansa, lura\u00f1apansa, amuyasi\u00f1apansa, ukatwa jilani sullkan\u00edpjhaspas ukham u\u00f1jasipjha\u00f1apawa.", + "text": "Aymara, Central Taqpach jaqejh khuskat uñjatatäpjhewa munañapansa, lurañapansa, amuyasiñapansa, ukatwa jilani sullkanípjhaspas ukham uñjasipjhañapawa.", "metadata": { "languages": [ "swa", @@ -889,7 +889,7 @@ { "type": "NarrativeText", "element_id": "8afc3caab3e458628b6f2efdb46fc6d1", - "text": "Azerbaijani, North (Cyrillic) \u0411\u04af\u0442\u04af\u043d \u0438\u043d\u0441\u0430\u043d\u043b\u0430\u0440 \u043b\u04d9\u0458\u0430\u0433\u04d9\u0442 \u0432\u04d9 \u04bb\u04af\u0433\u0443\u0433\u043b\u0430\u0440\u044b\u043d\u0430 \u049d\u04e9\u0440\u04d9 \u0430\u0437\u0430\u0434 \u0432\u04d9 \u0431\u04d9\u0440\u0430\u0431\u04d9\u0440 \u0434\u043e\u0493\u0443\u043b\u0443\u0440\u043b\u0430\u0440. 
\u041e\u043d\u043b\u0430\u0440\u044b\u043d \u0448\u04af\u0443\u0440\u043b\u0430\u0440\u044b \u0432\u04d9 \u0432\u0438\u04b9\u0434\u0430\u043d\u043b\u0430\u0440\u044b \u0432\u0430\u0440 \u0432\u04d9 \u0431\u0438\u0440-\u0431\u0438\u0440\u043b\u04d9\u0440\u0438\u043d\u04d9 \u043c\u04af\u043d\u0430\u0441\u0438\u0431\u04d9\u0442\u0434\u04d9 \u0433\u0430\u0440\u0434\u0430\u0448\u043b\u044b\u0433 \u0440\u0443\u04bb\u0443\u043d\u0434\u0430 \u0434\u0430\u0432\u0440\u0430\u043d\u043c\u0430\u043b\u044b\u0434\u044b\u0440\u043b\u0430\u0440.", + "text": "Azerbaijani, North (Cyrillic) Бүтүн инсанлар ләјагәт вә һүгугларына ҝөрә азад вә бәрабәр доғулурлар. Онларын шүурлары вә виҹданлары вар вә бир-бирләринә мүнасибәтдә гардашлыг руһунда давранмалыдырлар.", "metadata": { "languages": [ "rus", @@ -911,7 +911,7 @@ { "type": "NarrativeText", "element_id": "6d9f8766b1812e209f1a59654443299c", - "text": "Azerbaijani, North (Latin) B\u00fct\u00fcn insanlar l\u0259yaq\u0259t v\u0259 h\u00fcquqlar\u0131na g\u00f6r\u0259 azad v\u0259 b\u0259rab\u0259r do\u011fulurlar. Onlar\u0131n \u015f\u00fcurlar\u0131 v\u0259 vicdanlar\u0131 var v\u0259 bir-birl\u0259rin\u0259 m\u00fcnasib\u0259td\u0259 qarda\u015fl\u0131q ruhunda davranmal\u0131d\u0131rlar.", + "text": "Azerbaijani, North (Latin) Bütün insanlar ləyaqət və hüquqlarına görə azad və bərabər doğulurlar. Onların şüurları və vicdanları var və bir-birlərinə münasibətdə qardaşlıq ruhunda davranmalıdırlar.", "metadata": { "languages": [ "tur" @@ -932,7 +932,7 @@ { "type": "NarrativeText", "element_id": "3681d23b771b9cf26263ab194af3430d", - "text": "Baatonum Ba t\u0254mbu kpuro marawa ba tii m\u0254, ba n\u025b, girima ka saria s\u0254\u0254. Ba ra bwisiku, ba dasabu m\u0254, ma n weene ba n waasin\u025b m\u025brobisiru s\u0254\u0254.", + "text": "Baatonum Ba tɔmbu kpuro marawa ba tii mɔ, ba nɛ, girima ka saria sɔɔ. 
Ba ra bwisiku, ba dasabu mɔ, ma n weene ba n waasinɛ mɛrobisiru sɔɔ.", "metadata": { "languages": [ "som", @@ -975,7 +975,7 @@ { "type": "NarrativeText", "element_id": "394114d333ed34e0add89b5e9079d474", - "text": "Bamanankan Hadamaden b\u025b\u025b danmak\u025b\u0272\u025bnen b\u025b bange, danbe ni josira la. Hakili ni taasi b\u2019u b\u025b\u025b la, wa u ka kan ka baden\u0272asira de waleya u ni \u0272\u0254g\u0254n c\u025b.", + "text": "Bamanankan Hadamaden bɛɛ danmakɛɲɛnen bɛ bange, danbe ni josira la. Hakili ni taasi b’u bɛɛ la, wa u ka kan ka badenɲasira de waleya u ni ɲɔgɔn cɛ.", "metadata": { "languages": [ "som", @@ -998,7 +998,7 @@ { "type": "NarrativeText", "element_id": "31e2922fd7a67918fa2a09744965a970", - "text": "Bamun Pe na\u0302 mve\u0301 gu\u0301 puen nyu\u0308tu po\u0302 te mbe ku\u0301 ghe\u0301t ngam pua ngu\u0301enengu\u0301e mbe te wu\u0302me nsebe pua pa mfe\u0301e\u0301ke\u0302t. Pen a\u0302 ntu\u0301m te mbe ku\u0301 rem ngam pua fabshe ngam, a nshi nji\u0302r\u2019ap ne yi nsha\u0302ne nge\u0301tne nga shap po\u0302 te wupme ponta\u0302.", + "text": "Bamun Pe nâ mvé gú puen nyütu pô te mbe kú ghét ngam pua ngúenengúe mbe te wûme nsebe pua pa mféékêt. Pen â ntúm te mbe kú rem ngam pua fabshe ngam, a nshi njîr’ap ne yi nshâne ngétne nga shap pô te wupme pontâ.", "metadata": { "languages": [ "sqi" @@ -1019,7 +1019,7 @@ { "type": "NarrativeText", "element_id": "c5815bd56d9b0f7114cfa825514698ca", - "text": "Baoul\u00e9 Sran mun be ngba, k\u025b be wu be \u0254, be ngba be s\u025b, f\u0254ndi nun, sran-mmala nun. Be si akundanbu, be si su \u0254 fata k\u025b sran mun be tran'n, be tran aniaan nun tranl\u025b.", + "text": "Baoulé Sran mun be ngba, kɛ be wu be ɔ, be ngba be sɛ, fɔndi nun, sran-mmala nun. 
Be si akundanbu, be si su ɔ fata kɛ sran mun be tran'n, be tran aniaan nun tranlɛ.", "metadata": { "languages": [ "ind" @@ -1040,7 +1040,7 @@ { "type": "NarrativeText", "element_id": "f937bd218ac832a520fee7be14b4e89c", - "text": "Bari \u014autu li\u014b a yu\u014bwe kana, jojo i to\u010firi ko \u010fekesi ko ti se tokitaki ko \u2018b\u00f6rik ko mul\u00f6k\u00f6tyo lo tolu\u014baseran. Se a \u010foka ko denet na kulya na\u2019but ko narok.", + "text": "Bari Ŋutu liŋ a yuŋwe kana, jojo i toďiri ko ďekesi ko ti se tokitaki ko ‘börik ko mulökötyo lo toluŋaseran. Se a ďoka ko denet na kulya na’but ko narok.", "metadata": { "languages": [ "hrv", @@ -1085,7 +1085,7 @@ { "type": "NarrativeText", "element_id": "5ce714cfa1def0c0d951bf7bff485500", - "text": "Belarusan \u0423\u0441\u0435 \u043b\u044e\u0434\u0437\u0456 \u043d\u0430\u0440\u0430\u0434\u0436\u0430\u044e\u0446\u0446\u0430 \u0441\u0432\u0430\u0431\u043e\u0434\u043d\u044b\u043c\u0456 \u0456 \u0440\u043e\u045e\u043d\u044b\u043c\u0456 \u045e \u0441\u0432\u0430\u0451\u0439 \u0433\u043e\u0434\u043d\u0430\u0441\u0446\u0456 \u0456 \u043f\u0440\u0430\u0432\u0430\u0445. \u042f\u043d\u044b \u043d\u0430\u0434\u0437\u0435\u043b\u0435\u043d\u044b \u0440\u043e\u0437\u0443\u043c\u0430\u043c \u0456 \u0441\u0443\u043c\u043b\u0435\u043d\u043d\u0435\u043c \u0456 \u043f\u0430\u0432\u0456\u043d\u043d\u044b \u0441\u0442\u0430\u0432\u0456\u0446\u0446\u0430 \u0430\u0434\u0437\u0456\u043d \u0434\u0430 \u0430\u0434\u043d\u0430\u0433\u043e \u045e \u0434\u0443\u0445\u0443 \u0431\u0440\u0430\u0446\u0442\u0432\u0430.", + "text": "Belarusan Усе людзі нараджаюцца свабоднымі і роўнымі ў сваёй годнасці і правах. 
Яны надзелены розумам і сумленнем і павінны ставіцца адзін да аднаго ў духу брацтва.", "metadata": { "languages": [ "ukr", @@ -1128,7 +1128,7 @@ { "type": "UncategorizedText", "element_id": "bb5acaee87121a890d36cb7afd3ad15a", - "text": "Bengali \u09b8\u09ae\u09b8\u09cd\u09a4 \u09ae\u09be\u09a8\u09c1\u09b7 \u09b8\u09cd\u09ac\u09be\u09a7\u09c0\u09a8\u09ad\u09be\u09ac\u09c7 \u09b8\u09ae\u09be\u09a8 \u09ae\u09b0\u09cd\u09af\u09be\u09a6\u09be \u098f\u09ac\u0982 \u0985\u09a7\u09bf\u0995\u09be\u09b0 \u09a8\u09bf\u09af\u09bc\u09c7 \u099c\u09a8\u09cd\u09ae\u0997\u09cd\u09b0\u09b9\u09a3 \u0995\u09b0\u09c7\u0964 \u09a4\u09be\u0981\u09a6\u09c7\u09b0 \u09ac\u09bf\u09ac\u09c7\u0995 \u098f\u09ac\u0982 \u09ac\u09c1\u09a6\u09cd\u09a7\u09bf \u0986\u099b\u09c7; \u09b8\u09c1\u09a4\u09b0\u09be\u0982 \u09b8\u0995\u09b2\u09c7\u09b0\u0987 \u098f\u0995\u09c7 \u0985\u09aa\u09b0\u09c7\u09b0 \u09aa\u09cd\u09b0\u09a4\u09bf \u09ad\u09cd\u09b0\u09be\u09a4\u09c3\u09a4\u09cd\u09ac\u09b8\u09c1\u09b2\u09ad \u09ae\u09a8\u09cb\u09ad\u09be\u09ac \u09a8\u09bf\u09df\u09c7 \u0986\u099a\u09b0\u09a3 \u0995\u09b0\u09be \u0989\u099a\u09bf\u09a4\u0964", + "text": "Bengali সমস্ত মানুষ স্বাধীনভাবে সমান মর্যাদা এবং অধিকার নিয়ে জন্মগ্রহণ করে। তাঁদের বিবেক এবং বুদ্ধি আছে; সুতরাং সকলেরই একে অপরের প্রতি ভ্রাতৃত্বসুলভ মনোভাব নিয়ে আচরণ করা উচিত।", "metadata": { "languages": [ "ben" @@ -1149,7 +1149,7 @@ { "type": "UncategorizedText", "element_id": "d5919948b12c6b7e2c5179487170dd51", - "text": "Bhojpuri \u0938\u092c\u0939\u093f \u0932\u094b\u0915\u093e\u0928\u093f \u0906\u091c\u093e\u0926\u0947 \u091c\u092e\u094d\u092e\u0947\u0932\u093e \u0906\u0913\u0930 \u0913\u0916\u093f\u0928\u093f\u092f\u094b \u0915\u0947 \u092c\u0930\u093e\u092c\u0930 \u0938\u092e\u094d\u092e\u093e\u0928 \u0906\u0913\u0930 \u0905\u0927\u093f\u0915\u093e\u0930 \u092a\u094d\u0930\u093e\u092a\u094d\u0924 \u0939\u0935\u0947\u0964 \u0913\u0916\u093f\u0928\u093f\u092f\u094b \u0915\u0947 \u092a\u093e\u0938 \u0938\u092e\u091d-\u092c\u0942\u091d 
\u0906\u0913\u0930 \u0905\u0902\u0924:\u0915\u0930\u0923 \u0915\u0947 \u0906\u0935\u093e\u091c \u0939\u094b\u0916\u0924\u093e \u0906\u0913\u0930 \u0939\u0941\u0928\u0915\u094b \u0915\u0947 \u0926\u094b\u0938\u0930\u093e \u0915\u0947 \u0938\u093e\u0925 \u092d\u093e\u0908\u091a\u093e\u0930\u093e \u0915\u0947 \u092c\u0947\u0935\u0939\u093e\u0930 \u0915\u0930\u0947 \u0915\u0947 \u0939\u094b\u0916\u0932\u093e\u0964", + "text": "Bhojpuri सबहि लोकानि आजादे जम्मेला आओर ओखिनियो के बराबर सम्मान आओर अधिकार प्राप्त हवे। ओखिनियो के पास समझ-बूझ आओर अंत:करण के आवाज होखता आओर हुनको के दोसरा के साथ भाईचारा के बेवहार करे के होखला।", "metadata": { "languages": [ "hin" @@ -1214,7 +1214,7 @@ { "type": "NarrativeText", "element_id": "09176e19ded6b0ff879ead0799cc2302", - "text": "Bora P\u00e1meere \u00ed\u00ed\u00f1\u00faj\u0268ri me\u00edjcyame ts\u00e1 m\u00fah\u00f3j\u0268\u0301s\u0268\u0301 pa\u00f1\u00e9 \u0268\u0301cub\u00e1hr\u00e1d\u00fa me\u00edjcy\u00e1\u00edtyur\u00f3ne. P\u00e1meere tsahd\u00far\u00e9 im\u00ed me\u00edjcyame mew\u00e1jy\u00fajcats\u00ed\u00f1e m\u00e9p\u0268\u0301\u00e1\u00e1b\u00f3jcats\u00ediy\u00e1 tsaat\u00e9k\u00e9 \u00e9hd\u0268\u0301\u0208\u0301v\u00e1llet\u00fam\u00e9 \u00e9hne m\u00fau m\u00e9pa\u00f1\u00e9t\u00fa\u00e9n\u00e9 nahb\u00e9muma me\u00edjcyadu.", + "text": "Bora Pámeere ííñújɨri meíjcyame tsá múhójɨ́sɨ́ pañé ɨ́cubáhrádú meíjcyáítyuróne. 
Pámeere tsahdúré imí meíjcyame mewájyújcatsíñe mépɨ́áábójcatsíiyá tsaatéké éhdɨ́Ȉ́válletúmé éhne múu mépañétúéné nahbémuma meíjcyadu.", "metadata": { "languages": [ "hun" @@ -1235,7 +1235,7 @@ { "type": "NarrativeText", "element_id": "5e3ff47fa6202cd3f10a179ea2b898e3", - "text": "Bosnian (Cyrillic) \u0421\u0432\u0430 \u0459\u0443\u0434\u0441\u043a\u0430 \u0431\u0438\u045b\u0430 \u0440\u0430\u045b\u0430\u0458\u0443 \u0441\u0435 \u0441\u043b\u043e\u0431\u043e\u0434\u043d\u0430 \u0438 \u0458\u0435\u0434\u043d\u0430\u043a\u0430 \u0443 \u0434\u043e\u0441\u0442\u043e\u0458\u0430\u043d\u0441\u0442\u0432\u0443 \u0438 \u043f\u0440\u0430\u0432\u0438\u043c\u0430. \u041e\u043d\u0430 \u0441\u0443 \u043e\u0431\u0434\u0430\u0440\u0435\u043d\u0430 \u0440\u0430\u0437\u0443\u043c\u043e\u043c \u0438 \u0441\u0432\u0438\u0458\u0435\u0448\u045b\u0443 \u0438 \u0442\u0440\u0435\u0431\u0430 \u0434\u0430 \u0458\u0435\u0434\u043d\u043e \u043f\u0440\u0435\u043c\u0430 \u0434\u0440\u0443\u0433\u043e\u043c\u0435 \u043f\u043e\u0441\u0442\u0443\u043f\u0430\u0458\u0443 \u0443 \u0434\u0443\u0445\u0443 \u0431\u0440\u0430\u0442\u0441\u0442\u0432\u0430.", + "text": "Bosnian (Cyrillic) Сва људска бића раћају се слободна и једнака у достојанству и правима. Она су обдарена разумом и свијешћу и треба да једно према другоме поступају у духу братства.", "metadata": { "languages": [ "mkd" @@ -1256,7 +1256,7 @@ { "type": "NarrativeText", "element_id": "8918cf337af35db75c0b7e3a98572814", - "text": "Bosnian (Latin) Sva ljudska bi\u0107a ra\u0111aju se slobodna i jednaka u dostojanstvu i pravima. Ona su obdarena razumom i svije\u0161\u0107u i treba da jedno prema drugome postupaju u duhu bratstva.", + "text": "Bosnian (Latin) Sva ljudska bića rađaju se slobodna i jednaka u dostojanstvu i pravima. 
Ona su obdarena razumom i sviješću i treba da jedno prema drugome postupaju u duhu bratstva.", "metadata": { "languages": [ "hrv" @@ -1277,7 +1277,7 @@ { "type": "NarrativeText", "element_id": "4f74a58266d23d68a787e2a91434a97d", - "text": "Breton Dieub ha par en o dellezegezh hag o gwirio\u00f9 eo ganet an holl dud. Poell ha skiant zo dezho ha dleout a reont beva\u00f1 an eil gant egile en ur spered a genvreudeuriezh.", + "text": "Breton Dieub ha par en o dellezegezh hag o gwirioù eo ganet an holl dud. Poell ha skiant zo dezho ha dleout a reont bevañ an eil gant egile en ur spered a genvreudeuriezh.", "metadata": { "languages": [ "nld", @@ -1321,7 +1321,7 @@ { "type": "NarrativeText", "element_id": "24a3cf3bd02d17e2f2b065bab51c8e70", - "text": "Bulgarian \u0412\u0441\u0438\u0447\u043a\u0438 \u0445\u043e\u0440\u0430 \u0441\u0435 \u0440\u0430\u0436\u0434\u0430\u0442 \u0441\u0432\u043e\u0431\u043e\u0434\u043d\u0438 \u0438 \u0440\u0430\u0432\u043d\u0438 \u043f\u043e \u0434\u043e\u0441\u0442\u043e\u0439\u043d\u0441\u0442\u0432\u043e \u0438 \u043f\u0440\u0430\u0432\u0430. \u0422\u0435 \u0441\u0430 \u043d\u0430\u0434\u0430\u0440\u0435\u043d\u0438 \u0441 \u0440\u0430\u0437\u0443\u043c \u0438 \u0441\u044a\u0432\u0435\u0441\u0442 \u0438 \u0441\u043b\u0435\u0434\u0432\u0430 \u0434\u0430 \u0441\u0435 \u043e\u0442\u043d\u0430\u0441\u044f\u0442 \u043f\u043e\u043c\u0435\u0436\u0434\u0443 \u0441\u0438 \u0432 \u0434\u0443\u0445 \u043d\u0430 \u0431\u0440\u0430\u0442\u0441\u0442\u0432\u043e.", + "text": "Bulgarian Всички хора се раждат свободни и равни по достойнство и права. Те са надарени с разум и съвест и следва да се отнасят помежду си в дух на братство.", "metadata": { "languages": [ "bul" @@ -1342,7 +1342,7 @@ { "type": "NarrativeText", "element_id": "61589cb2ca0346e6af7f49a73b4125b3", - "text": "Bulu Abiali bod bese, tege ai sesala, bene etie dzia a mis memvende y'enyi\u00f1, dzom dzia etu fili nk\u00f3b\u00f3, fili ntsogan, fili mboan. 
Ve abiali te, mod ose ayem dze ene abe, dze ene mbe\u00f1 asu e mod mbog antoa ai mfi na enyi\u00f1 ewulu mezen mene sosoo.", + "text": "Bulu Abiali bod bese, tege ai sesala, bene etie dzia a mis memvende y'enyiñ, dzom dzia etu fili nkóbó, fili ntsogan, fili mboan. Ve abiali te, mod ose ayem dze ene abe, dze ene mbeñ asu e mod mbog antoa ai mfi na enyiñ ewulu mezen mene sosoo.", "metadata": { "languages": [ "ron", @@ -1366,7 +1366,7 @@ { "type": "UncategorizedText", "element_id": "6dbacafdbc68b6ba0689b2d27b2ede49", - "text": "Burmese \u101c\u1030\u1010\u102d\u102f\u1004\u103a\u1038\u101e\u100a\u103a \u1010\u1030\u100a\u102e \u101c\u103d\u1010\u103a\u101c\u1015\u103a\u101e\u1031\u102c \u1002\u102f\u100f\u103a\u101e\u102d\u1000\u1039\u1001\u102c\u1016\u103c\u1004\u1037\u103a \u101c\u100a\u103a\u1038\u1000\u1031\u102c\u1004\u103a\u1038\u104a \u1010\u1030\u100a\u102e\u101c\u103d\u1010\u103a\u101c\u1015\u103a\u101e\u1031\u102c \u1021\u1001\u103d\u1004\u1037\u103a\u1021\u101b\u1031\u1038\u1019\u103b\u102c\u1038\u1016\u103c\u1004\u1037\u103a \u101c\u100a\u103a\u1038\u1000\u1031\u102c\u1004\u103a\u1038\u104a \u1019\u103d\u1031\u1038\u1016\u103d\u102c\u1038\u101c\u102c\u101e\u1030\u1019\u103b\u102c\u1038 \u1016\u103c\u1005\u103a\u101e\u100a\u103a\u104b \u1011\u102d\u102f\u101e\u1030\u1010\u102d\u102f\u1037\u104c \u1015\u102d\u102f\u1004\u103a\u1038\u1001\u103c\u102c\u1038 \u101d\u1031\u1016\u1014\u103a\u1010\u1010\u103a\u101e\u1031\u102c \u1009\u102c\u100f\u103a\u1014\u103e\u1004\u1037\u103a \u1000\u103b\u1004\u1037\u103a\u101d\u1010\u103a \u101e\u102d\u1010\u1010\u103a\u101e\u1031\u102c \u1005\u102d\u1010\u103a\u1010\u102d\u102f\u1037\u101b\u103e\u102d\u1000\u103c\u104d \u1011\u102d\u102f\u101e\u1030\u1010\u102d\u102f\u1037\u101e\u100a\u103a \u1021\u1001\u103b\u1004\u103a\u1038\u1001\u103b\u1004\u103a\u1038 \u1019\u1031\u1010\u1039\u1010\u102c\u1011\u102c\u1038\u104d 
\u1006\u1000\u103a\u1006\u1036\u1000\u103b\u1004\u1037\u103a\u101e\u102f\u1036\u1038\u101e\u1004\u1037\u103a\u104f\u104b", + "text": "Burmese လူတိုင်းသည် တူညီ လွတ်လပ်သော ဂုဏ်သိက္ခာဖြင့် လည်းကောင်း၊ တူညီလွတ်လပ်သော အခွင့်အရေးများဖြင့် လည်းကောင်း၊ မွေးဖွားလာသူများ ဖြစ်သည်။ ထိုသူတို့၌ ပိုင်းခြား ဝေဖန်တတ်သော ဉာဏ်နှင့် ကျင့်ဝတ် သိတတ်သော စိတ်တို့ရှိကြ၍ ထိုသူတို့သည် အချင်းချင်း မေတ္တာထား၍ ဆက်ဆံကျင့်သုံးသင့်၏။", "metadata": { "filetype": "text/plain", "data_source": { @@ -1384,7 +1384,7 @@ { "type": "NarrativeText", "element_id": "7b5c1459fc45a2821c0d05cd98c1996f", - "text": "Bushi \u0181inadamu djabi nitirahinyi an-nafasi, reu bokeu mira\u014ba amin\u2019ni usheu ndreka haki. Reu teraka ndreka \u00e3kili ndreka hikima, amin\u2019ni zenyi, reu nikulazimu nisi twera\u014ba nin-fihava\u014ba reu sambi reu.", + "text": "Bushi Ɓinadamu djabi nitirahinyi an-nafasi, reu bokeu miraŋa amin’ni usheu ndreka haki. Reu teraka ndreka ãkili ndreka hikima, amin’ni zenyi, reu nikulazimu nisi tweraŋa nin-fihavaŋa reu sambi reu.", "metadata": { "languages": [ "swa" @@ -1448,7 +1448,7 @@ { "type": "NarrativeText", "element_id": "296f3e08ce32c544b7ce3922abf32c6c", - "text": "Cashibo-Cacataibo Ui uni cara 'iti ic\u00eb axbi ca b\u00ebtsi unib\u00eb gobiernon\u00ebn isc\u00ebx s\u00ebn\u00ebn it\u00ed ic\u00ebn. Ui cara ain tita ain papa 'iaxa quixun sinanquinma ca gobiernon\u00ebn sinanc\u00ebx ax b\u00ebtsib\u00eb s\u00ebn\u00ebn 'ic\u00ebn. Camaxunbi ca sinanti 'unanin. Camaxunbi ca a\u00f1u \u00f1u ati cara as\u00e1bi 'ic\u00ebn, a\u00f1u \u00f1u 'ati cara 'aisama 'ic\u00eb quixun 'unanti 'ic\u00ebn. Usa 'ain ca camaxbi ain xuc\u00ebnb\u00eb 'ic\u00ebsaribiti nuiananti 'ic\u00ebn.", + "text": "Cashibo-Cacataibo Ui uni cara 'iti icë axbi ca bëtsi unibë gobiernonën iscëx sënën ití icën. Ui cara ain tita ain papa 'iaxa quixun sinanquinma ca gobiernonën sinancëx ax bëtsibë sënën 'icën. Camaxunbi ca sinanti 'unanin. 
Camaxunbi ca añu ñu ati cara asábi 'icën, añu ñu 'ati cara 'aisama 'icë quixun 'unanti 'icën. Usa 'ain ca camaxbi ain xucënbë 'icësaribiti nuiananti 'icën.", "metadata": { "languages": [ "sqi", @@ -1492,7 +1492,7 @@ { "type": "NarrativeText", "element_id": "75c025da4f4c95d2f428dc459b739bef", - "text": "Catalan-Valencian-Balear Tots els \u00e9ssers humans neixen lliures i iguals en dignitat i en drets. S\u00f3n dotats de ra\u00f3 i de consci\u00e8ncia, i han de comportar-se fraternalment els uns amb els altres.", + "text": "Catalan-Valencian-Balear Tots els éssers humans neixen lliures i iguals en dignitat i en drets. Són dotats de raó i de consciència, i han de comportar-se fraternalment els uns amb els altres.", "metadata": { "languages": [ "cat" @@ -1534,7 +1534,7 @@ { "type": "NarrativeText", "element_id": "346a128271cb055071a9b9d4548d0488", - "text": "Chachi Naaju chachilla bain mu' chachilla bain na kayatu tichiba bulla jutyu naakendya'ba kenu deechu taa na kayamu deju, tsenminya,naaju ju\u00f1u bain ne tsaave ti', uukavinu jutyu naa, tideechu juuchi bain, mubain mubain tsaren dejuve, tsenmin shilli pensangenu pude deju'. mitya, tsenr)1in ura' kendu bain ura' kendyu' bain mide' mitya muba mu bain veta' veta' ura' keewaawaa kenuu dejuve.", + "text": "Chachi Naaju chachilla bain mu' chachilla bain na kayatu tichiba bulla jutyu naakendya'ba kenu deechu taa na kayamu deju, tsenminya,naaju juñu bain ne tsaave ti', uukavinu jutyu naa, tideechu juuchi bain, mubain mubain tsaren dejuve, tsenmin shilli pensangenu pude deju'. 
mitya, tsenr)1in ura' kendu bain ura' kendyu' bain mide' mitya muba mu bain veta' veta' ura' keewaawaa kenuu dejuve.", "metadata": { "languages": [ "ind", @@ -1556,7 +1556,7 @@ { "type": "UncategorizedText", "element_id": "0b1ae7cf56e3557ef9acecc99806172b", - "text": "Chakma \ud804\udd1d\ud804\udd2c\ud804\udd07\ud804\udd34 \ud804\udd1f\ud804\udd1a\ud804\udd2a\ud804\udd0c\ud804\udd34 \ud804\udd1a\ud804\udd28\ud804\udd22\ud804\udd28\ud804\udd1e\ud804\udd28\ud804\udd23\ud804\udd28 \ud804\udd25\ud804\udd27\ud804\udd01 \ud804\udd03\ud804\udd28\ud804\udd0c\ud804\udd34\ud804\udd0e\ud804\udd2e\ud804\udd16\ud804\udd34 \ud804\udd03\ud804\udd33\ud804\udd03 \ud804\udd03\ud804\udd07\ud804\udd34\ud804\udd07\ud804\udd25\ud804\udd01 \ud804\udd1a\ud804\udd28\ud804\udd1a\ud804\udd2c\ud804\udd2d \ud804\udd0e\ud804\udd27\ud804\udd1a\ud804\udd34\ud804\udd1f\ud804\udd1a\ud804\udd34\ud804\udd41 \ud804\udd16\ud804\udd22\ud804\udd22\ud804\udd34 \ud804\udd03\ud804\udd2c\ud804\udd18 \ud804\udd03\ud804\udd33\ud804\udd03 \ud804\udd1d\ud804\udd2a\ud804\udd16\ud804\udd34\ud804\udd19\ud804\udd28 \ud804\udd03\ud804\udd0a\ud804\udd2c; \ud804\udd25\ud804\udd2c\ud804\udd1a\ud804\udd27\ud804\udd16\ud804\udd33\ud804\udd20\ud804\udd34 \ud804\udd1d\ud804\udd2c\ud804\udd07\ud804\udd34\ud804\udd05\ud804\udd1a\ud804\udd27\ud804\udd22\ud804\udd34 \ud804\udd03\ud804\udd2c\ud804\udd07\ud804\udd34\ud804\udd0e\ud804\udd27\ud804\udd1a\ud804\udd34 \ud804\udd03\ud804\udd22\ud804\udd2c\ud804\udd07\ud804\udd34 \ud804\udd0e\ud804\udd27\ud804\udd1a\ud804\udd27\ud804\udd22\ud804\udd34 \ud804\udd1b\ud804\udd33\ud804\udd22\ud804\udd27\ud804\udd16\ud804\udd28 \ud804\udd09\ud804\udd27\ud804\udd1f\ud804\udd34 \ud804\udd18\ud804\udd2e\ud804\udd23\ud804\udd34 \ud804\udd0c\ud804\udd28\ud804\udd18\ud804\udd33\ud804\udd20\ud804\udd2c \ud804\udd1a\ud804\udd28\ud804\udd1a\ud804\udd2c\ud804\udd2d \ud804\udd0c\ud804\udd27\ud804\udd23\ud804\udd1a \ud804\udd05\ud804\udd2a\ud804\udd0c\ud804\udd28\ud804\udd16\ud804\udd34\ud804\udd41", + 
"text": "Chakma 𑄝𑄬𑄇𑄴 𑄟𑄚𑄪𑄌𑄴 𑄚𑄨𑄢𑄨𑄞𑄨𑄣𑄨 𑄥𑄧𑄁 𑄃𑄨𑄌𑄴𑄎𑄮𑄖𑄴 𑄃𑄳𑄃 𑄃𑄇𑄴𑄇𑄥𑄁 𑄚𑄨𑄚𑄬𑄭 𑄎𑄧𑄚𑄴𑄟𑄚𑄴𑅁 𑄖𑄢𑄢𑄴 𑄃𑄬𑄘 𑄃𑄳𑄃 𑄝𑄪𑄖𑄴𑄙𑄨 𑄃𑄊𑄬; 𑄥𑄬𑄚𑄧𑄖𑄳𑄠𑄴 𑄝𑄬𑄇𑄴𑄅𑄚𑄧𑄢𑄴 𑄃𑄬𑄇𑄴𑄎𑄧𑄚𑄴 𑄃𑄢𑄬𑄇𑄴 𑄎𑄧𑄚𑄧𑄢𑄴 𑄛𑄳𑄢𑄧𑄖𑄨 𑄉𑄧𑄟𑄴 𑄘𑄮𑄣𑄴 𑄌𑄨𑄘𑄳𑄠𑄬 𑄚𑄨𑄚𑄬𑄭 𑄌𑄧𑄣𑄚 𑄅𑄪𑄌𑄨𑄖𑄴𑅁", "metadata": { "filetype": "text/plain", "data_source": { @@ -1596,7 +1596,7 @@ { "type": "NarrativeText", "element_id": "87e7fb3e75a3a124c8e4bce8573a5dd1", - "text": "Chayahuita Ya'ipi piyapinpoa' capini noya ninosorocaso' ya'hu\u00ebrin. Ya'ipinpoa' yonquir\u00ebhua'. Noya nicacaso' nitot\u00ebr\u00ebhua'. Napoaton iyanpoa pochin ninosorocaso' ya 'hu\u00ebrin.", + "text": "Chayahuita Ya'ipi piyapinpoa' capini noya ninosorocaso' ya'huërin. Ya'ipinpoa' yonquirëhua'. Noya nicacaso' nitotërëhua'. Napoaton iyanpoa pochin ninosorocaso' ya 'huërin.", "metadata": { "languages": [ "tgl", @@ -1619,7 +1619,7 @@ { "type": "NarrativeText", "element_id": "03ea2a4dd341c6cdd4c3ddd814721290", - "text": "Cherokee (cased) \u13c2\uab76\uaba3 \uab70\uab92\u13fc\uabbb \uab74\uab8e\uaba5\uab95\uab72 \uab74\uab8e\uabaa\uaba3\uab84\uaba3 \uab70\uab84 \uab71\uabb7\uab83\uab7d\uab99 \uab8e\uab72 \uab70\uabb2\uab99\uaba9\uaba7 \uab70\uab84 \uab74\uab92\uab82 \uab72\u13fb\uab8e\uabab\uaba7\uab72. \u13be\uab9d\uab79\uab8e\uab93 \uab74\uab85\uab9d\uab7a\uab88\uaba4\uab95\uab79 \uab74\uabb0\uabbf\uab9d\uaba7 \uab95\u13f8\uab85\uabab\uab79 \uab70\uab84 \uab70\uaba3\uab95\uaba6\uabaf\uaba3\uab9d\uaba7 \uab70\uab84 \uab71\uab85\uab9d\uaba7 \uab9f\u13fc\uabbb\uab7d \uab92\uabaa\uab8e\uaba3\uabab\uab8e\uaba5\uab7c\uab79 \uab8e \uaba7\uab8e\uaba3\uab95\uabaf \uab70\uaba3\uab95\uaba9 \uab7c\uaba7.", + "text": "Cherokee (cased) Ꮒꭶꮣ ꭰꮒᏼꮻ ꭴꮎꮥꮕꭲ ꭴꮎꮪꮣꮄꮣ ꭰꮄ ꭱꮷꮃꭽꮙ ꮎꭲ ꭰꮲꮙꮩꮧ ꭰꮄ ꭴꮒꮂ ꭲᏻꮎꮫꮧꭲ. 
Ꮎꮝꭹꮎꮓ ꭴꮅꮝꭺꮈꮤꮕꭹ ꭴꮰꮿꮝꮧ ꮕᏸꮅꮫꭹ ꭰꮄ ꭰꮣꮕꮦꮯꮣꮝꮧ ꭰꮄ ꭱꮅꮝꮧ ꮟᏼꮻꭽ ꮒꮪꮎꮣꮫꮎꮥꭼꭹ ꮎ ꮧꮎꮣꮕꮯ ꭰꮣꮕꮩ ꭼꮧ.", "metadata": { "filetype": "text/plain", "data_source": { @@ -1637,7 +1637,7 @@ { "type": "NarrativeText", "element_id": "09009508dba31db1f130bf24d409614e", - "text": "Cherokee (uppercase) \u13c2\u13a6\u13d3 \u13a0\u13c2\u13f4\u13eb \u13a4\u13be\u13d5\u13c5\u13a2 \u13a4\u13be\u13da\u13d3\u13b4\u13d3 \u13a0\u13b4 \u13a1\u13e7\u13b3\u13ad\u13c9 \u13be\u13a2 \u13a0\u13e2\u13c9\u13d9\u13d7 \u13a0\u13b4 \u13a4\u13c2\u13b2 \u13a2\u13f3\u13be\u13db\u13d7\u13a2. \u13be\u13cd\u13a9\u13be\u13c3 \u13a4\u13b5\u13cd\u13aa\u13b8\u13d4\u13c5\u13a9 \u13a4\u13e0\u13ef\u13cd\u13d7 \u13c5\u13f0\u13b5\u13db\u13a9 \u13a0\u13b4 \u13a0\u13d3\u13c5\u13d6\u13df\u13d3\u13cd\u13d7 \u13a0\u13b4 \u13a1\u13b5\u13cd\u13d7 \u13cf\u13f4\u13eb\u13ad \u13c2\u13da\u13be\u13d3\u13db\u13be\u13d5\u13ac\u13a9 \u13be \u13d7\u13be\u13d3\u13c5\u13df \u13a0\u13d3\u13c5\u13d9 \u13ac\u13d7.", + "text": "Cherokee (uppercase) ᏂᎦᏓ ᎠᏂᏴᏫ ᎤᎾᏕᏅᎢ ᎤᎾᏚᏓᎴᏓ ᎠᎴ ᎡᏧᎳᎭᏉ ᎾᎢ ᎠᏢᏉᏙᏗ ᎠᎴ ᎤᏂᎲ ᎢᏳᎾᏛᏗᎢ. ᎾᏍᎩᎾᏃ ᎤᎵᏍᎪᎸᏔᏅᎩ ᎤᏠᏯᏍᏗ ᏅᏰᎵᏛᎩ ᎠᎴ ᎠᏓᏅᏖᏟᏓᏍᏗ ᎠᎴ ᎡᎵᏍᏗ ᏏᏴᏫᎭ ᏂᏚᎾᏓᏛᎾᏕᎬᎩ Ꮎ ᏗᎾᏓᏅᏟ ᎠᏓᏅᏙ ᎬᏗ.", "metadata": { "filetype": "text/plain", "data_source": { @@ -1655,7 +1655,7 @@ { "type": "NarrativeText", "element_id": "ca845e694f20fb1947def444cd1f59f9", - "text": "Chickasaw Himmaka' nittakookano hattak yokasht toksalicha'nikat ki'yo. Hattak m\u00f3\u0331makat itt\u00edllawwi b\u00edyyi'kacha nanna m\u00f3\u0331maka\u0331 ittibaachaffa'hitok.", + "text": "Chickasaw Himmaka' nittakookano hattak yokasht toksalicha'nikat ki'yo. Hattak mó̱makat ittíllawwi bíyyi'kacha nanna mó̱maka̱ ittibaachaffa'hitok.", "metadata": { "languages": [ "swa", @@ -1720,7 +1720,7 @@ { "type": "NarrativeText", "element_id": "2dc80f80340d36e85a551642585e592a", - "text": "Chin, Matu Thlangboeih he rhimomna, vanpitna, yalpona hamhmoel ka tawn thlang la cuun la ng\u2019om u. 
Thlanghing he athae-then paekboe thaina neh yakming thaina moeiboe ka tawn thlang la n\u2019om u dong ah khat neh khat lungvat na neh thloehlan voekhlak u thae ham om.", + "text": "Chin, Matu Thlangboeih he rhimomna, vanpitna, yalpona hamhmoel ka tawn thlang la cuun la ng’om u. Thlanghing he athae-then paekboe thaina neh yakming thaina moeiboe ka tawn thlang la n’om u dong ah khat neh khat lungvat na neh thloehlan voekhlak u thae ham om.", "metadata": { "languages": [ "tgl", @@ -1763,7 +1763,7 @@ { "type": "NarrativeText", "element_id": "66e7bb8d8db209646cecea79ecf23f89", - "text": "Chinantec, Chiltepec Lej\u0268\u0308 ni sou tsa lisia\u0331 ija\u0331a sia ikou' ne kojo\u0331 j\u00ef ne juso\u0331 ne jmo' re ju i s\u0268' jmo' n\u00f6 sala\u0331 ne sasno.", + "text": "Chinantec, Chiltepec Lejɨ̈ ni sou tsa lisia̱ ija̱a sia ikou' ne kojo̱ jï ne juso̱ ne jmo' re ju i sɨ' jmo' nö sala̱ ne sasno.", "metadata": { "languages": [ "hrv" @@ -1784,7 +1784,7 @@ { "type": "NarrativeText", "element_id": "b29e38dc8292efa10880271bbb145f07", - "text": "Chinantec, Ojitl\u00e1n La juu dsa lu si\u00e4 \u2013Dsa k\u00f6 \u00f1i ba dsa, n\u00eda k\u00f6 ni' ba na lu' dsa e dsa t\u00ef \u00e9 li jnia' ro\u00f6'.", + "text": "Chinantec, Ojitlán La juu dsa lu siä –Dsa kö ñi ba dsa, nía kö ni' ba na lu' dsa e dsa tï é li jnia' roö'.", "metadata": { "languages": [ "fin", @@ -1827,7 +1827,7 @@ { "type": "Title", "element_id": "be604439089a8fedd5abdc4d81187599", - "text": "\u4eba\u4eba\u751f\u800c\u81ea\u7531\uff0c\u5728\u5fd7\u5411\u8ddf\u6743\u5229\u4e0a\u4e00\u5f8b\u5e73\u7b49\u3002\u6e20\u4eec\u8d4b\u6709\u7406\u6027\u8ddf\u826f\u5fc3\uff0c\u5e76\u7406\u5f53\u4ee5\u5f1f\u5144\u4e49\u6c14\u76f8\u5bf9\u5f85\u3002", + "text": "人人生而自由,在志向跟权利上一律平等。渠们赋有理性跟良心,并理当以弟兄义气相对待。", "metadata": { "languages": [ "zho" @@ -1869,7 +1869,7 @@ { "type": "Title", "element_id": "05e53430ff030465078e511efc0de0b2", - "text": 
"\u4eba\u4eba\u751f\u800c\u81ea\u7531\uff0c\u5728\u5c0a\u4e25\u540c\u6743\u5229\u4e0a\u4e00\u5f8b\u5e73\u7b49\u3002\u4f62\u4e01\u4eba\u8d4b\u6709\u7406\u6027\u540c\u597d\u5fc3\u7530\uff0c\u5e76\u5e94\u4ee5\u5144\u5f1f\u5173\u7cfb\u4e2a\u7cbe\u795e\u76f8\u5bf9\u5f85\u3002", + "text": "人人生而自由,在尊严同权利上一律平等。佢丁人赋有理性同好心田,并应以兄弟关系个精神相对待。", "metadata": { "languages": [ "zho", @@ -1912,7 +1912,7 @@ { "type": "Title", "element_id": "549cb1628fe3e0cafb78cd92f08f0554", - "text": "\u4eba\u4eba\u751f\u800c\u81ea\u7531\uff0c\u5728\u5c0a\u4e25\u548c\u6743\u5229\u4e0a\u4e00\u5f8b\u5e73\u7b49\u3002\u4ed6\u4eec\u8d4b\u6709\u7406\u6027\u548c\u826f\u5fc3\uff0c\u5e76\u5e94\u4ee5\u5f1f\u5144\u5173\u7cfb\u7684\u7cbe\u795e\u76f8\u5bf9\u5f85\u3002", + "text": "人人生而自由,在尊严和权利上一律平等。他们赋有理性和良心,并应以弟兄关系的精神相对待。", "metadata": { "languages": [ "zho", @@ -1955,7 +1955,7 @@ { "type": "Title", "element_id": "bf0df306ed131c2adf4243ded3865e6a", - "text": "\u4eba\u4eba\u751f\u800c\u81ea\u7531\uff0c\u6328\u5c0a\u4e25\u548c\u6743\u5229\u4e0a\u4e00\u522c\u5e73\u7b49\u3002\u4ed6\u4eec\u8d81\u7406\u6027\u548c\u826f\u5fc3\uff0c\u5e76\u5e94\u4ee5\u4e00\u4e2a\u5ea7\u513f\u7684\u7cbe\u795e\u76f8\u5bf9\u5f85\u3002", + "text": "人人生而自由,挨尊严和权利上一刬平等。他们趁理性和良心,并应以一个座儿的精神相对待。", "metadata": { "languages": [ "zho", @@ -1998,7 +1998,7 @@ { "type": "Title", "element_id": "ba1e57780fc9d286c63be7e8e73e3c2e", - "text": "\u4eba\u4eba\u751f\u800c\u81ea\u7531\uff0c\u5728\u5c0a\u4e25\u548c\u6743\u5229\u4e0a\u4e00\u5f8b\u662f\u5e73\u7b49\u7684\u3002\u4ed6\u4eec\u8d4b\u6709\u7406\u6027\u548c\u826f\u5fc3\uff0c\u5e76\u5e94\u4ee5\u5144\u5f1f\u5173\u7cfb\u7684\u7cbe\u795e\u76f8\u4e92\u5bf9\u5f85\u3002", + "text": "人人生而自由,在尊严和权利上一律是平等的。他们赋有理性和良心,并应以兄弟关系的精神相互对待。", "metadata": { "languages": [ "zho" @@ -2040,7 +2040,7 @@ { "type": "Title", "element_id": "bdf44eafec897495cf404ac895e41ee3", - "text": 
"\u4eba\u4eba\u751f\u800c\u81ea\u7531\uff0c\u5728\u5c0a\u4e25\u548c\u6743\u5229\u4e4b\u4e0a\u4e00\u5f8b\u5e73\u7b49\u3002\u4ed6\u4eec\u8d4b\u6709\u7406\u6027\u548c\u826f\u5fc3\uff0c\u5e76\u5e94\u4ee5\u54e5\u4eec\u5f1f\u5144\u7684\u7cbe\u795e\u76f8\u5bf9\u5f85\u3002", + "text": "人人生而自由,在尊严和权利之上一律平等。他们赋有理性和良心,并应以哥们弟兄的精神相对待。", "metadata": { "languages": [ "zho" @@ -2082,7 +2082,7 @@ { "type": "Title", "element_id": "a96206ba057e6ac6c0fdb4c87d21a1c9", - "text": "\u5927\u5bb6\u751f\u800c\u81ea\u7531\uff0c\u5728\u5c0a\u4e25\u544a\u6743\u5229\u4e0a\u5934\u4e00\u5f8b\u5e73\u7b49\u3002\u4ed6\u4eec\u8d4b\u6709\u7406\u6027\u544a\u826f\u5fc3\uff0c\u5e76\u8be5\u6d3e\u4ee5\u5144\u5f1f\u5173\u7cfb\u7684\u7cbe\u795e\u76f8\u5bf9\u5f85\u3002", + "text": "大家生而自由,在尊严告权利上头一律平等。他们赋有理性告良心,并该派以兄弟关系的精神相对待。", "metadata": { "languages": [ "zho", @@ -2125,7 +2125,7 @@ { "type": "Title", "element_id": "c185fc727614ade15888d1e8c9a00c4d", - "text": "\u4eba\u4eba\u751f\u800c\u81ea\u7531,\u5728\u5c0a\u4e25\u548c\u6743\u5229\u4e0a\u4e00\u5f8b\u5e73\u7b49\u3002\u4ed6\u4eec\u8d4b\u6709\u7406\u6027\u548c\u826f\u5fc3,\u5e76\u5e94\u4ee5\u5144\u5f1f\u5173\u7cfb\u7684\u7cbe\u795e\u76f8\u5bf9\u5f85\u3002", + "text": "人人生而自由,在尊严和权利上一律平等。他们赋有理性和良心,并应以兄弟关系的精神相对待。", "metadata": { "languages": [ "zho", @@ -2168,7 +2168,7 @@ { "type": "Title", "element_id": "9e8a7703ae5139a2870b236cfa54cfd6", - "text": "\u4eba\u4e2a\u9876\u4e2a\u751f\u800c\u81ea\u7531\uff0c\u5728\u5c0a\u4e25\u548c\u6743\u5229\u4e0a\u822c\u513f\u822c\u513f\u5927\u3002\u4ed6\u4eec\u8d81\u7406\u6027\u548c\u826f\u5fc3\uff0c\u5e76\u5e94\u4ee5\u5144\u5f1f\u5173\u7cfb\u7684\u7cbe\u795e\u76f8\u5bf9\u5f85\u3002", + "text": "人个顶个生而自由,在尊严和权利上般儿般儿大。他们趁理性和良心,并应以兄弟关系的精神相对待。", "metadata": { "languages": [ "zho" @@ -2210,7 +2210,7 @@ { "type": "Title", "element_id": "0e1d6539c2001d2ba8e3188f43b83f7f", - "text": 
"\u4eba\u4eba\u751f\u800c\u81ea\u7531\uff0c\u5728\u5c0a\u56b4\u548c\u6b0a\u5229\u4e0a\u4e00\u5f8b\u5e73\u7b49\u3002\u4ed6\u5011\u8ce6\u6709\u7406\u6027\u548c\u826f\u5fc3\uff0c\u4e26\u61c9\u4ee5\u5144\u5f1f\u95dc\u4fc2\u7684\u7cbe\u795e\u76f8\u5c0d\u5f85\u3002", + "text": "人人生而自由,在尊嚴和權利上一律平等。他們賦有理性和良心,並應以兄弟關係的精神相對待。", "metadata": { "languages": [ "kor", @@ -2253,7 +2253,7 @@ { "type": "Title", "element_id": "48659e28c3b04b69caeaa16aded28f58", - "text": "\u4eba\u4eba\u751f\u800c\u81ea\u7531\uff0c\u5728\u5c0a\u4e25\u5408\u6743\u5229\u4e0a\u4e00\u5f8b\u5e73\u7b49\u3002\u56e0\u8d4b\u6709\u813e\u80c3\u5408\u9053\u884c\uff0c\u5e76\u7740\u4ee5\u5144\u5f1f\u5173\u7cfb\u7684\u7cbe\u795e\u76f8\u5bf9\u5f85\u3002", + "text": "人人生而自由,在尊严合权利上一律平等。因赋有脾胃合道行,并着以兄弟关系的精神相对待。", "metadata": { "languages": [ "zho", @@ -2296,7 +2296,7 @@ { "type": "Title", "element_id": "c8272c39e78f413c6902b423da92287d", - "text": "\u4eba\u4eba\u751f\u800c\u81ea\u7531\uff0c\u62c9\u5c0a\u4e25\u8131\u4ed4\u6743\u5229\u4e0a\u4e00\u5f8b\u5e73\u7b49\u3002\u4f0a\u62c9\u6709\u7406\u6027\u8131\u4ed4\u826f\u5fc3\uff0c\u5e76\u5e94\u4ee5\u5144\u5f1f\u5173\u7cfb\u4e2a\u7cbe\u795e\u76f8\u5bf9\u5f85\u3002", + "text": "人人生而自由,拉尊严脱仔权利上一律平等。伊拉有理性脱仔良心,并应以兄弟关系个精神相对待。", "metadata": { "languages": [ "zho", @@ -2339,7 +2339,7 @@ { "type": "Title", "element_id": "7d70d884e74db8b4302ba0589166c634", - "text": "\u4eba\u4eba\u751f\u800c\u81ea\u7531\uff0c\u5728\u5c0a\u4e25\u548c\u6743\u5229\u4e0a\u4e00\u5f8b\u5e73\u7b49\u3002\u4ed6\u4eec\u8d4b\u6709\u7406\u6027\u548c\u826f\u5fc3\uff0c\u5728\u5f97\u4ee5\u5144\u5f1f\u5173\u7cfb\u7684\u7cbe\u795e\u76f8\u5bf9\u5f85\u3002", + "text": "人人生而自由,在尊严和权利上一律平等。他们赋有理性和良心,在得以兄弟关系的精神相对待。", "metadata": { "languages": [ "zho" @@ -2381,7 +2381,7 @@ { "type": "Title", "element_id": "932a20508f1be7b3c6fa54b0f9e46f14", - "text": 
"\u4eba\u4eba\u751f\u800c\u5e73\u7b49\uff0c\u55ba\u5c0a\u4e25\u540c\u57cb\u6743\u5229\u4e0a\u4e00\u5f8b\u5e73\u7b49\u3002\u4f62\u54cb\u6709\u7406\u6027\u540c\u57cb\u826f\u5fc3\uff0c\u800c\u4e14\u5e94\u5f53\u4ee5\u5144\u5f1f\u5173\u7cfb\u5605\u7cbe\u795e\u76f8\u5bf9\u5f85\u3002", + "text": "人人生而平等,喺尊严同埋权利上一律平等。佢哋有理性同埋良心,而且应当以兄弟关系嘅精神相对待。", "metadata": { "languages": [ "kor", @@ -2467,7 +2467,7 @@ { "type": "NarrativeText", "element_id": "93683f443b25a57d05bfb3b2ab1533a8", - "text": "Chuvash \u041f\u0443\u0440 \u0445\u0430\u043b\u04d1\u0445 \u0442\u0430 \u0443\u0439\u0440\u04d1\u043c \u043f\u0443\u0440\u04d1\u043d\u043c\u0430 \u043f\u04d7\u0440 \u0442\u0430\u043d \u043f\u0440\u0430\u0432\u0430\u043b\u043b\u04d1. \u04aa\u0430\u043a \u043f\u0440\u0430\u0432\u0430\u043f\u0430 \u0443\u0441\u04d1 \u043a\u0443\u0440\u0441\u0430 \u0432\u04d7\u0441\u0435\u043c \u0445\u04d1\u0439\u0441\u0435\u043d \u043f\u043e\u043b\u0438\u0442\u0438\u043a\u0430 \u0441\u0442\u0430\u0442\u0443\u0441\u043d\u0435 \u0438\u0440\u04d7\u043a\u043b\u04d7\u043d \u0442\u0443\u0441\u0430 \u0445\u0443\u0440\u0430\u04ab\u04ab\u04d7, \u044d\u043a\u043e\u043d\u043e\u043c\u0438\u043a\u0430, \u043e\u0431\u0449\u0435\u0441\u0442\u0432\u043e \u0442\u0430\u0442\u0430 \u043a\u0443\u043b\u044c\u0442\u0443\u0440\u0430 \u0435\u043d\u04d7\u043f\u0435 \u0438\u0440\u04d7\u043a\u043b\u04d7\u043d \u0430\u0442\u0430\u043b\u0430\u043d\u0430\u04ab\u04ab\u04d7. 
\u041f\u0430\u0442\u0448\u0430\u043b\u04d1\u0445\u0441\u0435\u043d \u04ab\u0430\u043a \u043f\u0440\u0430\u0432\u04d1\u043d\u0430 \u0445\u0438\u0441\u0435\u043f\u043b\u0435\u043c\u0435\u043b\u043b\u0435, \u0442\u0435\u0440\u0440\u0438\u0442\u043e\u0440\u0438 \u043f\u04d7\u0440 \u043f\u04d7\u0442\u04d7\u043c\u043b\u04d7\u0445\u04d7\u043d \u043f\u0440\u0438\u043d\u0446\u0438\u043f\u04d7 \u0443\u043d\u043f\u0430 \u0443\u0441\u04d1 \u043a\u0443\u0440\u043c\u0430 \u043f\u04d7\u0440 \u0435\u043d\u043b\u04d7\u043d \u0447\u0430\u0440\u0441\u0430 \u0442\u04d1\u0440\u0430\u043a\u0430\u043d \u0447\u04d1\u0440\u043c\u0430\u0432 \u043f\u0443\u043b\u043c\u0430\u043b\u043b\u0430 \u043c\u0430\u0440.", + "text": "Chuvash Пур халӑх та уйрӑм пурӑнма пӗр тан праваллӑ. Ҫак правапа усӑ курса вӗсем хӑйсен политика статусне ирӗклӗн туса хураҫҫӗ, экономика, общество тата культура енӗпе ирӗклӗн аталанаҫҫӗ. Патшалӑхсен ҫак правӑна хисеплемелле, территори пӗр пӗтӗмлӗхӗн принципӗ унпа усӑ курма пӗр енлӗн чарса тӑракан чӑрмав пулмалла мар.", "metadata": { "languages": [ "rus", @@ -2511,7 +2511,7 @@ { "type": "NarrativeText", "element_id": "7829c582fafb0be79ca15885a9ffe253", - "text": "Comorian, Maore Wanadamu piya udzalwa huru tsena sawa ha ufahari na ha haki. Na wawo wana \u00e3kili na hisi, esa ilazimu wadzivhinge na wanyao ha fikira ya unanya.", + "text": "Comorian, Maore Wanadamu piya udzalwa huru tsena sawa ha ufahari na ha haki. Na wawo wana ãkili na hisi, esa ilazimu wadzivhinge na wanyao ha fikira ya unanya.", "metadata": { "languages": [ "swa" @@ -2553,7 +2553,7 @@ { "type": "NarrativeText", "element_id": "8aea2ff9710269cb8bdfd811de62b8cd", - "text": "Corsican Nascinu tutti l\u2019omi libari \u00e8 pari di dignit\u00e0 \u00e8 di diritti. Pussedinu a raghjoni \u00e8 a cuscenza \u00e8 li tocca ad agiscia tr\u00e0 elli di modu fraternu.", + "text": "Corsican Nascinu tutti l’omi libari è pari di dignità è di diritti. 
Pussedinu a raghjoni è a cuscenza è li tocca ad agiscia trà elli di modu fraternu.", "metadata": { "languages": [ "ita" @@ -2574,7 +2574,7 @@ { "type": "UncategorizedText", "element_id": "7174e554bd11372c5e339ba08b9881ab", - "text": "Cree, Swampy \u14a5\u14ef\u140c \u1403\u14c2\u14c2\u1424 \u144e\u142f\u14c2\u14a5\u144e\u14f1\u140e\u14c2\u1420 \u1401\u1511 \u14c2\u1455\u140e\u146d\u141f \u14c0\u1422\u1455 \u142f\u152d\u147e\u1423 \u146d\u148b \u1403\u1511 \u1472\u14c7\u1417\u1438\u14a5\u146f\u140e\u14ef\u141f \u146d\u1422\u144c\u14c2\u14a5\u144e\u14f1\u140e\u14c2\u1420 \u14c0\u1422\u1455 \u14a5\u14c2\u146f\u140e\u14ef\u140e\u14c7\u166e \u1401 \u1438\u146d\u144e\u14c7\u14aa\u148b\u1420 \u1472\u146b\u1455\u140c\u14c2\u1455\u14a7\u140e\u14c2\u14c2\u1424 \u14c0\u1422\u1455 \u14a5\u1450\u14c0\u14c2\u148b\u1472\u14c2\u14c2\u1424 \u14c0\u1422\u1455 \u140e\u148b\u1474\u14ef\u1450\u140e\u14c2\u1420 \u146d\u148b \u1403\u1511 \u1472\u14c7\u1417\u1438\u14a5\u1450\u148b\u1420\u166e", + "text": "Cree, Swampy ᒥᓯᐌ ᐃᓂᓂᐤ ᑎᐯᓂᒥᑎᓱᐎᓂᐠ ᐁᔑ ᓂᑕᐎᑭᐟ ᓀᐢᑕ ᐯᔭᑾᐣ ᑭᒋ ᐃᔑ ᑲᓇᐗᐸᒥᑯᐎᓯᐟ ᑭᐢᑌᓂᒥᑎᓱᐎᓂᐠ ᓀᐢᑕ ᒥᓂᑯᐎᓯᐎᓇ᙮ ᐁ ᐸᑭᑎᓇᒪᒋᐠ ᑲᑫᑕᐌᓂᑕᒧᐎᓂᓂᐤ ᓀᐢᑕ ᒥᑐᓀᓂᒋᑲᓂᓂᐤ ᓀᐢᑕ ᐎᒋᑴᓯᑐᐎᓂᐠ ᑭᒋ ᐃᔑ ᑲᓇᐗᐸᒥᑐᒋᐠ᙮", "metadata": { "filetype": "text/plain", "data_source": { @@ -2592,7 +2592,7 @@ { "type": "NarrativeText", "element_id": "952f38639569c0ef489cc6ebb4e809a7", - "text": "Crimean Tatar B\u00fct\u00fcn insanlar serbestlik, menlik ve uquqlarda musaviy ol\u0131p d\u00fcnya\u011fa keleler. Olar aq\u0131l ve vicdan saibidirler ve biri-birilerinen qarda\u015f\u00e7as\u0131na munasebette bulunmal\u0131d\u0131rlar", + "text": "Crimean Tatar Bütün insanlar serbestlik, menlik ve uquqlarda musaviy olıp dünyağa keleler. Olar aqıl ve vicdan saibidirler ve biri-birilerinen qardaşçasına munasebette bulunmalıdırlar", "metadata": { "languages": [ "tur" @@ -2613,7 +2613,7 @@ { "type": "NarrativeText", "element_id": "2ed33ba01de24e402f5963e9b2b56328", - "text": "Crioulo, Upper Guinea Tudu pekaduris ta padidu libri i igual na balur suma na diritus. 
Suma e dadu kapasidadi di pensa, e tene tambi konsiensia, e dibi di trata \u00f1utru suma ermons.", + "text": "Crioulo, Upper Guinea Tudu pekaduris ta padidu libri i igual na balur suma na diritus. Suma e dadu kapasidadi di pensa, e tene tambi konsiensia, e dibi di trata ñutru suma ermons.", "metadata": { "languages": [ "ind", @@ -2635,7 +2635,7 @@ { "type": "NarrativeText", "element_id": "8eb33fe9d9a2a68e6a146718f7b97d24", - "text": "Crioulo, Upper Guinea (008) Tudu pecadur padidu livre, ninguin ca m\u00e1s ninguin, tudu djusta, tudu tem mesmu diritu. Tudu quin qui padidu, tem si ro\u00e7on, cu si manera di pensa. Na metadi di utrus I d\u00edbidi fassi cussas cu ermondadi.", + "text": "Crioulo, Upper Guinea (008) Tudu pecadur padidu livre, ninguin ca más ninguin, tudu djusta, tudu tem mesmu diritu. Tudu quin qui padidu, tem si roçon, cu si manera di pensa. Na metadi di utrus I díbidi fassi cussas cu ermondadi.", "metadata": { "languages": [ "ita", @@ -2659,7 +2659,7 @@ { "type": "NarrativeText", "element_id": "9a87923b32ddc3eb20ab733920e58198", - "text": "Croatian Sva ljudska bi\u0107a ra\u0111aju se slobodna i jednaka u dostojanstvu i pravima. Ona su obdarena razumom i svije\u0161\u0107u i treba da jedno prema drugome postupaju u duhu bratstva.", + "text": "Croatian Sva ljudska bića rađaju se slobodna i jednaka u dostojanstvu i pravima. Ona su obdarena razumom i sviješću i treba da jedno prema drugome postupaju u duhu bratstva.", "metadata": { "languages": [ "hrv" @@ -2680,7 +2680,7 @@ { "type": "NarrativeText", "element_id": "0666ab63ad7ac65ec7290cb18d27749d", - "text": "Czech V\u0161ichni lid\u00e9 rod\u00ed se svobodn\u00ed a sob\u011b rovn\u00ed co do d\u016fstojnosti a pr\u00e1v. Jsou nad\u00e1ni rozumem a sv\u011bdom\u00edm a maj\u00ed spolu jednat v duchu bratrstv\u00ed.", + "text": "Czech Všichni lidé rodí se svobodní a sobě rovní co do důstojnosti a práv. 
Jsou nadáni rozumem a svědomím a mají spolu jednat v duchu bratrství.", "metadata": { "languages": [ "ces" @@ -2701,7 +2701,7 @@ { "type": "NarrativeText", "element_id": "cb7b177025447a197e5f95166eeb0282", - "text": "Dagaare, Southern Nengsaala zaa ba nang d\u0254ge so la o menga, ka o ne o taaba zaa sengtaa noba emmo ane y\u025bl\u025bsoobo sobic po\u0254. Ba d\u0254g\u025b\u025b ba zaa ne y\u025bng ane y\u025bl\u025b-iruu k'a da seng ka ba er\u025b y\u025bl\u025b kor\u0254 taa a nga y\u0254\u0254mine.", + "text": "Dagaare, Southern Nengsaala zaa ba nang dɔge so la o menga, ka o ne o taaba zaa sengtaa noba emmo ane yɛlɛsoobo sobic poɔ. Ba dɔgɛɛ ba zaa ne yɛng ane yɛlɛ-iruu k'a da seng ka ba erɛ yɛlɛ korɔ taa a nga yɔɔmine.", "metadata": { "languages": [ "tgl", @@ -2723,7 +2723,7 @@ { "type": "NarrativeText", "element_id": "8e66c9e0bff4a344e85d8767b43fd67a", - "text": "Dagbani Sal' la sala. B\u025bhig' be sokam sanimi, din pa la amii. Suhiz\u0254bo be sokam sani; ka namb\u0254\u0263u beni. Suhub\u0254hibo mi bi lan k\u0254\u014b yigunaadam kam sani. Dinzu\u0263u dimb\u0254\u014b\u0254 zaa wuhiya ka dama di tu kamaata ka ti zaa yu tab' hali ni ti puuni.", + "text": "Dagbani Sal' la sala. Bɛhig' be sokam sanimi, din pa la amii. Suhizɔbo be sokam sani; ka nambɔɣu beni. Suhubɔhibo mi bi lan kɔŋ yigunaadam kam sani. Dinzuɣu dimbɔŋɔ zaa wuhiya ka dama di tu kamaata ka ti zaa yu tab' hali ni ti puuni.", "metadata": { "languages": [ "swa", @@ -2746,7 +2746,7 @@ { "type": "NarrativeText", "element_id": "b90d9e9d9c05b4f6982b37bbe3c37e9f", - "text": "Dangme Adesahi tsuo \u0254, a b\u0254 m\u025b n\u025b n\u0254 f\u025b\u025b n\u0254 e ye e he, n\u025b n\u0254 tsuaa n\u0254s\u0254 ng\u025b odehe si himi k\u025b he bl\u0254hi a bl\u0254 fa mi. 
A b\u0254 m\u025b k\u025b n\u0254\u0301 se k\u0254mi k\u025b he nule ju\u025bmi, n\u025b e hia kaa n\u0254 f\u025b\u025b n\u0254 n\u025b e na ny\u025bmi su\u0254mi k\u025b ha n\u0254 tsuaa n\u0254.", + "text": "Dangme Adesahi tsuo ɔ, a bɔ mɛ nɛ nɔ fɛɛ nɔ e ye e he, nɛ nɔ tsuaa nɔsɔ ngɛ odehe si himi kɛ he blɔhi a blɔ fa mi. A bɔ mɛ kɛ nɔ́ se kɔmi kɛ he nule juɛmi, nɛ e hia kaa nɔ fɛɛ nɔ nɛ e na nyɛmi suɔmi kɛ ha nɔ tsuaa nɔ.", "metadata": { "languages": [ "sqi", @@ -2769,7 +2769,7 @@ { "type": "NarrativeText", "element_id": "334d7844545ea360de232426f24cc228", - "text": "Danish Alle mennesker er f\u00f8dt frie og lige i v\u00e6rdighed og rettigheder. De er udstyret med fornuft og samvittighed, og de b\u00f8r handle mod hverandre i en broderskabets \u00e5nd.", + "text": "Danish Alle mennesker er født frie og lige i værdighed og rettigheder. De er udstyret med fornuft og samvittighed, og de bør handle mod hverandre i en broderskabets ånd.", "metadata": { "languages": [ "dan" @@ -2790,7 +2790,7 @@ { "type": "NarrativeText", "element_id": "12deb838666ab6083a3dba9696b9fba1", - "text": "Dari \u062a\u0645\u0627\u0645 \u0627\u0641\u0631\u0627\u062f \u0628\u0634\u0631 \u0622\u0632\u0627\u062f \u0628\u0647 \u062f\u0646\u06cc\u0627 \u0645\u06cc\u200c\u0622\u06cc\u0646\u062f \u0648 \u0627\u0632 \u0644\u062d\u0627\u0638 \u062d\u06cc\u062b\u06cc\u062a \u0648 \u062d\u0642\u0648\u0642 \u0628\u0627 \u0647\u0645 \u0628\u0631\u0627\u0628\u0631\u0646\u062f. \u0647\u0645\u0647 \u062f\u0627\u0631\u0627\u06cc \u0639\u0642\u0644 \u0648 \u0648\u062c\u062f\u0627\u0646 \u0647\u0633\u062a\u0646\u062f \u0648 \u0628\u0627\u06cc\u062f \u0646\u0633\u0628\u062a \u0628\u0647 \u06cc\u06a9\u062f\u06cc\u06af\u0631 \u0628\u0627 \u0631\u0648\u062d \u0628\u0631\u0627\u062f\u0631\u06cc \u0631\u0641\u062a\u0627\u0631 \u06a9\u0646\u0646\u062f.", + "text": "Dari تمام افراد بشر آزاد به دنیا می‌آیند و از لحاظ حیثیت و حقوق با هم برابرند. 
همه دارای عقل و وجدان هستند و باید نسبت به یکدیگر با روح برادری رفتار کنند.", "metadata": { "languages": [ "fas" @@ -2811,7 +2811,7 @@ { "type": "NarrativeText", "element_id": "3551715d069482f6ec4dba0cd2418882", - "text": "Dendi Aduniya kuna n gu ibuna damayo h\u025bi n\u0254 dei-dei nn daama nna n burucinit\u025br\u025b f\u0254, n lasabu nna laakari ya nam nn m\u0254 huro c\u025br\u025b kuna nyanze t\u025br\u025b b\u0254\u014b\u0254\u0254.", + "text": "Dendi Aduniya kuna n gu ibuna damayo hɛi nɔ dei-dei nn daama nna n burucinitɛrɛ fɔ, n lasabu nna laakari ya nam nn mɔ huro cɛrɛ kuna nyanze tɛrɛ bɔŋɔɔ.", "metadata": { "languages": [ "swa", @@ -2834,7 +2834,7 @@ { "type": "NarrativeText", "element_id": "ac128efe598097cdb68a483b1ea1f22c", - "text": "Dinka, Northeastern Raan th\u00f6k eben aye dh\u00eb\u00ebth ka lau nh\u00f6m kua th\u00f6\u014b nhiim eyithiic, kua th\u025b\u0308kic, kua ci y\u00ebknhiethku puou, ku bik c\u00eb\u014b ka ke ye mith etik.", + "text": "Dinka, Northeastern Raan thök eben aye dhëëth ka lau nhöm kua thöŋ nhiim eyithiic, kua thɛ̈kic, kua ci yëknhiethku puou, ku bik cëŋ ka ke ye mith etik.", "metadata": { "languages": [ "sqi", @@ -2856,7 +2856,7 @@ { "type": "NarrativeText", "element_id": "377f3dff94511f4733f9a8fa47685f8a", - "text": "Ditammari Oniti ti p\u025bi n\u0256\u025b om\u0254\u0169 yi kpaatri ot\u0254u, k\u025b y\u025b\u0303 oniti ba we, o yi \u0256o nn\u025b f\u025bh\u0254\u0303f\u025b; o m\u0254k\u025bmu m\u025bcii k\u025bh\u00e3 m\u025by\u025bmm\u025b. Ti t\u00fa n\u025b \u0256o kenyari ti t\u0254b\u025b mb\u025b k\u025b yie mii ba nkwu\u0254 ko ot\u0254u \u0256au.", + "text": "Ditammari Oniti ti pɛi nɖɛ omɔũ yi kpaatri otɔu, kɛ yɛ̃ oniti ba we, o yi ɖo nnɛ fɛhɔ̃fɛ; o mɔkɛmu mɛcii kɛhã mɛyɛmmɛ. 
Ti tú nɛ ɖo kenyari ti tɔbɛ mbɛ kɛ yie mii ba nkwuɔ ko otɔu ɖau.", "metadata": { "languages": [ "swa", @@ -2922,7 +2922,7 @@ { "type": "Title", "element_id": "58343bf1070d7f16553f03d984ab9241", - "text": "Dzongkha \u0f60\u0f42\u0fb2\u0f7c\u0f0b\u0f56\u0f0b\u0f58\u0f72\u0f0b\u0f5a\u0f74\u0f0b\u0f42\u0f0b\u0f62\u0f0b\u0f51\u0f63\u0f0b\u0f51\u0f56\u0f44\u0f0b\u0f42\u0f72\u0f0b\u0f50\u0f7c\u0f42\u0f0b\u0f63\u0f66\u0f0b\u0f66\u0f90\u0fb1\u0f7a\u0f66\u0f0b\u0f4f\u0f7a\u0f0b\u0f61\u0f7c\u0f51\u0f54\u0f0b\u0f63\u0f66\u0f0b \u0f42\u0f0b\u0f62\u0f0b\u0f63\u0f74\u0f0b\u0f56\u0f62\u0fa9\u0f72\u0f0b\u0f58\u0f50\u0f7c\u0f44\u0f0b\u0f51\u0f44\u0f0b\u0f50\u0f7c\u0f56\u0f0b\u0f51\u0f56\u0f44\u0f0b\u0f60\u0f51\u0fb2\u0f0b\u0f58\u0f49\u0f58\u0f0b\u0f66\u0fa6\u0f7a\u0f0b\u0f61\u0f7c\u0f51\u0f0d \u0f58\u0f72\u0f0b\u0f5a\u0f74\u0f0b\u0f42\u0f0b\u0f62\u0f0b\u0f66\u0fa8\u0fb2\u0f0b\u0f64\u0f7a\u0f66\u0f0b\u0f51\u0f7c\u0f53\u0f0b\u0f42\u0f7c\u0f0b\u0f56\u0f60\u0f72\u0f0b\u0f58\u0f5a\u0f53\u0f0b\u0f49\u0f72\u0f51\u0f0b\u0f51\u0f44\u0f0b\u0f63\u0fa1\u0f53\u0f58\u0f0b\u0f63\u0f66\u0f0b \u0f42\u0f0b\u0f62\u0f0b\u0f42\u0f72\u0f66\u0f0b\u0f63\u0f71\u0f0b\u0f42\u0f0b\u0f45\u0f72\u0f0b\u0f62\u0f0b\u0f60\u0f56\u0f51\u0f0b\u0f62\u0f74\u0f44\u0f0b \u0f42\u0f45\u0f72\u0f42\u0f0b\u0f42\u0f72\u0f66\u0f0b\u0f42\u0f45\u0f72\u0f42\u0f0b\u0f63\u0f74\u0f0b\u0f66\u0fa4\u0f74\u0f53\u0f0b\u0f46\u0f60\u0f72\u0f0b\u0f60\u0f51\u0f74\u0f0b\u0f64\u0f7a\u0f66\u0f0b\u0f56\u0f66\u0f90\u0fb1\u0f7a\u0f51\u0f0b\u0f50\u0f7c\u0f42\u0f0b\u0f63\u0f66\u0f0b\u0f63\u0f71\u0f0b\u0f60\u0f56\u0f51\u0f0b\u0f51\u0f42\u0f7c\u0f0d", + "text": "Dzongkha འགྲོ་བ་མི་ཚུ་ག་ར་དལ་དབང་གི་ཐོག་ལས་སྐྱེས་ཏེ་ཡོདཔ་ལས་ ག་ར་ལུ་བརྩི་མཐོང་དང་ཐོབ་དབང་འདྲ་མཉམ་སྦེ་ཡོད། མི་ཚུ་ག་ར་སྨྲ་ཤེས་དོན་གོ་བའི་མཚན་ཉིད་དང་ལྡནམ་ལས་ ག་ར་གིས་ལཱ་ག་ཅི་ར་འབད་རུང་ གཅིག་གིས་གཅིག་ལུ་སྤུན་ཆའི་འདུ་ཤེས་བསྐྱེད་ཐོག་ལས་ལཱ་འབད་དགོ།", "metadata": { "filetype": "text/plain", "data_source": { @@ -2982,7 +2982,7 @@ { "type": "NarrativeText", "element_id": "7d5794631564e8ff8a2bf245087903a4", 
- "text": "Ese Ejja Ojja\u00f1a esejja ojja\u00f1a oyaja yojjaya cuayani quiapame oyajayojjaya quiapame ojja\u00f1a eseya quiapame quia tai jjashauabataiquiani ecueya epejji jayo jjaya ojja\u00f1a jajji ojja\u00f1ajaassi eseyajayojja.", + "text": "Ese Ejja Ojjaña esejja ojjaña oyaja yojjaya cuayani quiapame oyajayojjaya quiapame ojjaña eseya quiapame quia tai jjashauabataiquiani ecueya epejji jayo jjaya ojjaña jajji ojjañajaassi eseyajayojja.", "metadata": { "languages": [ "swa", @@ -3006,7 +3006,7 @@ { "type": "NarrativeText", "element_id": "5f8fd43155bbf931b71069f21ba6a609", - "text": "Esperanto \u0108iuj homoj estas denaske liberaj kaj egalaj la\u016d digno kaj rajtoj. Ili posedas racion kaj konsciencon, kaj devus konduti unu al alia en spirito de frateco.", + "text": "Esperanto Ĉiuj homoj estas denaske liberaj kaj egalaj laŭ digno kaj rajtoj. Ili posedas racion kaj konsciencon, kaj devus konduti unu al alia en spirito de frateco.", "metadata": { "languages": [ "slv", @@ -3028,7 +3028,7 @@ { "type": "NarrativeText", "element_id": "e59c6075ee4dbde4faa66c2bdc180029", - "text": "Estonian K\u00f5ik inimesed s\u00fcnnivad vabadena ja v\u00f5rdsetena oma v\u00e4\u00e4rikuselt ja \u00f5igustelt. Neile on antud m\u00f5istus ja s\u00fcdametunnistus ja nende suhtumist \u00fcksteisesse peab kandma vendluse vaim.", + "text": "Estonian Kõik inimesed sünnivad vabadena ja võrdsetena oma väärikuselt ja õigustelt. 
Neile on antud mõistus ja südametunnistus ja nende suhtumist üksteisesse peab kandma vendluse vaim.", "metadata": { "languages": [ "est" @@ -3049,7 +3049,7 @@ { "type": "NarrativeText", "element_id": "699838930374f69143263bd99d88883e", - "text": "Even \u0411\u044d\u0439\u0438\u043b \u0431\u043e\u043a\u044d\u0442\u0447\u0443\u0440 \u043e\u043c\u044d\u043d \u0445\u0438\u043b\u043a\u0438\u0447 \u043d\u044f\u043d \u0443\u0440\u0443\u043c\u043a\u044d\u0440 \u0431\u0430\u043b\u0434\u0430\u0440\u0438\u0442\u043d\u043e, \u0442\u0435\u043c\u0438 \u043d\u043e\u04a5\u0430\u0440\u0434\u0443\u043a \u044d\u0433\u0434\u044c\u044d\u043d \u04a5\u0438\u2010\u0434\u0430 \u0430\u0447\u0447\u0430. \u0411\u044d\u0439\u0438\u043b \u0431\u04e9\u043a\u044d\u0442\u0447\u0443\u0440 \u043c\u044d\u043d \u0434\u043e\u043b\u0430\u043d \u0430\u043a\u0430\u0433\u0447\u0438\u043c\u0443\u0440 \u0431\u0438\u043d\u043d\u044d\u0442\u044b\u043d.", + "text": "Even Бэйил бокэтчур омэн хилкич нян урумкэр балдаритно, теми ноҥардук эгдьэн ҥи‐да ачча. Бэйил бөкэтчур мэн долан акагчимур биннэтын.", "metadata": { "languages": [ "rus" @@ -3070,7 +3070,7 @@ { "type": "NarrativeText", "element_id": "8164afd787069e69d3a6bed633cfdb21", - "text": "Evenki \u0423\u043f\u043a\u0430\u0442 \u0438\u043b\u044d\u043b \u0442\u044b\u0304\u043d\u043c\u0443\u043a\u0438\u0440\u0434\u0438, \u0443\u0440\u044d\u0304\u043b\u0434\u0438 \u043c\u044d\u0304\u043d\u04a3\u0438 \u0441\u0430\u0304\u0440\u0438\u0447\u0430\u0304\u0434\u0438 \u0431\u0430\u043b\u0434\u044b\u0434\u044f\u0440\u0430. 
\u041d\u0443\u04a3\u0430\u0440\u0442\u044b\u043d \u0434\u044f\u043b\u0438\u0442\u0432\u0438, \u04bb\u0430\u043b\u0434\u044f\u043d\u0434\u044b\u0432\u0438 \u0431\u0438\u0441\u0438, \u043c\u044d\u043c\u044d\u0433\u0438\u0304\u043b\u0432\u044d\u0440 \u0430\u044f\u0440\u0430\u043b\u0434\u044b\u0304\u0434\u044f\u043d\u0430 \u0442\u044d\u0434\u0435\u0442 \u043e\u0304\u043c\u0430\u043c\u0430\u0447\u0438\u0442\u044b\u043d.", + "text": "Evenki Упкат илэл ты̄нмукирди, урэ̄лди мэ̄нңи са̄рича̄ди балдыдяра. Нуңартын дялитви, һалдяндыви биси, мэмэгӣлвэр аяралды̄дяна тэдет о̄мамачитын.", "metadata": { "languages": [ "rus" @@ -3091,7 +3091,7 @@ { "type": "NarrativeText", "element_id": "8ba9631d337f32fb2b5a0049718f7162", - "text": "\u00c9w\u00e9 Wodzi amegbet\u0254wo kata\u0303 abl\u0254\u0256eviwoe eye wodzena bubu kple gomekp\u0254kp\u0254 s\u0254s\u0254e. Susu kple dzitsinya le wo domet\u0254 \u0256esia\u0256e si eyata wodze be woan\u0254 anyi le \u0256ekaw\u0254w\u0254 blibo me.", + "text": "Éwé Wodzi amegbetɔwo katã ablɔɖeviwoe eye wodzena bubu kple gomekpɔkpɔ sɔsɔe. Susu kple dzitsinya le wo dometɔ ɖesiaɖe si eyata wodze be woanɔ anyi le ɖekawɔwɔ blibo me.", "metadata": { "languages": [ "pol" @@ -3112,7 +3112,7 @@ { "type": "NarrativeText", "element_id": "4dad8f50be71b880b8d1cd3aa2083177", - "text": "Fante W\u0254wo adasa nyina to fahodzi mu, na h\u0254n nyina y\u025b p\u025br w\u0254 enyimnyam na ndzinoa mu. W\u0254maa h\u0254n nyina adwen na tsibowa, na \u0254w\u0254 d\u025b h\u0254n nkitahodzi mu ndzey\u025b\u025b da no edzi d\u025b w\u0254y\u025b enuanom.", + "text": "Fante Wɔwo adasa nyina to fahodzi mu, na hɔn nyina yɛ pɛr wɔ enyimnyam na ndzinoa mu. 
Wɔmaa hɔn nyina adwen na tsibowa, na ɔwɔ dɛ hɔn nkitahodzi mu ndzeyɛɛ da no edzi dɛ wɔyɛ enuanom.", "metadata": { "languages": [ "swa", @@ -3134,7 +3134,7 @@ { "type": "NarrativeText", "element_id": "f8e68d4590ad494f5d3039e113c1ac46", - "text": "Faroese \u00d8ll menniskju eru f\u00f8dd fr\u00e6ls og j\u00f8vn til vir\u00f0ingar og mannar\u00e6ttindi. Tey hava skil og samvitsku og eiga at fara hv\u00f8rt um anna\u00f0 \u00ed br\u00f3\u00f0uranda.", + "text": "Faroese Øll menniskju eru fødd fræls og jøvn til virðingar og mannarættindi. Tey hava skil og samvitsku og eiga at fara hvørt um annað í bróðuranda.", "metadata": { "languages": [ "nor" @@ -3155,7 +3155,7 @@ { "type": "NarrativeText", "element_id": "2f3af719eba5f3392f87df0894e56c42", - "text": "Farsi, Western \u062a\u0645\u0627\u0645 \u0627\u0641\u0631\u0627\u062f \u0628\u0634\u0631 \u0622\u0632\u0627\u062f \u0628\u062f\u0646\u06cc\u0627 \u0645\u06cc\u0627\u06cc\u0646\u062f \u0648 \u0627\u0632 \u0644\u062d\u0627\u0638 \u062d\u06cc\u062b\u06cc\u062a \u0648 \u062d\u0642\u0648\u0642 \u0628\u0627 \u0647\u0645 \u0628\u0631\u0627\u0628\u0631\u0646\u062f. \u0647\u0645\u0647 \u062f\u0627\u0631\u0627\u06cc \u0639\u0642\u0644 \u0648 \u0648\u062c\u062f\u0627\u0646 \u0645\u06cc\u0628\u0627\u0634\u0646\u062f \u0648 \u0628\u0627\u06cc\u062f \u0646\u0633\u0628\u062a \u0628\u06cc\u06a9\u062f\u06cc\u06af\u0631 \u0628\u0627 \u0631\u0648\u062d \u0628\u0631\u0627\u062f\u0631\u06cc \u0631\u0641\u062a\u0627\u0631 \u06a9\u0646\u0646\u062f.", + "text": "Farsi, Western تمام افراد بشر آزاد بدنیا میایند و از لحاظ حیثیت و حقوق با هم برابرند. همه دارای عقل و وجدان میباشند و باید نسبت بیکدیگر با روح برادری رفتار کنند.", "metadata": { "languages": [ "fas" @@ -3198,7 +3198,7 @@ { "type": "NarrativeText", "element_id": "b70785870cc673f7dcbb24c8464d43fc", - "text": "Finnish Kaikki ihmiset syntyv\u00e4t vapaina ja tasavertaisina arvoltaan ja oikeuksiltaan. 
Heille on annettu j\u00e4rki ja omatunto, ja heid\u00e4n on toimittava toisiaan kohtaan veljeyden hengess\u00e4.", + "text": "Finnish Kaikki ihmiset syntyvät vapaina ja tasavertaisina arvoltaan ja oikeuksiltaan. Heille on annettu järki ja omatunto, ja heidän on toimittava toisiaan kohtaan veljeyden hengessä.", "metadata": { "languages": [ "fin" @@ -3219,7 +3219,7 @@ { "type": "NarrativeText", "element_id": "ecc193afbaf5bf317c868860f5dfc5ec", - "text": "Finnish, Kven Kaikki ihmiset synnyth\u00e4\u00e4n vaphaina, ja heil\u00e4 kaikila oon sama ihmisarvo ja samat ihmisoikkeuet. Het oon saanheet j\u00e4rjen ja omatunnon, ja het pieth\u00e4\u00e4n ell\u00e4\u00e4t toinen toisen kans niin ko veljet keskenh\u00e4\u00e4n.", + "text": "Finnish, Kven Kaikki ihmiset synnythään vaphaina, ja heilä kaikila oon sama ihmisarvo ja samat ihmisoikkeuet. Het oon saanheet järjen ja omatunnon, ja het piethään elläät toinen toisen kans niin ko veljet keskenhään.", "metadata": { "languages": [ "fin" @@ -3240,7 +3240,7 @@ { "type": "NarrativeText", "element_id": "e2a252e076d508cd7e312c25eaf70331", - "text": "Fon Ac\u025b, susu kpo sisi \u0256okpo \u0254 kpo w\u025b gb\u025bt\u0254 bi \u0256o \u0256\u00f2 gb\u025bwiwa t\u0254n hwenu; ye \u0256o linkp\u0254n b\u0254 ayi yet\u0254n m\u025b kpe lo b\u0254 ye \u0256o na do al\u0254 ye\u0256ee \u0256i n\u0254vin\u0254vi \u0256\u0254hun.", + "text": "Fon Acɛ, susu kpo sisi ɖokpo ɔ kpo wɛ gbɛtɔ bi ɖo ɖò gbɛwiwa tɔn hwenu; ye ɖo linkpɔn bɔ ayi yetɔn mɛ kpe lo bɔ ye ɖo na do alɔ yeɖee ɖi nɔvinɔvi ɖɔhun.", "metadata": { "languages": [ "swa", @@ -3262,7 +3262,7 @@ { "type": "NarrativeText", "element_id": "d26195c0225bad321fc98f526b1fb27b", - "text": "French Tous les \u00eatres humains naissent libres et \u00e9gaux en dignit\u00e9 et en droits. 
Ils sont dou\u00e9s de raison et de conscience et doivent agir les uns envers les autres dans un esprit de fraternit\u00e9.", + "text": "French Tous les êtres humains naissent libres et égaux en dignité et en droits. Ils sont doués de raison et de conscience et doivent agir les uns envers les autres dans un esprit de fraternité.", "metadata": { "languages": [ "fra" @@ -3283,7 +3283,7 @@ { "type": "NarrativeText", "element_id": "f5ce0eb3d199445ab33436a396fca8cb", - "text": "Frisian, Western Alle minsken wurde frij en gelyk yn weardigens en rjochten berne. Hja hawwe ferst\u00e2n en gewisse meikrigen en hearre har foar inoar oer yn in geast fan bruorskip te h\u00e2lden en te dragen.", + "text": "Frisian, Western Alle minsken wurde frij en gelyk yn weardigens en rjochten berne. Hja hawwe ferstân en gewisse meikrigen en hearre har foar inoar oer yn in geast fan bruorskip te hâlden en te dragen.", "metadata": { "languages": [ "nld", @@ -3305,7 +3305,7 @@ { "type": "NarrativeText", "element_id": "0da991393fa9f40d78c4143c3a25b02a", - "text": "Friulian Ducj i oms a nassin libars e compagns come dignit\u00e2t e derits. A an sintiment e cussience e bisugne che si tratin un culaltri come fradis.", + "text": "Friulian Ducj i oms a nassin libars e compagns come dignitât e derits. A an sintiment e cussience e bisugne che si tratin un culaltri come fradis.", "metadata": { "languages": [ "ita" @@ -3326,7 +3326,7 @@ { "type": "NarrativeText", "element_id": "216db5a1011f211d9206a47a9e0e4839", - "text": "Fulfulde, Nigerian Innama aadeeji fof poti, ndim\u0257idi e jibinannde to bannge hakkeeji. E\u0253e ngoodi miijo e hakkilantaagal ete e\u0253e poti huufo ndirde e nder \u0253 iynguyummaagu.", + "text": "Fulfulde, Nigerian Innama aadeeji fof poti, ndimɗidi e jibinannde to bannge hakkeeji. 
Eɓe ngoodi miijo e hakkilantaagal ete eɓe poti huufo ndirde e nder ɓ iynguyummaagu.", "metadata": { "languages": [ "est", @@ -3348,7 +3348,7 @@ { "type": "NarrativeText", "element_id": "d245ad5ed3e4ee8727b8152745ffdba6", - "text": "Fulfulde, Nigerian (2) \u0181i-aadama fuu dimo danyete/jibinte o fotan be koomoye e ne\u0257\u0257aaku be hakkeeji. \u0253e ndokkaa\u0253e hakkiilo ngaandi nden bo \u0253e kuutindiray hakkunde ma\u0253\u0253e nder yi\u0257yi\u0257\u0257irki mbandiraagu.", + "text": "Fulfulde, Nigerian (2) Ɓi-aadama fuu dimo danyete/jibinte o fotan be koomoye e neɗɗaaku be hakkeeji. ɓe ndokkaaɓe hakkiilo ngaandi nden bo ɓe kuutindiray hakkunde maɓɓe nder yiɗyiɗɗirki mbandiraagu.", "metadata": { "languages": [ "som", @@ -3371,7 +3371,7 @@ { "type": "NarrativeText", "element_id": "71e526a7453aa9c044c6f695d1fe4c78", - "text": "Fur kwa-s\u00ed ny\u00e9tti\u014b baajt\u00f3l\u00e1 kereli n\u00e1s nisila na ta\u0331g\u0268d\u0268\u014b arr\u00e1 ka\u0331\u0268\u014b, Na\u014b-s\u00ed ugola na kilma\u014b\u00e1 arr\u00e1 ka\u0331\u0268\u014b nam\u00e1 in l\u00f3\u014b \u00e1l\u00e1\u014b s\u01d4r\u014b\u00e2-s\u00ed k\u00ed jai\u014ba in k\u00e9\u00e9l n\u00e1 s\u01d4r\u014b\u00e2 suur\ua78c\u00ed\u014b b\u00e2r\u014ba.", + "text": "Fur kwa-sí nyéttiŋ baajtólá kereli nás nisila na ta̱gɨdɨŋ arrá ka̱ɨŋ, Naŋ-sí ugola na kilmaŋá arrá ka̱ɨŋ namá in lóŋ áláŋ sǔrŋâ-sí kí jaiŋa in kéél ná sǔrŋâ suurꞌíŋ bârŋa.", "metadata": { "languages": [ "hun" @@ -3392,7 +3392,7 @@ { "type": "NarrativeText", "element_id": "dfd804850bd4d6daab5db7227283c3ab", - "text": "Ga Af\u0254 gb\u0254m\u0254 f\u025b\u025b gb\u0254m\u0254 y\u025b agbojee mli, k\u025b hegb\u025b ko ni dam\u0254 \u014b\u025bl\u025b koome n\u0254. Gb\u0254m\u025bi f\u025b\u025b y\u025b jw\u025b\u014bm\u0254 k\u025b henilee, ni no hew\u0254 l\u025b esa ak\u025b am\u025bhe ahi shi y\u025b ny\u025bmi su\u0254m\u0254 mli.", + "text": "Ga Afɔ gbɔmɔ fɛɛ gbɔmɔ yɛ agbojee mli, kɛ hegbɛ ko ni damɔ ŋɛlɛ koome nɔ. 
Gbɔmɛi fɛɛ yɛ jwɛŋmɔ kɛ henilee, ni no hewɔ lɛ esa akɛ amɛhe ahi shi yɛ nyɛmi suɔmɔ mli.", "metadata": { "languages": [ "swa", @@ -3414,7 +3414,7 @@ { "type": "NarrativeText", "element_id": "38140682ca9cf0b5c7f1cf203b331589", - "text": "Gaelic, Irish Saol\u00e1itear na daoine uile saor agus comhionann ina nd\u00ednit agus ina gcearta. T\u00e1 bauidh an r\u00e9as\u00fain agus an choinsiasa acu agus dl\u00edd iad f\u00e9in d'iompar de mheon bhrthreachais i leith a ch\u00e9ile.", + "text": "Gaelic, Irish Saoláitear na daoine uile saor agus comhionann ina ndínit agus ina gcearta. Tá bauidh an réasúin agus an choinsiasa acu agus dlíd iad féin d'iompar de mheon bhrthreachais i leith a chéile.", "metadata": { "languages": [ "eng", @@ -3436,7 +3436,7 @@ { "type": "NarrativeText", "element_id": "c74c5c12c1d20c63c0512bda5ec488ee", - "text": "Gaelic, Scottish Tha gach uile dhuine air a bhreth saor agus co-ionnan ann an urram 's ann an c\u00f2irichean. Tha iad air am breth le reusan is le cogais agus mar sin bu ch\u00f2ir dhaibh a bhith be\u00f2 nam measg fhein ann an spiorad br\u00e0thaireil,", + "text": "Gaelic, Scottish Tha gach uile dhuine air a bhreth saor agus co-ionnan ann an urram 's ann an còirichean. Tha iad air am breth le reusan is le cogais agus mar sin bu chòir dhaibh a bhith beò nam measg fhein ann an spiorad bràthaireil,", "metadata": { "languages": [ "eng" @@ -3457,7 +3457,7 @@ { "type": "NarrativeText", "element_id": "adb7eafcda17469d6dffe53ac281b9e7", - "text": "Gagauz Insannar hepsi duu\u00earlar serbest hem birtak\u0131m kendi k\u0131ymetind\u00e4 hem haklar\u0131nda. Onnara verilmi\u015f ak\u0131l hem \u00fcz da l\u00e4az\u0131m biri-birin\u00e4 davrans\u0131nnar karda\u015fl\u0131k ruhuna uygun.", + "text": "Gagauz Insannar hepsi duuêrlar serbest hem birtakım kendi kıymetindä hem haklarında. 
Onnara verilmiş akıl hem üz da läazım biri-birinä davransınnar kardaşlık ruhuna uygun.", "metadata": { "languages": [ "tur" @@ -3478,7 +3478,7 @@ { "type": "NarrativeText", "element_id": "d838922d035c343059a70e88f83100af", - "text": "Galician T\u00f3dolos seres humanos nacen libres e iguais en dignidade e dereitos e, dotados como est\u00e1n de raz\u00f3n e conciencia, d\u00edbense comportar fraternalmente uns cos outros.", + "text": "Galician Tódolos seres humanos nacen libres e iguais en dignidade e dereitos e, dotados como están de razón e conciencia, díbense comportar fraternalmente uns cos outros.", "metadata": { "languages": [ "spa" @@ -3522,7 +3522,7 @@ { "type": "UncategorizedText", "element_id": "ec7ace2c582cd24ef64d447f5e1e7a08", - "text": "Garifuna Sun g\u00fcrigia nas\u00edruati yuti lun, lidan \u00faarani, lawiwandun\u00ed lib\u00e1gari kai le aubai lab\u00fasienra, gatu gi\u00f1e lanagun lungua buidu hadan l\u00edbegu.", + "text": "Garifuna Sun gürigia nasíruati yuti lun, lidan úarani, lawiwanduní libágari kai le aubai labúsienra, gatu giñe lanagun lungua buidu hadan líbegu.", "metadata": { "languages": [ "ind" @@ -3543,7 +3543,7 @@ { "type": "NarrativeText", "element_id": "3db8c991f134adb8e84617cd84e56d43", - "text": "Gen Agbet\u0254wo kpata le jijim\u025ba, \u0256o vosin\u0254n\u0254, nyi gb\u00e8s\u0254\u025b\u0301m\u025b\u0301w\u00f3 le nuj\u0254nunnyi ku go\u0256oejisewo, am\u025bbusewo m\u025b. Tagb\u0254 le woa si, eye w\u0254nawo s\u0254doda woan\u0254n\u0254wo gb\u0254a la nyi n\u0254\u0301visil\u00e9l\u00e9.", + "text": "Gen Agbetɔwo kpata le jijimɛa, ɖo vosinɔnɔ, nyi gbèsɔɛ́mɛ́wó le nujɔnunnyi ku goɖoejisewo, amɛbusewo mɛ. 
Tagbɔ le woa si, eye wɔnawo sɔdoda woanɔnɔwo gbɔa la nyi nɔ́visilélé.", "metadata": { "languages": [ "swa", @@ -3565,7 +3565,7 @@ { "type": "NarrativeText", "element_id": "cb7127a24ce99f60f18c47121fcbe3cb", - "text": "Georgian \u10e7\u10dd\u10d5\u10d4\u10da\u10d8 \u10d0\u10d3\u10d0\u10db\u10d8\u10d0\u10dc\u10d8 \u10d8\u10d1\u10d0\u10d3\u10d4\u10d1\u10d0 \u10d7\u10d0\u10d5\u10d8\u10e1\u10e3\u10e4\u10d0\u10da\u10d8 \u10d3\u10d0 \u10d7\u10d0\u10dc\u10d0\u10e1\u10ec\u10dd\u10e0\u10d8 \u10d7\u10d0\u10d5\u10d8\u10e1\u10d8 \u10e6\u10d8\u10e0\u10e1\u10d4\u10d1\u10d8\u10d7\u10d0 \u10d3\u10d0 \u10e3\u10e4\u10da\u10d4\u10d1\u10d4\u10d1\u10d8\u10d7. \u10db\u10d0\u10d7 \u10db\u10d8\u10dc\u10d8\u10ed\u10d4\u10d1\u10e3\u10da\u10d8 \u10d0\u10e5\u10d5\u10d7 \u10d2\u10dd\u10dc\u10d4\u10d1\u10d0 \u10d3\u10d0 \u10e1\u10d8\u10dc\u10d3\u10d8\u10e1\u10d8 \u10d3\u10d0 \u10d4\u10e0\u10d7\u10db\u10d0\u10dc\u10d4\u10d7\u10d8\u10e1 \u10db\u10d8\u10db\u10d0\u10e0\u10d7 \u10e3\u10dc\u10d3\u10d0 \u10d4\u10e5\u10ea\u10d4\u10dd\u10d3\u10dc\u10d4\u10dc \u10eb\u10db\u10dd\u10d1\u10d8\u10e1 \u10e1\u10e3\u10da\u10d8\u10e1\u10d9\u10d5\u10d4\u10d7\u10d4\u10d1\u10d8\u10d7.", + "text": "Georgian ყოველი ადამიანი იბადება თავისუფალი და თანასწორი თავისი ღირსებითა და უფლებებით. მათ მინიჭებული აქვთ გონება და სინდისი და ერთმანეთის მიმართ უნდა ექცეოდნენ ძმობის სულისკვეთებით.", "metadata": { "languages": [ "est" @@ -3586,7 +3586,7 @@ { "type": "NarrativeText", "element_id": "60e95060440c3ac89b53764c839a9658", - "text": "German, Standard (1901) Alle Menschen sind frei und gleich an W\u00fcrde und Rechten geboren. Sie sind mit Vernunft und Gewissen begabt und sollen einander im Geist der Br\u00fcderlichkeit begegnen.", + "text": "German, Standard (1901) Alle Menschen sind frei und gleich an Würde und Rechten geboren. 
Sie sind mit Vernunft und Gewissen begabt und sollen einander im Geist der Brüderlichkeit begegnen.", "metadata": { "languages": [ "deu" @@ -3607,7 +3607,7 @@ { "type": "NarrativeText", "element_id": "d9454188531f323f4587d2668a35dce4", - "text": "German, Standard (1996) Alle Menschen sind frei und gleich an W\u00fcrde und Rechten geboren. Sie sind mit Vernunft und Gewissen begabt und sollen einander im Geist der Br\u00fcderlichkeit begegnen.", + "text": "German, Standard (1996) Alle Menschen sind frei und gleich an Würde und Rechten geboren. Sie sind mit Vernunft und Gewissen begabt und sollen einander im Geist der Brüderlichkeit begegnen.", "metadata": { "languages": [ "deu" @@ -3628,7 +3628,7 @@ { "type": "Title", "element_id": "82bf90db0534cabdc2efe2971f9bb4c6", - "text": "Gilyak \u0421\u0438\u043a \u043d\u0438\u0432\u0433\u0443\u043d \u043a\u0443\u0493\u044b\u0442\u04fb\u0430\u0440\u0442\u0430, \u043f\u02bc\u0438\u043d\u0430\u043c\u0430\u0434 \u044f\u0439\u043c\u0442\u0430 \u0430\u0434\u044f\u0439 \u043f\u0440\u0430\u0432\u043e\u0493\u0438\u0440\u030c \u043f\u02bc\u04ca\u0430\u0444\u049b-\u04ca\u0430\u0444\u049b\u0493\u0438\u0440\u030c \u0441\u0430\u043b\u04fb\u0430\u0442\u0430 \u04ff\u0430\u0442 \u043f\u0430\u043d\u0442\u0430\u0434\u0493\u0443\u043d.", + "text": "Gilyak Сик нивгун куғытӻарта, пʼинамад яймта адяй правоғир̌ пʼӊафқ-ӊафқғир̌ салӻата ӿат пантадғун.", "metadata": { "languages": [ "bul", @@ -3650,7 +3650,7 @@ { "type": "NarrativeText", "element_id": "d61fdd2d22e77149dff43d70d62d722f", - "text": "Gonja Bu kurwe dimedi kik\u025b mobe kumu so, n\u025b mobe, eyilikpa, keshe\u014b n\u025b kashinte\u014b ma\u014b k\u0254r eko pey\u025b to. Nyinpela sa dimedi kik\u025b lakal n\u025b mf\u025bra fan\u025b bu chena abarso kelepo so.", + "text": "Gonja Bu kurwe dimedi kikɛ mobe kumu so, nɛ mobe, eyilikpa, kesheŋ nɛ kashinteŋ maŋ kɔr eko peyɛ to. 
Nyinpela sa dimedi kikɛ lakal nɛ mfɛra fanɛ bu chena abarso kelepo so.", "metadata": { "languages": [ "swa", @@ -3673,7 +3673,7 @@ { "type": "NarrativeText", "element_id": "0361867eb371916c85e13fcc3dde7f4b", - "text": "Greek (monotonic) \u038c\u03bb\u03bf\u03b9 \u03bf\u03b9 \u03ac\u03bd\u03b8\u03c1\u03c9\u03c0\u03bf\u03b9 \u03b3\u03b5\u03bd\u03bd\u03b9\u03bf\u03cd\u03bd\u03c4\u03b1\u03b9 \u03b5\u03bb\u03b5\u03cd\u03b8\u03b5\u03c1\u03bf\u03b9 \u03ba\u03b1\u03b9 \u03af\u03c3\u03bf\u03b9 \u03c3\u03c4\u03b7\u03bd \u03b1\u03be\u03b9\u03bf\u03c0\u03c1\u03ad\u03c0\u03b5\u03b9\u03b1 \u03ba\u03b1\u03b9 \u03c4\u03b1 \u03b4\u03b9\u03ba\u03b1\u03b9\u03ce\u03bc\u03b1\u03c4\u03b1. \u0395\u03af\u03bd\u03b1\u03b9 \u03c0\u03c1\u03bf\u03b9\u03ba\u03b9\u03c3\u03bc\u03ad\u03bd\u03bf\u03b9 \u03bc\u03b5 \u03bb\u03bf\u03b3\u03b9\u03ba\u03ae \u03ba\u03b1\u03b9 \u03c3\u03c5\u03bd\u03b5\u03af\u03b4\u03b7\u03c3\u03b7, \u03ba\u03b1\u03b9 \u03bf\u03c6\u03b5\u03af\u03bb\u03bf\u03c5\u03bd \u03bd\u03b1 \u03c3\u03c5\u03bc\u03c0\u03b5\u03c1\u03b9\u03c6\u03ad\u03c1\u03bf\u03bd\u03c4\u03b1\u03b9 \u03bc\u03b5\u03c4\u03b1\u03be\u03cd \u03c4\u03bf\u03c5\u03c2 \u03bc\u03b5 \u03c0\u03bd\u03b5\u03cd\u03bc\u03b1 \u03b1\u03b4\u03b5\u03bb\u03c6\u03bf\u03c3\u03cd\u03bd\u03b7\u03c2.", + "text": "Greek (monotonic) Όλοι οι άνθρωποι γεννιούνται ελεύθεροι και ίσοι στην αξιοπρέπεια και τα δικαιώματα. 
Είναι προικισμένοι με λογική και συνείδηση, και οφείλουν να συμπεριφέρονται μεταξύ τους με πνεύμα αδελφοσύνης.", "metadata": { "languages": [ "ell" @@ -3694,7 +3694,7 @@ { "type": "NarrativeText", "element_id": "ef30df67b6cbf4e05af379e61e529561", - "text": "Greek (polytonic) \u1f4d\u03bb\u03bf\u03b9 \u03bf\u1f31 \u1f04\u03bd\u03b8\u03c1\u03c9\u03c0\u03bf\u03b9 \u03b3\u03b5\u03bd\u03bd\u03b9\u03bf\u1fe6\u03bd\u03c4\u03b1\u03b9 \u1f10\u03bb\u03b5\u1f7b\u03b8\u03b5\u03c1\u03bf\u03b9 \u03ba\u03b1\u1f76 \u1f34\u03c3\u03bf\u03b9 \u03c3\u03c4\u1f74\u03bd \u1f00\u03be\u03b9\u03bf\u03c0\u03c1\u1f73\u03c0\u03b5\u03b9\u03b1 \u03ba\u03b1\u1f76 \u03c4\u1f70 \u03b4\u03b9\u03ba\u03b1\u03b9\u1f7d\u03bc\u03b1\u03c4\u03b1. \u0395\u1f36\u03bd\u03b1\u03b9 \u03c0\u03c1\u03bf\u03b9\u03ba\u03b9\u03c3\u03bc\u1f73\u03bd\u03bf\u03b9 \u03bc\u1f72 \u03bb\u03bf\u03b3\u03b9\u03ba\u1f74 \u03ba\u03b1\u1f76 \u03c3\u03c5\u03bd\u03b5\u1f77\u03b4\u03b7\u03c3\u03b7, \u03ba\u03b1\u1f76 \u1f40\u03c6\u03b5\u1f77\u03bb\u03bf\u03c5\u03bd \u03bd\u1f70 \u03c3\u03c5\u03bc\u03c0\u03b5\u03c1\u03b9\u03c6\u1f73\u03c1\u03bf\u03bd\u03c4\u03b1\u03b9 \u03bc\u03b5\u03c4\u03b1\u03be\u1f7b \u03c4\u03bf\u03c5\u03c2 \u03bc\u1f72 \u03c0\u03bd\u03b5\u1fe6\u03bc\u03b1 \u1f00\u03b4\u03b5\u03bb\u03c6\u03bf\u03c3\u1f7b\u03bd\u03b7\u03c2.", + "text": "Greek (polytonic) Ὅλοι οἱ ἄνθρωποι γεννιοῦνται ἐλεύθεροι καὶ ἴσοι στὴν ἀξιοπρέπεια καὶ τὰ δικαιώματα. 
Εἶναι προικισμένοι μὲ λογικὴ καὶ συνείδηση, καὶ ὀφείλουν νὰ συμπεριφέρονται μεταξύ τους μὲ πνεῦμα ἀδελφοσύνης.", "metadata": { "languages": [ "ell" @@ -3715,7 +3715,7 @@ { "type": "NarrativeText", "element_id": "a8aaedf9144ce4af4a672873d93945c2", - "text": "Guaran\u00ed, Paraguayan Mayma yvyp\u00f3ra ou ko yvy \u00e1ri i\u00f1apytl\u02bcyre ha ete\u0129cha dignidad ha derecho jeguerek\u00f3pe; ha ikatu rupi oikuaa a\u00f1et\u00e9va ha a\u00f1ete\u02bcyva, ipor\u00e3va ha iva\u00edva, tekotev\u1ebd pehengu\u00e9icha oiko o\u00f1ondiveku\u00e9ra.", + "text": "Guaraní, Paraguayan Mayma yvypóra ou ko yvy ári iñapytlʼyre ha eteĩcha dignidad ha derecho jeguerekópe; ha ikatu rupi oikuaa añetéva ha añeteʼyva, iporãva ha ivaíva, tekotevẽ pehenguéicha oiko oñondivekuéra.", "metadata": { "languages": [ "slk", @@ -3739,7 +3739,7 @@ { "type": "NarrativeText", "element_id": "1a8dccbb2225da58c6c32c944346a88f", - "text": "Guarayu Opakatu ava yoro\u2019a nda\u2019ei tembigwaigwa oyoyatupri, sekotupri, va\u00ebra, imboeteisara, oikatu ipi\u2019a yemo\u00f1eta, imbaekua, ndiyai yurekorairai \u00f1ep\u00ebi p\u00ebi ambua rese.", + "text": "Guarayu Opakatu ava yoro’a nda’ei tembigwaigwa oyoyatupri, sekotupri, vaëra, imboeteisara, oikatu ipi’a yemoñeta, imbaekua, ndiyai yurekorairai ñepëi pëi ambua rese.", "metadata": { "languages": [ "ind", @@ -3761,7 +3761,7 @@ { "type": "NarrativeText", "element_id": "2aff799c80d0ba06e344f3b917c6aa5a", - "text": "Gujarati \u0aaa\u0acd\u0ab0\u0aa4\u0abf\u0ab7\u0acd\u0aa0\u0abe \u0a85\u0aa8\u0ac7 \u0a85\u0aa7\u0abf\u0a95\u0abe\u0ab0\u0acb\u0aa8\u0ac0 \u0aa6\u0ac3\u0ab7\u0acd\u0a9f\u0abf\u0a8f \u0ab8\u0ab0\u0acd\u0ab5 \u0aae\u0abe\u0aa8\u0ab5\u0acb \u0a9c\u0aa8\u0acd\u0aae\u0aa5\u0ac0 \u0ab8\u0acd\u0ab5\u0aa4\u0a82\u0aa4\u0acd\u0ab0 \u0a85\u0aa8\u0ac7 \u0ab8\u0aae\u0abe\u0aa8 \u0ab9\u0acb\u0aaf \u0a9b\u0ac7. 
\u0aa4\u0ac7\u0aae\u0aa8\u0abe\u0aae\u0abe\u0a82 \u0ab5\u0abf\u0a9a\u0abe\u0ab0\u0ab6\u0a95\u0acd\u0aa4\u0abf \u0a85\u0aa8\u0ac7 \u0a85\u0a82\u0aa4\u0a83\u0a95\u0ab0\u0aa3 \u0ab9\u0acb\u0aaf \u0a9b\u0ac7 \u0a85\u0aa8\u0ac7 \u0aa4\u0ac7\u0aae\u0aa3\u0ac7 \u0aaa\u0ab0\u0ab8\u0acd\u0aaa\u0ab0 \u0aac\u0a82\u0aa7\u0ac1\u0aa4\u0acd\u0ab5\u0aa8\u0ac0 \u0aad\u0abe\u0ab5\u0aa8\u0abe\u0aa5\u0ac0 \u0ab5\u0ab0\u0acd\u0aa4\u0ab5\u0ac1\u0a82 \u0a9c\u0acb\u0a87\u0a8f.", + "text": "Gujarati પ્રતિષ્ઠા અને અધિકારોની દૃષ્ટિએ સર્વ માનવો જન્મથી સ્વતંત્ર અને સમાન હોય છે. તેમનામાં વિચારશક્તિ અને અંતઃકરણ હોય છે અને તેમણે પરસ્પર બંધુત્વની ભાવનાથી વર્તવું જોઇએ.", "metadata": { "languages": [ "guj" @@ -3782,7 +3782,7 @@ { "type": "NarrativeText", "element_id": "7c7879f1335e2e8f7c0ca4a80cb6d9fc", - "text": "Gumuz Dub\ua78caga b\ua78caga metaam metaam alamaam kamaanzaak\ua78coma kas\ua78ce bipok\ua78coga kamad\ua78cab maafuc\ua78cak\ua78cwa haaga bac\ua78caga tso. Ka\u0301b\ua78caga jajanda kwa jala etigafalagash ma\ua78ciiya nago metaagwa eyaal yida-eba bic\ua78caga tso.", + "text": "Gumuz Dubꞌaga bꞌaga metaam metaam alamaam kamaanzaakꞌoma kasꞌe bipokꞌoga kamadꞌab maafucꞌakꞌwa haaga bacꞌaga tso. Kábꞌaga jajanda kwa jala etigafalagash maꞌiiya nago metaagwa eyaal yida-eba bicꞌaga tso.", "metadata": { "languages": [ "som" @@ -3803,7 +3803,7 @@ { "type": "NarrativeText", "element_id": "c591dbcd933d69898871c75fc9b2c5b8", - "text": "Haitian Creole French (Kreyol) Tout moun f\u00e8t lib, egal ego pou diyite kou w\u00e8 dwa. Nou gen la rezon ak la konsyans epi nou f\u00e8t pou nou aji youn ak lot ak yon lespri fwat\u00e8nite.", + "text": "Haitian Creole French (Kreyol) Tout moun fèt lib, egal ego pou diyite kou wè dwa. 
Nou gen la rezon ak la konsyans epi nou fèt pou nou aji youn ak lot ak yon lespri fwatènite.", "metadata": { "languages": [ "fra" @@ -3824,7 +3824,7 @@ { "type": "NarrativeText", "element_id": "1caef318c81d61c240de817182b5b56b", - "text": "Haitian Creole French (Popular) Tout moun sou t\u00e8 a f\u00e8t tou lib. Tout gen menm val\u00e8 (nan je lasosyete), tout moun gen menm dwa devan Lalwa. Tout moun f\u00e8t ak yon bonsans, tout f\u00e8t ak yon konsyans epi youn f\u00e8t pou trete l\u00f2t tankou fr\u00e8 ak s\u00e8.", + "text": "Haitian Creole French (Popular) Tout moun sou tè a fèt tou lib. Tout gen menm valè (nan je lasosyete), tout moun gen menm dwa devan Lalwa. Tout moun fèt ak yon bonsans, tout fèt ak yon konsyans epi youn fèt pou trete lòt tankou frè ak sè.", "metadata": { "languages": [ "fra", @@ -3869,7 +3869,7 @@ { "type": "NarrativeText", "element_id": "100bdd3a0bc9a25394f34018b95871fe", - "text": "Hausa Duk \u2018yan\u2019adan ana haihuwarsu ne a matsayin \u2018yantattun \u2018ya\u2019ya, kuma mutuncinsu da haqqoqinsu daidai yake da na kowa. Suna da tunani da cikakken hankali, saboda haka ake son duk mu\u2019amalar da za su yi, ta kasance akwai \u2018yan\u2019uwantaka a tsakani.", + "text": "Hausa Duk ‘yan’adan ana haihuwarsu ne a matsayin ‘yantattun ‘ya’ya, kuma mutuncinsu da haqqoqinsu daidai yake da na kowa. Suna da tunani da cikakken hankali, saboda haka ake son duk mu’amalar da za su yi, ta kasance akwai ‘yan’uwantaka a tsakani.", "metadata": { "languages": [ "ind", @@ -3891,7 +3891,7 @@ { "type": "NarrativeText", "element_id": "19ff46e13339eab9d9fce6566dad6102", - "text": "Hausa (Niger) Su dai \u01b4an\u2010adam, ana haifuwarsu ne duka \u01b4antattu, kuma kowannensu na da mutunci da hakkoki daidai da na kowa. 
Suna da hankali da tunani, saboda haka duk abin da za su aikata wa juna, ya kamata su yi shi a cikin \u01b4an\u2010uwanci.", + "text": "Hausa (Niger) Su dai ƴan‐adam, ana haifuwarsu ne duka ƴantattu, kuma kowannensu na da mutunci da hakkoki daidai da na kowa. Suna da hankali da tunani, saboda haka duk abin da za su aikata wa juna, ya kamata su yi shi a cikin ƴan‐uwanci.", "metadata": { "languages": [ "swa", @@ -3913,7 +3913,7 @@ { "type": "NarrativeText", "element_id": "39fce89f870171ba68c60c4aaaeb5509", - "text": "Hausa (Nigeria) Su dai \u2018yan-adam, ana haifuwarsu ne duka \u2018yantattu, kuma kowannensu na da mutunci da hakkoki daidai da na kowa. Suna da hankali da tunani, saboda haka duk abin da za su aikata wa juna, ya kamata su yi shi a cikin \u2018yan-uwanci.", + "text": "Hausa (Nigeria) Su dai ‘yan-adam, ana haifuwarsu ne duka ‘yantattu, kuma kowannensu na da mutunci da hakkoki daidai da na kowa. Suna da hankali da tunani, saboda haka duk abin da za su aikata wa juna, ya kamata su yi shi a cikin ‘yan-uwanci.", "metadata": { "languages": [ "ind", @@ -3935,7 +3935,7 @@ { "type": "NarrativeText", "element_id": "5a888adab3cc776c69ebb4b588db4bfb", - "text": "Hawaiian H\u0101nau k\u016b\u2019oko\u2019a \u2018ia n\u0101 k\u0101naka apau loa, a ua kau like ka hanohano a me n\u0101 pono k\u012bvila ma luna o k\u0101kou p\u0101kahi. Ua ku\u2019u mai ka no\u2019ono\u2019o pono a me ka \u2018ike pono ma luna o k\u0101kou, no laila, e aloha k\u0101kou kekahi i kekahi.", + "text": "Hawaiian Hānau kū’oko’a ‘ia nā kānaka apau loa, a ua kau like ka hanohano a me nā pono kīvila ma luna o kākou pākahi. 
Ua ku’u mai ka no’ono’o pono a me ka ‘ike pono ma luna o kākou, no laila, e aloha kākou kekahi i kekahi.", "metadata": { "languages": [ "swa", @@ -3957,7 +3957,7 @@ { "type": "NarrativeText", "element_id": "9bce25b61dc4faf00ebf9ae5bedd19aa", - "text": "Hebrew \u05db\u05dc \u05d1\u05e0\u05d9 \u05d0\u05d3\u05dd \u05e0\u05d5\u05dc\u05d3\u05d5 \u05d1\u05e0\u05d9 \u05d7\u05d5\u05e8\u05d9\u05df \u05d5\u05e9\u05d5\u05d5\u05d9\u05dd \u05d1\u05e2\u05e8\u05db\u05dd \u05d5\u05d1\u05d6\u05db\u05d5\u05d9\u05d5\u05ea\u05d9\u05d4\u05dd. \u05db\u05d5\u05dc\u05dd \u05d7\u05d5\u05e0\u05e0\u05d5 \u05d1\u05ea\u05d1\u05d5\u05e0\u05d4 \u05d5\u05d1\u05de\u05e6\u05e4\u05d5\u05df, \u05dc\u05e4\u05d9\u05db\u05da \u05d7\u05d5\u05d1\u05d4 \u05e2\u05dc\u05d9\u05d4\u05dd \u05dc\u05e0\u05d4\u05d5\u05d2 \u05d0\u05d9\u05e9 \u05d1\u05e8\u05e2\u05d4\u05d5 \u05d1\u05e8\u05d5\u05d7 \u05e9\u05dc \u05d0\u05d7\u05d5\u05d4.", + "text": "Hebrew כל בני אדם נולדו בני חורין ושווים בערכם ובזכויותיהם. כולם חוננו בתבונה ובמצפון, לפיכך חובה עליהם לנהוג איש ברעהו ברוח של אחוה.", "metadata": { "languages": [ "heb" @@ -3999,7 +3999,7 @@ { "type": "UncategorizedText", "element_id": "8af5d2f7586f72942fcfc21e4f9f0e7e", - "text": "Hindi \u0938\u092d\u0940 \u092e\u0928\u0941\u0937\u094d\u092f\u094b\u0902 \u0915\u094b \u0917\u094c\u0930\u0935 \u0914\u0930 \u0905\u0927\u093f\u0915\u093e\u0930\u094b\u0902 \u0915\u0947 \u092e\u093e\u092e\u0932\u0947 \u092e\u0947\u0902 \u091c\u0928\u094d\u092e\u091c\u093e\u0924 \u0938\u094d\u0935\u0924\u0928\u094d\u0924\u094d\u0930\u0924\u093e \u0914\u0930 \u0938\u092e\u093e\u0928\u0924\u093e \u092a\u094d\u0930\u093e\u092a\u094d\u0924 \u0939\u0948 \u0964 \u0909\u0928\u094d\u0939\u0947\u0902 \u092c\u0941\u0926\u094d\u0927\u093f \u0914\u0930 \u0905\u0928\u094d\u0924\u0930\u093e\u0924\u094d\u092e\u093e \u0915\u0940 \u0926\u0947\u0928 \u092a\u094d\u0930\u093e\u092a\u094d\u0924 \u0939\u0948 \u0914\u0930 \u092a\u0930\u0938\u094d\u092a\u0930 \u0909\u0928\u094d\u0939\u0947\u0902 
\u092d\u093e\u0908\u091a\u093e\u0930\u0947 \u0915\u0947 \u092d\u093e\u0935 \u0938\u0947 \u092c\u0930\u094d\u0924\u093e\u0935 \u0915\u0930\u0928\u093e \u091a\u093e\u0939\u093f\u090f \u0964", + "text": "Hindi सभी मनुष्यों को गौरव और अधिकारों के मामले में जन्मजात स्वतन्त्रता और समानता प्राप्त है । उन्हें बुद्धि और अन्तरात्मा की देन प्राप्त है और परस्पर उन्हें भाईचारे के भाव से बर्ताव करना चाहिए ।", "metadata": { "languages": [ "hin" @@ -4020,7 +4020,7 @@ { "type": "NarrativeText", "element_id": "b992780a7e7cfec805b61d50bd3cbb25", - "text": "Hindustani, Sarnami Sab djanne aadj\u00e1di aur barabar paidaa bhail\u00e8n, iddjat aur hak m\u00ea. Ohi djanne ke lage sab ke samadj-boedj aur hierdaai hai aur doesare se sab soemmat s\u00e8, djaane-maane ke chaahin.", + "text": "Hindustani, Sarnami Sab djanne aadjádi aur barabar paidaa bhailèn, iddjat aur hak mê. Ohi djanne ke lage sab ke samadj-boedj aur hierdaai hai aur doesare se sab soemmat sè, djaane-maane ke chaahin.", "metadata": { "languages": [ "est", @@ -4110,7 +4110,7 @@ { "type": "NarrativeText", "element_id": "4113619dd86b7bf65f70dd31f3155ce1", - "text": "Huastec (San Lu\u00eds Potos\u00ed) Patal an inik ani an uxum u wa'tsinal walkadh abal junun\u00fal kin bats'uw an alwa'tal\u00e1b ani ka pidhan in \u00e9y jant'ini' in tomn\u00e1l; in kwa'al in tsalp\u00e1dh ani in k'ay\u00e1' abal kin k'anidha' in juntal.", + "text": "Huastec (San Luís Potosí) Patal an inik ani an uxum u wa'tsinal walkadh abal jununúl kin bats'uw an alwa'taláb ani ka pidhan in éy jant'ini' in tomnál; in kwa'al in tsalpádh ani in k'ayá' abal kin k'anidha' in juntal.", "metadata": { "languages": [ "ind", @@ -4133,7 +4133,7 @@ { "type": "NarrativeText", "element_id": "cec56f0f701b47b7615015993ec87eaa", - "text": "Huastec (Sierra de Otontepec) Kuentsal nap wah-ch\u00ednal tee ti chabal jayechek-i antip wah-ch\u00ednal, b\u00e1 tam\u00e1 maxak a pulik maxak in exlal, jununul an\u00ed ni chap an\u00ed jaxtam ko-yal kip le-nax\u00edn an\u00ed ki k-ana ti 
ba.", + "text": "Huastec (Sierra de Otontepec) Kuentsal nap wah-chínal tee ti chabal jayechek-i antip wah-chínal, bá tamá maxak a pulik maxak in exlal, jununul aní ni chap aní jaxtam ko-yal kip le-naxín aní ki k-ana ti ba.", "metadata": { "languages": [ "ind", @@ -4180,7 +4180,7 @@ { "type": "NarrativeText", "element_id": "68c1e44b4d3af66e1c5cddb5a8861a91", - "text": "Huitoto, Murui Nana ca\u0268 comuillamona dama ca\u0268 abido it\u0268ca\u0268. Ca\u0268 comuillamona j\u0268a\u0268m\u0268e anamo i\u00f1ed\u0268ca\u0268. Nana daje facaiconi it\u0268ca\u0268. Ab\u0268 ui\u00f1uanona comuid\u0268ca\u0268. Dan\u0268 conin\u0268rie ca\u0268 nabairilla.", + "text": "Huitoto, Murui Nana caɨ comuillamona dama caɨ abido itɨcaɨ. Caɨ comuillamona jɨaɨmɨe anamo iñedɨcaɨ. Nana daje facaiconi itɨcaɨ. Abɨ uiñuanona comuidɨcaɨ. Danɨ coninɨrie caɨ nabairilla.", "metadata": { "languages": [ "ita", @@ -4202,7 +4202,7 @@ { "type": "NarrativeText", "element_id": "35c2ba2ee3067a7d3d5509a2f11f8123", - "text": "Hungarian Minden. emberi l\u00e9ny szabadon sz\u00fcletik \u00e9s egyenl\u0151 m\u00e9lt\u00f3s\u00e1ga \u00e9s joga van. Az emberek, \u00e9sszel \u00e9s lelkiismerettel b\u00edrv\u00e1n, egym\u00e1ssal szemben testv\u00e9ri szellemben kell hogy viseltessenek.", + "text": "Hungarian Minden. emberi lény szabadon születik és egyenlő méltósága és joga van. Az emberek, ésszel és lelkiismerettel bírván, egymással szemben testvéri szellemben kell hogy viseltessenek.", "metadata": { "languages": [ "hun" @@ -4246,7 +4246,7 @@ { "type": "NarrativeText", "element_id": "d1120c74094e3c70d2191f6d40987753", - "text": "Icelandic Hver ma\u00f0ur er borinn frj\u00e1ls og jafn \u00f6\u00f0rum a\u00f0 vir\u00f0ingu og r\u00e9ttindum. Menn eru g\u00e6ddir vitsmunum og samvizku, og ber \u00feeim a\u00f0 breyta br\u00f3\u00f0urlega hverjum vi\u00f0 annan.", + "text": "Icelandic Hver maður er borinn frjáls og jafn öðrum að virðingu og réttindum. 
Menn eru gæddir vitsmunum og samvizku, og ber þeim að breyta bróðurlega hverjum við annan.", "metadata": { "languages": [ "nor" @@ -4289,7 +4289,7 @@ { "type": "NarrativeText", "element_id": "c061731c2409f1d04154bcb99040df32", - "text": "Idoma \u0118g\u0119 ni modudu ac\u0119 k\u0119c\u0119 nya b\u0119c\u0119 \u0119hehi aa ,hibi \u0119g\u037b ma ac\u0119 duu jonjil\u0119 ipu koc\u0119gba n\u037bc\u0119 c\u0119gba m\u0119ml\u2019ojonjil\u0119 ipu \u037bdah ni yab\u037b \u037bc\u0119 nya. Odudu ac\u0119 kwu \u0452wule ml\u2019ohili otu m\u0119ml\u2019ocai k\u0119la j\u037bc\u0119 \u037bha ni yipu \u037btu \u037bc\u0119 aa, higb\u037b ma \u037bc\u0119 higbo y\u037bda m\u0119ml\u2019 \u037bmpa gunu l\u0119 b\u037bin\u0119 nu ma.", + "text": "Idoma Ęgę ni modudu acę kęcę nya bęcę ęhehi aa ,hibi ęgͻ ma acę duu jonjilę ipu kocęgba nͻcę cęgba męml’ojonjilę ipu ͻdah ni yabͻ ͻcę nya. Odudu acę kwu ђwule ml’ohili otu męml’ocai kęla jͻcę ͻha ni yipu ͻtu ͻcę aa, higbͻ ma ͻcę higbo yͻda męml’ ͻmpa gunu lę bͻinę nu ma.", "metadata": { "languages": [ "swa" @@ -4310,7 +4310,7 @@ { "type": "NarrativeText", "element_id": "c3dc3590b2338d3585c67664e25eb878", - "text": "Igbo A m\u1ee5r\u1ee5 mmad\u1ee5 nile n'ohere nakwa nha anya ugwu na ikike. E nyere ha uche na mm\u1ee5\u1ecd ime ihe ziri ezi nke na ha kwesiri \u1ecbkpaso ibe ha agwa n'obi nwanne na nwanne.", + "text": "Igbo A mụrụ mmadụ nile n'ohere nakwa nha anya ugwu na ikike. E nyere ha uche na mmụọ ime ihe ziri ezi nke na ha kwesiri ịkpaso ibe ha agwa n'obi nwanne na nwanne.", "metadata": { "languages": [ "swa" @@ -4331,7 +4331,7 @@ { "type": "NarrativeText", "element_id": "050a0685e37c5cdf1484af7fb81846c0", - "text": "Ijo, Southeast Kim\u2019 owoumo se, keni bara ki na, pa zimi, ose keni bara kemi. Kim\u2019se ye iroro, mani ikiou nana, enini kim\u2019se dudu tari teme nana weri iyenri.", + "text": "Ijo, Southeast Kim’ owoumo se, keni bara ki na, pa zimi, ose keni bara kemi. 
Kim’se ye iroro, mani ikiou nana, enini kim’se dudu tari teme nana weri iyenri.", "metadata": { "languages": [ "swa", @@ -4419,7 +4419,7 @@ { "type": "NarrativeText", "element_id": "c08152bc9c1cbc1930714b7051e6100a", - "text": "Inuktitut, Eastern Canadian \u1403\u14c5\u152a\u14d5\u14ab\u1466 \u140a\u14c2\u1585\u144e\u1546\u152a\u14d5\u14ab\u1466 \u1403\u14c5\u14da\u1405\u1550\u14aa\u1455 \u1403\u14f1\u14aa\u1550\u14f1\u1550\u15a2\u144e\u1483 \u140a\u14bb\u14aa\u14d7 \u140a\u153e\u1528\u1405\u1583\u144e\u148c\u1483\u15a2\u144e\u1483 \u14c2\u1550\u14f1\u140a\u1591\u14c2\u1483\u146f\u1466 \u140a\u14bb\u14aa\u14d7 \u1431\u152a\u14d0\u14c7\u1403\u144e\u144e\u148d\u1466. \u1403\u14f1\u1583\u1585\u1450\u1581\u144e\u1583\u1550\u144e\u1455\u1405\u1559\u14d5\u1550\u1433\u1466 \u1431\u153e\u152a\u144e\u1583\u1550\u14c2\u1483\u146f\u1466 \u1583\u1455\u1673\u144e\u148c\u1466\u144e\u140a\u1546\u140a\u1583\u1550\u14c2\u1483\u146f\u14ea\u14d7.", + "text": "Inuktitut, Eastern Canadian ᐃᓅᔪᓕᒫᑦ ᐊᓂᖅᑎᕆᔪᓕᒫᑦ ᐃᓅᓚᐅᕐᒪᑕ ᐃᓱᒪᕐᓱᕐᖢᑎᒃ ᐊᒻᒪᓗ ᐊᔾᔨᐅᖃᑎᒌᒃᖢᑎᒃ ᓂᕐᓱᐊᖑᓂᒃᑯᑦ ᐊᒻᒪᓗ ᐱᔪᓐᓇᐃᑎᑎᒍᑦ. ᐃᓱᖃᖅᑐᖁᑎᖃᕐᑎᑕᐅᕙᓕᕐᐳᑦ ᐱᔾᔪᑎᖃᕐᓂᒃᑯᑦ ᖃᑕᙳᑎᒌᑦᑎᐊᕆᐊᖃᕐᓂᒃᑯᓪᓗ.", "metadata": { "filetype": "text/plain", "data_source": { @@ -4458,7 +4458,7 @@ { "type": "NarrativeText", "element_id": "6e8030f949832ac1e4d5632bc1a06b48", - "text": "Italian Tutti gli esseri umani nascono liberi ed eguali in dignit\u00e0 e diritti. Essi sono dotati di ragione e di coscienza e devono agire gli uni verso gli altri in spirito di fratellanza.", + "text": "Italian Tutti gli esseri umani nascono liberi ed eguali in dignità e diritti. 
Essi sono dotati di ragione e di coscienza e devono agire gli uni verso gli altri in spirito di fratellanza.", "metadata": { "languages": [ "ita" @@ -4500,7 +4500,7 @@ { "type": "Title", "element_id": "57bbff46bb89b26b933206afe0fd8904", - "text": "\u3059\u3079\u3066\u306e\u4eba\u9593\u306f\u3001\u751f\u307e\u308c\u306a\u304c\u3089\u306b\u3057\u3066\u81ea\u7531\u3067\u3042\u308a\u3001\u304b\u3064\u3001\u5c0a\u53b3\u3068\u6a29\u5229\u3068\u306b\u3064\u3044\u3066\u5e73\u7b49\u3067\u3042\u308b\u3002\u4eba\u9593\u306f\u3001\u7406\u6027\u3068\u826f\u5fc3\u3068\u3092\u6388\u3051\u3089\u308c\u3066\u304a\u308a\u3001\u4e92\u3044\u306b\u540c\u80de\u306e\u7cbe\u795e\u3092\u3082\u3063\u3066\u884c\u52d5\u3057\u306a\u3051\u308c\u3070\u306a\u3089\u306a\u3044\u3002", + "text": "すべての人間は、生まれながらにして自由であり、かつ、尊厳と権利とについて平等である。人間は、理性と良心とを授けられており、互いに同胞の精神をもって行動しなければならない。", "metadata": { "languages": [ "jpn" @@ -4542,7 +4542,7 @@ { "type": "Title", "element_id": "11becf872133958b85928710255eb2cc", - "text": "\u3059\u3079\u3066\u306e\u4eba\u9593\u306f\u3001\u751f\u307e\u308c\u306a\u304c\u3089\u306b\u3057\u3066\u81ea\u7531\u3084\u3057\u3001\u304b\u3064\u3001\u5c0a\u53b3\u3068\u6a29\u5229\u3068\u306b\u3064\u3044\u3066\u5e73\u7b49\u3084\u3002\u4eba\u9593\u306f\u3001\u7406\u6027\u3068\u826f\u5fc3\u3068\u3092\u6388\u3051\u3089\u308c\u3066\u304a\u308a\u3001\u4e92\u3044\u306b\u540c\u80de\u306e\u7cbe\u795e\u3092\u3082\u3063\u3066\u884c\u52d5\u3057\u306a\u3002", + "text": "すべての人間は、生まれながらにして自由やし、かつ、尊厳と権利とについて平等や。人間は、理性と良心とを授けられており、互いに同胞の精神をもって行動しな。", "metadata": { "languages": [ "jpn" @@ -4584,7 +4584,7 @@ { "type": "Title", "element_id": "491550640c5496ae9b9e41b4c6cc14f0", - "text": "\u5168\u90e8\u306e\u4eba\u9593\u306f\u3001\u751f\u307e\u308c\u306a\u304c\u3089\u306b\u3057\u3066\u81ea\u7531\u3067\u3042\u308a\u3001\u304b\u3064\u3001\u5c0a\u53b3\u3068\u6a29\u5229\u3068 
\u306b\u3064\u3044\u3066\u5e73\u7b49\u3067\u3042\u308b\u3002\u4eba\u9593\u306f\u3001\u7406\u6027\u3068\u826f\u5fc3\u3068\u3092\u6388\u3051\u3089\u308c\u3066\u304a\u308a\u3001\u4e92\u3044\u306b\u540c \u80de\u306e\u7cbe\u795e\u3092\u3082\u3063\u3066\u884c\u52d5\u3057\u306a\u3051\u308c\u3070\u306a\u3089\u306a\u3044\u3002", + "text": "全部の人間は、生まれながらにして自由であり、かつ、尊厳と権利と について平等である。人間は、理性と良心とを授けられており、互いに同 胞の精神をもって行動しなければならない。", "metadata": { "languages": [ "jpn" @@ -4626,7 +4626,7 @@ { "type": "Title", "element_id": "36abfab21253834165ada6ce4b89b5e6", - "text": "\ua9cb\ua9b1\ua9a7\ua9bc\ua9a4\ua9c0\ua9b2\ua9b8\ua9ae\ua9ba\ua9b4\ua981\ua98f\ua9ad\ua9b2\ua9b6\ua982\ua9ab\ua98f\ua9ba\ua98f\ua9a4\ua9c0\ua99b\ua9b6\ua9a9\ua982\ua9a2\ua9b6\ua98f\ua9ad\ua9a4\ua9c0\ua9a2\ua982\ua9a7\ua9ba\ua9a9\ua982\ua9a0\ua9a7\ua9a0\ua9c0\ua9ad\ua9a4\ua9c0\ua9b2\ua98f\ua9c0\ua9b2\ua98f\ua9c0\ua98f\ua981\ua9a5\ua99d\ua9c9\u200b\ua98f\ua9a7\ua9ba\ua983\ua9a5\ua9b6\ua9a4\ua9ab\ua9b6\ua981\ua994\ua9a4\ua9c0\ua9b2\ua98f\ua9ad\ua9c0\ua9ad\ua9a4\ua9c0\ua98f\ua9ad\ua9c0\ua9a7\ua9b8\ua9b1\ua982\ua9a0\ua98f\ua9b2\ua997\ua9a7\ua9c0\ua9a5\ua9b1\ua9bf\ua9ae\ua9b8\ua981\ua994\ua9a4\ua9c0\ua9b2\ua981\ua992\ua9ba\ua9b4\ua9a4\ua9c0\ua9a4\ua9ba\ua9a9\ua9bc\ua9a9\ua9b6\ua9a0\ua9bf\ua9a4\ua9c0\ua9b1\ua9b6\ua997\ua9b6\ua9ad\ua9a4\ua9c0\ua9b1\ua9b6\ua997\ua9b6\ua9a4\ua9ba\ua98f\ua9a4\ua9c0\ua99b\ua9b6\ua997\ua9b6\ua9ae\ua9ba\ua9b4\ua9b1\ua9b8\ua9a9\ua9a2\ua9b8\ua9ad\ua9b8\ua982\ua9c9\u200b", + "text": "꧋ꦱꦧꦼꦤ꧀ꦲꦸꦮꦺꦴꦁꦏꦭꦲꦶꦂꦫꦏꦺꦏꦤ꧀ꦛꦶꦩꦂꦢꦶꦏꦭꦤ꧀ꦢꦂꦧꦺꦩꦂꦠꦧꦠ꧀ꦭꦤ꧀ꦲꦏ꧀ꦲꦏ꧀ꦏꦁꦥꦝ꧉​ꦏꦧꦺꦃꦥꦶꦤꦫꦶꦁꦔꦤ꧀ꦲꦏꦭ꧀ꦭꦤ꧀ꦏꦭ꧀ꦧꦸꦱꦂꦠꦏꦲꦗꦧ꧀ꦥꦱꦿꦮꦸꦁꦔꦤ꧀ꦲꦁꦒꦺꦴꦤ꧀ꦤꦺꦩꦼꦩꦶꦠꦿꦤ꧀ꦱꦶꦗꦶꦭꦤ꧀ꦱꦶꦗꦶꦤꦺꦏꦤ꧀ꦛꦶꦗꦶꦮꦺꦴꦱꦸꦩꦢꦸꦭꦸꦂ꧉​", "metadata": { "filetype": "text/plain", "data_source": { @@ -4665,7 +4665,7 @@ { "type": "NarrativeText", "element_id": "6b54f0a53f2c7bb4545835a761d4654b", - "text": "Jola-Fonyi Bukanak b\u00farom nan kuwolimi kurere kererer di waafaw b\u00farom. Kubabaj poop b\u00fayejet di karampenoor.", + "text": "Jola-Fonyi Bukanak búrom nan kuwolimi kurere kererer di waafaw búrom. 
Kubabaj poop búyejet di karampenoor.", "metadata": { "languages": [ "ind", @@ -4687,7 +4687,7 @@ { "type": "NarrativeText", "element_id": "b2c33dfdb2855a8786e1145a6dbbedc2", - "text": "Jula W\u00f3lo\u2019 l\u00e1, h\u00e1damaden\u2019 b\u025b\u025b ye h\u0254r\u0254n ye, b\u025b\u025b k\u00e1 k\u00e1n l\u00e0nbe n\u00ed h\u00e1k\u025byaw l\u00e1. M\u0254g\u0254 b\u025b\u025b ye h\u00e1kilitigi ye, b\u025b\u025b ye h\u00e1kilima ye ; \u00f2 l\u00e0, \u00f9 k\u00e1 k\u00e1n k\u00e0 \u0272g\u0254n m\u00edna n\u00ed b\u00e1denya ye.", + "text": "Jula Wólo’ lá, hádamaden’ bɛɛ ye hɔrɔn ye, bɛɛ ká kán lànbe ní hákɛyaw lá. Mɔgɔ bɛɛ ye hákilitigi ye, bɛɛ ye hákilima ye ; ò là, ù ká kán kà ɲgɔn mína ní bádenya ye.", "metadata": { "languages": [ "hun", @@ -4732,7 +4732,7 @@ { "type": "NarrativeText", "element_id": "03b0bbddb1137224b43b690dfcc5b506", - "text": "Kabardian \u0426\u04cf\u044b\u0445\u0443 \u043f\u0441\u043e\u0440\u0438 \u0449\u0445\u044c\u044d\u0445\u0443\u0438\u0442\u0443, \u044f \u0449\u04cf\u044b\u0445\u044c\u044b\u043c\u0440\u044d \u044f \u0445\u0443\u044d\u0444\u0430\u0449\u044d\u0445\u044d\u043c\u0440\u044d\u043a\u04cf\u044d \u0437\u044d\u0445\u0443\u044d\u0434\u044d\u0443 \u043a\u044a\u0430\u043b\u044a\u0445\u0443\u0440. \u0410\u043a\u044a\u044b\u043b\u0440\u044d \u0437\u044d\u0445\u044d\u0449\u04cf\u044b\u043a\u04cf \u0433\u044a\u0443\u0430\u0437\u044d\u0440\u044d \u044f\u04cf\u044d\u0449\u0438, \u0437\u044b\u0440 \u0437\u044b\u043c \u0437\u044d\u043a\u044a\u0443\u044d\u0448 \u0437\u044d\u0445\u0430\u0449\u0406\u044d \u044f\u043a\u0443 \u0434\u044d\u043b\u044a\u0443 \u0437\u044d\u0445\u0443\u0449\u044b\u0442\u044b\u043d \u0445\u0443\u0435\u0439\u0445\u044d\u0449.", + "text": "Kabardian Цӏыху псори щхьэхуиту, я щӏыхьымрэ я хуэфащэхэмрэкӏэ зэхуэдэу къалъхур. 
Акъылрэ зэхэщӏыкӏ гъуазэрэ яӏэщи, зыр зым зэкъуэш зэхащІэ яку дэлъу зэхущытын хуейхэщ.", "metadata": { "languages": [ "rus" @@ -4753,7 +4753,7 @@ { "type": "NarrativeText", "element_id": "5da5e2f597a0e6fce26a5359c72395b3", - "text": "Kabiy\u00e9 Pal\u028al\u028a\u028a \u025byaaa n\u025b pa-t\u0269 y\u0254\u0254 w\u025b\u028a kpaagbaa n\u025b p\u025bw\u025b\u025b k\u0269ma\u014b wala \u025bs\u0269ndaa. Pal\u028al\u028a\u028a-w\u025b n\u025b p\u0254-l\u0254\u014b n\u025b pa-ma\u0263z\u0269m; mb\u028a yekina n\u025b p\u0254s\u0254\u0254l\u0269 \u0256ama se p\u025bk\u025b \u025byaa pa-t\u0269\u014bg\u025b.", + "text": "Kabiyé Palʊlʊʊ ɛyaaa nɛ pa-tɩ yɔɔ wɛʊ kpaagbaa nɛ pɛwɛɛ kɩmaŋ wala ɛsɩndaa. Palʊlʊʊ-wɛ nɛ pɔ-lɔŋ nɛ pa-maɣzɩm; mbʊ yekina nɛ pɔsɔɔlɩ ɖama se pɛkɛ ɛyaa pa-tɩŋgɛ.", "metadata": { "languages": [ "tgl" @@ -4774,7 +4774,7 @@ { "type": "NarrativeText", "element_id": "b1298a59ae52d3a285db4b52acce1f32", - "text": "Kabuverdianu Tudo ser humano na \u00eas mundo nac\u00ea libri e igual na s\u00ea dignidade e na s\u00eas dr\u00eato. Na s\u00eas razon e na s\u00eas conc\u00e9n\u00e7a, tudo arguem deb\u00ea porc\u00ead\u00ea pa co tudo guenti na sprito di fraternidadi.", + "text": "Kabuverdianu Tudo ser humano na ês mundo nacê libri e igual na sê dignidade e na sês drêto. Na sês razon e na sês concénça, tudo arguem debê porcêdê pa co tudo guenti na sprito di fraternidadi.", "metadata": { "languages": [ "por" @@ -4816,7 +4816,7 @@ { "type": "NarrativeText", "element_id": "f6f8a776d36f4db6ffdd50e83fee6488", - "text": "Kannada \u0c8e\u0cb2\u0ccd\u0cb2\u0cbe \u0cae\u0cbe\u0ca8\u0cb5\u0cb0\u0cc2 \u0cb8\u0ccd\u0cb5\u0ca4\u0c82\u0ca4\u0ccd\u0cb0\u0cb0\u0cbe\u0c97\u0cbf\u0caf\u0cc7 \u0c9c\u0ca8\u0cbf\u0cb8\u0cbf\u0ca6\u0ccd\u0ca6\u0cbe\u0cb0\u0cc6. 
\u0cb9\u0cbe\u0c97\u0cc2 \u0c98\u0ca8\u0ca4\u0cc6 \u0cae\u0ca4\u0ccd\u0ca4\u0cc1 \u0cb9\u0c95\u0ccd\u0c95\u0cc1\u0c97\u0cb3\u0cb2\u0ccd\u0cb2\u0cbf \u0cb8\u0cae\u0cbe\u0ca8\u0cb0\u0cbe\u0c97\u0cbf\u0ca6\u0ccd\u0ca6\u0cbe\u0cb0\u0cc6. \u0cb5\u0cbf\u0cb5\u0cc7\u0c95 \u0cae\u0ca4\u0ccd\u0ca4\u0cc1 \u0c85\u0c82\u0ca4\u0c83\u0c95\u0cb0\u0ca3\u0c97\u0cb3\u0ca8\u0ccd\u0ca8\u0cc1 \u0caa\u0ca1\u0cc6\u0ca6\u0cb5\u0cb0\u0cbe\u0ca6\u0ccd\u0ca6\u0cb0\u0cbf\u0c82\u0ca6 \u0c85\u0cb5\u0cb0\u0cc1 \u0caa\u0cb0\u0cb8\u0ccd\u0caa\u0cb0 \u0cb8\u0cb9\u0ccb\u0ca6\u0cb0 \u0cad\u0cbe\u0cb5\u0ca6\u0cbf\u0c82\u0ca6 \u0cb5\u0cb0\u0ccd\u0ca4\u0cbf\u0cb8\u0cac\u0cc7\u0c95\u0cc1.", + "text": "Kannada ಎಲ್ಲಾ ಮಾನವರೂ ಸ್ವತಂತ್ರರಾಗಿಯೇ ಜನಿಸಿದ್ದಾರೆ. ಹಾಗೂ ಘನತೆ ಮತ್ತು ಹಕ್ಕುಗಳಲ್ಲಿ ಸಮಾನರಾಗಿದ್ದಾರೆ. ವಿವೇಕ ಮತ್ತು ಅಂತಃಕರಣಗಳನ್ನು ಪಡೆದವರಾದ್ದರಿಂದ ಅವರು ಪರಸ್ಪರ ಸಹೋದರ ಭಾವದಿಂದ ವರ್ತಿಸಬೇಕು.", "metadata": { "languages": [ "kan" @@ -4837,7 +4837,7 @@ { "type": "NarrativeText", "element_id": "2600735e35ce8a6dc8243d2269bbeee5", - "text": "Kanuri, Central Adamgana woso kambe katambo ye daraja-a hakkiwa-ason kalkalye. Hankal-a nazaru-asoro k\u0259z\u0259pk\u0259 ye suro hal n\u0259mharamiben kamazasoga letaiyin ye.", + "text": "Kanuri, Central Adamgana woso kambe katambo ye daraja-a hakkiwa-ason kalkalye. 
Hankal-a nazaru-asoro kəzəpkə ye suro hal nəmharamiben kamazasoga letaiyin ye.", "metadata": { "languages": [ "swa", @@ -4884,7 +4884,7 @@ { "type": "NarrativeText", "element_id": "2e5fe352907c2d71abf3a0283032775f", - "text": "Kaqchikel, Central Konojel ri winaqi' kan kalaxib'en pe ri kolotaj\u00efk, ri junan kiq'ij, ri junan kejqalen, junan kich'ojib'al pa kik'aslen, xa achi'el k'a ri kik'ojlen, ri kinojib'al kichajin xa tik'amun k'a chi nimal\u00e4j konojel xtikajo' ki'.", + "text": "Kaqchikel, Central Konojel ri winaqi' kan kalaxib'en pe ri kolotajïk, ri junan kiq'ij, ri junan kejqalen, junan kich'ojib'al pa kik'aslen, xa achi'el k'a ri kik'ojlen, ri kinojib'al kichajin xa tik'amun k'a chi nimaläj konojel xtikajo' ki'.", "metadata": { "languages": [ "slv", @@ -4907,7 +4907,7 @@ { "type": "NarrativeText", "element_id": "23d27d0652af0739dbaa674e88fc9ae4", - "text": "Karakalpak \u04b2\u04d9\u043c\u043c\u0435 \u0430\u0434\u0430\u043c\u043b\u0430\u0440 \u04e9\u0437 \u049b\u04d9\u0434\u0438\u0440-\u049b\u044b\u043c\u0431\u0430\u0442\u044b \u0436\u04d9\u043d\u0435 \u04b3\u0443\u049b\u044b\u049b\u043b\u0430\u0440\u044b\u043d\u0434\u0430 \u0435\u0440\u043a\u0438\u043d \u04b3\u04d9\u043c \u0442\u0435\u04a3 \u0431\u043e\u043b\u044b\u043f \u0442\u0443\u045e\u044b\u043b\u0430\u0434\u044b. \u041e\u043b\u0430\u0440\u0493\u0430 \u0430\u049b\u044b\u043b \u04b3\u04d9\u043c \u04b3\u04af\u0436\u0434\u0430\u043d \u0431\u0435\u0440\u0438\u043b\u0433\u0435\u043d \u0431\u043e\u043b\u044b\u043f, \u0431\u0438\u0440-\u0431\u0438\u0440\u0438\u043d\u0435 \u0442\u0443\u045e\u044b\u0441\u049b\u0430\u043d\u043b\u044b\u049b \u0440\u0443\u045e\u0445\u044b\u043d\u0434\u0430\u0493\u044b \u049b\u0430\u0442\u043d\u0430\u0441\u0442\u0430 \u0431\u043e\u043b\u044b\u045e\u044b \u0442\u0438\u0439\u0438\u0441.", + "text": "Karakalpak Ҳәмме адамлар өз қәдир-қымбаты және ҳуқықларында еркин ҳәм тең болып туўылады. 
Оларға ақыл ҳәм ҳүждан берилген болып, бир-бирине туўысқанлық руўхындағы қатнаста болыўы тийис.", "metadata": { "languages": [ "rus" @@ -4928,7 +4928,7 @@ { "type": "NarrativeText", "element_id": "c6f580433e84639a19b178da5dc4b3a2", - "text": "Karelian Kai rahvas roittahes v\u00e4llinny da taza-arvozinnu omas arvos da oigevuksis. Jogahizele heis on annettu mieli da omatundo da heil v\u00e4lt\u00e4m\u00e4tt\u00e4h pid\u00e4y olla kesken\u00e4h, kui vellil.", + "text": "Karelian Kai rahvas roittahes vällinny da taza-arvozinnu omas arvos da oigevuksis. Jogahizele heis on annettu mieli da omatundo da heil vältämättäh pidäy olla keskenäh, kui vellil.", "metadata": { "languages": [ "est", @@ -4950,7 +4950,7 @@ { "type": "NarrativeText", "element_id": "87e368f61c4a1ba6e0a5743d4d2d41b2", - "text": "Kasem Ba loge n\u0254\u0254na maama se ba taa ye bedwe mo ba \u014bwea de ba chega seini, ye fefeo teira k\u0254taa. W\u025b p\u025b ba swa de bobo\u014ba mo se ba taa ye nubiu daane ye ba jege da \u014bwa\u014ba.", + "text": "Kasem Ba loge nɔɔna maama se ba taa ye bedwe mo ba ŋwea de ba chega seini, ye fefeo teira kɔtaa. Wɛ pɛ ba swa de boboŋa mo se ba taa ye nubiu daane ye ba jege da ŋwaŋa.", "metadata": { "languages": [ "som", @@ -4972,7 +4972,7 @@ { "type": "NarrativeText", "element_id": "1908a740d8aedadb521f39432a6cbed8", - "text": "Kazakh \u0411\u0430\u0440\u043b\u044b\u049b \u0430\u0434\u0430\u043c\u0434\u0430\u0440 \u0442\u0443\u043c\u044b\u0441\u044b\u043d\u0430\u043d \u0430\u0437\u0430\u0442 \u0436\u04d9\u043d\u0435 \u049b\u0430\u0434\u0456\u0440\u2010\u049b\u0430\u0441\u0438\u0435\u0442\u0456 \u043c\u0435\u043d \u043a\u04b1\u049b\u044b\u049b\u0442\u0430\u0440\u044b \u0442\u0435\u04a3 \u0431\u043e\u043b\u044b\u043f \u0434\u04af\u043d\u0438\u0435\u0433\u0435 \u043a\u0435\u043b\u0435\u0434\u0456. 
\u0410\u0434\u0430\u043c\u0434\u0430\u0440\u0493\u0430 \u0430\u049b\u044b\u043b\u2010\u043f\u0430\u0440\u0430\u0441\u0430\u0442, \u0430\u0440\u2010\u043e\u0436\u0434\u0430\u043d \u0431\u0435\u0440\u0456\u043b\u0433\u0435\u043d, \u0441\u043e\u043d\u0434\u044b\u049b\u0442\u0430\u043d \u043e\u043b\u0430\u0440 \u0431\u0456\u0440\u2010\u0431\u0456\u0440\u0456\u043c\u0435\u043d \u0442\u0443\u044b\u0441\u0442\u044b\u049b, \u0431\u0430\u0443\u044b\u0440\u043c\u0430\u043b\u0434\u044b\u049b \u049b\u0430\u0440\u044b\u043c\u2010\u049b\u0430\u0442\u044b\u043d\u0430\u0441 \u0436\u0430\u0441\u0430\u0443\u043b\u0430\u0440\u044b \u0442\u0438\u0456\u0441.", + "text": "Kazakh Барлық адамдар тумысынан азат және қадір‐қасиеті мен кұқықтары тең болып дүниеге келеді. Адамдарға ақыл‐парасат, ар‐ождан берілген, сондықтан олар бір‐бірімен туыстық, бауырмалдық қарым‐қатынас жасаулары тиіс.", "metadata": { "languages": [ "ukr", @@ -4994,7 +4994,7 @@ { "type": "NarrativeText", "element_id": "75b6a6751bcdf3ddfc1745d8e7118815", - "text": "Khakas \u041f\u043e\u043b\u0493\u0430\u043d \u043d\u0430 \u043a\u0456\u0437\u0456 \u043f\u043e\u0441 \u043f\u0430\u0437\u0430 \u0442\u0438\u04a3 \u0442\u04e7\u0440\u0456\u043f\u0447\u0435 \u043f\u0430\u0437\u0430 \u0442\u0438\u04a3 \u043f\u043e\u0441\u0442\u044b\u04a3 \u0441\u0438\u043d\u0456\u043d \u043f\u0456\u043b\u0456\u043d\u0433\u0435\u043d\u0456\u043d \u043f\u0430\u0437\u0430 \u0442\u04e7\u0440\u0435\u043b\u0435\u0440\u0456\u043d\u0456\u04a3\u0434\u0435 \u043f\u043e\u043b\u0447\u0430. 
\u041e\u043b\u0430\u0440\u0434\u044b\u04a3 \u0441\u0430\u0493\u044b\u043d\u0493\u0430\u043d\u044b \u043f\u0430\u0437\u0430 \u0430\u0440\u044b\u0493 \u0441\u0430\u0493\u044b\u0441 \u043f\u0430\u0440 \u043f\u0430\u0437\u0430 \u0445\u0430\u0440\u044b\u043d\u0434\u0430\u0441\u0442\u0430\u0440 \u0447\u0456\u043b\u0438 \u0442\u0443\u0434\u044b\u043d\u0430\u0440\u0493\u0430 \u043a\u0438\u0440\u0435\u043a\u0442\u0435\u0440.", + "text": "Khakas Полған на кізі пос паза тиң тӧріпче паза тиң постың синін пілінгенін паза тӧрелерініңде полча. Олардың сағынғаны паза арығ сағыс пар паза харындастар чіли тудынарға киректер.", "metadata": { "languages": [ "ukr", @@ -5016,7 +5016,7 @@ { "type": "NarrativeText", "element_id": "74a93facd90bf0553bdf368698baa2a5", - "text": "Khasi \u00cfa ki bynriew baroh la kha laitluid bad ki \u00efaryngkat ha ka burom bad ki hok. Ha ki la bsiap da ka bor pyrkhat bad ka jing\u00efatiplem bad ha ka mynsiem jingsngew shipara ki dei ban \u00efatrei bynrap lang.", + "text": "Khasi Ïa ki bynriew baroh la kha laitluid bad ki ïaryngkat ha ka burom bad ki hok. 
Ha ki la bsiap da ka bor pyrkhat bad ka jingïatiplem bad ha ka mynsiem jingsngew shipara ki dei ban ïatrei bynrap lang.", "metadata": { "languages": [ "ind", @@ -5038,7 +5038,7 @@ { "type": "Title", "element_id": "b6ab4d5f0569e217cd985de6b9f5ca73", - "text": "Khmer, Central \u1798\u1793\u17bb\u179f\u17d2\u179f\u1791\u17b6\u17c6\u1784\u17a2\u179f\u17cb \u1780\u17be\u178f\u1798\u1780\u1798\u17b6\u1793\u179f\u17c1\u179a\u17b8\u1797\u17b6\u1796 \u1793\u17b7\u1784\u179f\u1798\u1797\u17b6\u1796 \u1780\u17d2\u1793\u17bb\u1784\u1795\u17d2\u1793\u17c2\u1780\u179f\u17c1\u1785\u1780\u17d2\u178a\u17b8\u1790\u17d2\u179b\u17c3\u1790\u17d2\u1793\u17bc\u179a\u1793\u17b7\u1784\u179f\u17b7\u1791\u17d2\u1792\u17b7\u17d4 \u1798\u1793\u17bb\u179f\u17d2\u179f \u1798\u17b6\u1793\u179c\u17b7\u1785\u17b6\u179a\u178e\u1789\u17d2\u1789\u17b6\u178e\u1793\u17b7\u1784\u179f\u178f\u17b7\u179f\u1798\u17d2\u1794\u1787\u1789\u17d2\u1789\u17c8\u1787\u17b6\u1794\u17cb\u1796\u17b8\u1780\u17c6\u178e\u17be\u178f \u17a0\u17be\u1799\u1782\u1794\u17d2\u1794\u17b8\u1794\u17d2\u179a\u1796\u17d2\u179a\u17b9\u178f\u17d2\u178a\u1785\u17c6\u1796\u17c4\u17c7\u1782\u17d2\u1793\u17b6\u1791\u17c5\u179c\u17b7\u1789\u1791\u17c5\u1798\u1780\u1780\u17d2\u1793\u17bb\u1784\u179f\u17d2\u1798\u17b6\u179a\u178f\u17b8\u1797\u17b6\u178f\u179a\u1797\u17b6\u1796\u1787\u17b6\u1794\u1784\u1794\u17d2\u17a2\u17bc\u1793\u17d4", + "text": "Khmer, Central មនុស្សទាំងអស់ កើតមកមានសេរីភាព និងសមភាព ក្នុងផ្នែកសេចក្ដីថ្លៃថ្នូរនិងសិទ្ធិ។ មនុស្ស មានវិចារណញ្ញាណនិងសតិសម្បជញ្ញៈជាប់ពីកំណើត ហើយគប្បីប្រព្រឹត្ដចំពោះគ្នាទៅវិញទៅមកក្នុងស្មារតីភាតរភាពជាបងប្អូន។", "metadata": { "filetype": "text/plain", "data_source": { @@ -5056,7 +5056,7 @@ { "type": "Title", "element_id": "841467ed91005c2b65ccce68e9bac719", - "text": "Kh\u00fcn 
\u1a3e\u1a36\u1a69\u1a54\u1a7c\u1a34\u1a60\u1a26\u1a62\u1a49\u1a56\u1a63\u1a60\u1a3f\u1a20\u1a6e\u1a60\u1a2f\u1a68\u1a3e\u1a63\u1a3e\u1a66\u1a3b\u1a60\u1a26\u1a48\u1a41\u1a53\u1a62\u1a39\u1a60\u1a3f\u1a75\u1a26\u1a3b\u1a60\u1a3f\u1a26\u1a20\u1a60\u1a36\u1a62 \u1a36\u1a71\u1a20\u1a65\u1a32\u1a60\u1a32\u1a65\u1a48\u1a60\u1a20\u1a62 \u1a53\u1a62\u1a48\u1a65\u1a34\u1a60\u1a35\u1a65 \u1a32\u1a75\u1a63\u1a60\u1a26\u1a23\u1a73\u1a76\u1a23\u1a62\u1a3e\u1a66\u1a3e\u1a36\u1a6e\u1a63\u1a35\u1a3e\u1a60\u1a3e\u1a7c\u1a53\u1a62 \u1a23\u1a60\u1a45\u1a41\u1a37\u1a2d\u1a65\u1a37\u1a60\u1a32\u1a62\u1a32\u1a73\u1a75\u1a20\u1a60\u1a36\u1a62\u1a2f\u1a62\u1a60\u1a45\u1a60\u1a3f\u1a23\u1a60\u1a45\u1a63\u1a60\u1a3e\u1a39\u1a60\u1a3f\u1a75\u1a26\u1a3b\u1a60\u1a3f\u1a26\u1a20\u1a60\u1a36\u1a62", + "text": "Khün ᨾᨶᩩᩔ᩼ᨴ᩠ᨦᩢᩉᩖᩣ᩠ᨿᨠᩮ᩠ᨯᩨᨾᩣᨾᩦᨻ᩠ᨦᩈᩁᩓᩢᨹ᩠ᨿ᩵ᨦᨻ᩠ᨿᨦᨠ᩠ᨶᩢ ᨶᩱᨠᩥᨲ᩠ᨲᩥᩈ᩠ᨠᩢ ᩓᩢᩈᩥᨴ᩠ᨵᩥ ᨲ᩵ᩣ᩠ᨦᨣᩳ᩶ᨣᩢᨾᩦᨾᨶᩮᩣᨵᨾ᩠ᨾ᩼ᩓᩢ ᨣ᩠ᩅᩁᨷᨭᩥᨷ᩠ᨲᩢᨲᩳ᩵ᨠ᩠ᨶᩢᨯᩢ᩠ᩅ᩠ᨿᨣ᩠ᩅᩣ᩠ᨾᨹ᩠ᨿ᩵ᨦᨻ᩠ᨿᨦᨠ᩠ᨶᩢ", "metadata": { "languages": [ "tur" @@ -5077,7 +5077,7 @@ { "type": "NarrativeText", "element_id": "7abc18c11be0eb0d9f9526fbe76af972", - "text": "Kirghiz \u0411\u0430\u0440\u0434\u044b\u043a \u0430\u0434\u0430\u043c\u0434\u0430\u0440 \u04e9\u0437 \u0431\u0435\u0434\u0435\u043b\u0438\u043d\u0434\u0435 \u0436\u0430\u043d\u0430 \u0443\u043a\u0443\u043a\u0442\u0430\u0440\u044b\u043d\u0434\u0430 \u044d\u0440\u043a\u0438\u043d \u0436\u0430\u043d\u0430 \u0442\u0435\u04a3 \u0443\u043a\u0443\u043a\u0442\u0443\u0443 \u0431\u043e\u043b\u0443\u043f \u0436\u0430\u0440\u0430\u043b\u0430\u0442. 
\u0410\u043b\u0430\u0440\u0434\u044b\u043d \u0430\u04a3\u2010\u0441\u0435\u0437\u0438\u043c\u0438 \u043c\u0435\u043d\u0435\u043d \u0430\u0431\u0438\u0439\u0438\u0440\u0438 \u0431\u0430\u0440 \u0436\u0430\u043d\u0430 \u0431\u0438\u0440\u0438\u2010\u0431\u0438\u0440\u0438\u043d\u0435 \u0431\u0438\u0440 \u0442\u0443\u0443\u0433\u0430\u043d\u0434\u044b\u043a \u043c\u0430\u043c\u0438\u043b\u0435\u043a\u044b\u043b\u0443\u0443\u0433\u0430 \u0442\u0438\u0439\u0438\u0448.", + "text": "Kirghiz Бардык адамдар өз беделинде жана укуктарында эркин жана тең укуктуу болуп жаралат. Алардын аң‐сезими менен абийири бар жана бири‐бирине бир туугандык мамилекылууга тийиш.", "metadata": { "languages": [ "rus", @@ -5099,7 +5099,7 @@ { "type": "NarrativeText", "element_id": "2490211a751af08c831f437250d70884", - "text": "Kissi, Northern wanda tu cio M\u025b pil\u0254\u0254 o wol\u0254\u0254 ni, le waa o ba nd\u0254\u0254 cio, o b\u025b\u025blen kenando ni, o t\u0254ngdo ni, b\u025btu n\u0254n yiyando a kullo, o kon ni naan tu dua mim maalyan kalapil\u0254y\u025byi ni.", + "text": "Kissi, Northern wanda tu cio Mɛ pilɔɔ o wolɔɔ ni, le waa o ba ndɔɔ cio, o bɛɛlen kenando ni, o tɔngdo ni, bɛtu nɔn yiyando a kullo, o kon ni naan tu dua mim maalyan kalapilɔyɛyi ni.", "metadata": { "languages": [ "tgl", @@ -5166,7 +5166,7 @@ { "type": "NarrativeText", "element_id": "3da488a598903b0fa6a89a4d9b704219", - "text": "Komi-Permyak \u0411\u044b\u0434\u04e7\u0441 \u043e\u0442\u0438\u0440\u044b\u0441 \u0447\u0443\u0436\u04e7\u043d\u044b \u0432\u043e\u043b\u044c\u043d\u04e7\u0439\u0435\u0437\u04e7\u043d \u0434\u0430 \u04e7\u0442\u043a\u043e\u0434\u0434\u0435\u0437\u04e7\u043d \u0434\u043e\u0441\u0442\u043e\u0438\u043d\u0441\u0442\u0432\u043e\u044b\u043d \u0434\u0430 \u043f\u0440\u0430\u0432\u043e\u044d\u0437\u044b\u043d. 
\u041d\u044b\u043b\u04e7 \u0441\u0435\u0442\u04e7\u043c \u043c\u044b\u0432\u043a\u044b\u0434 \u0434\u0430 \u0441\u043e\u0432\u0435\u0441\u0442\u044c \u043e\u0432\u043d\u044b \u04e7\u0442\u0430\u043c\u04e7\u0434\u043d\u044b\u0441\u043a\u04e7\u0442 \u043a\u044b\u0434\u0437 \u0432\u043e\u043d\u043d\u044d\u0437\u043b\u04e7.", + "text": "Komi-Permyak Быдӧс отирыс чужӧны вольнӧйезӧн да ӧткоддезӧн достоинствоын да правоэзын. Нылӧ сетӧм мывкыд да совесть овны ӧтамӧдныскӧт кыдз воннэзлӧ.", "metadata": { "languages": [ "rus" @@ -5251,7 +5251,7 @@ { "type": "NarrativeText", "element_id": "71cc3fa5f30f347d8e225e871139661f", - "text": "Korean \ubaa8\ub4e0 \uc778\uac04\uc740 \ud0dc\uc5b4\ub0a0 \ub54c\ubd80\ud130 \uc790\uc720\ub85c\uc6b0\uba70 \uadf8 \uc874\uc5c4\uacfc \uad8c\ub9ac\uc5d0 \uc788\uc5b4 \ub3d9\ub4f1\ud558\ub2e4. \uc778\uac04\uc740 \ucc9c\ubd80\uc801\uc73c\ub85c \uc774\uc131\uacfc \uc591\uc2ec\uc744 \ubd80\uc5ec\ubc1b\uc558\uc73c\uba70 \uc11c\ub85c \ud615\uc81c\uc560\uc758 \uc815\uc2e0\uc73c\ub85c \ud589\ub3d9\ud558\uc5ec\uc57c \ud55c\ub2e4.", + "text": "Korean 모든 인간은 태어날 때부터 자유로우며 그 존엄과 권리에 있어 동등하다. 
인간은 천부적으로 이성과 양심을 부여받았으며 서로 형제애의 정신으로 행동하여야 한다.", "metadata": { "languages": [ "kor" @@ -5272,7 +5272,7 @@ { "type": "NarrativeText", "element_id": "ec837c06df9c110c22e734be4704e763", - "text": "Kpelle, Guinea Nukan gele kaa p\u0259l\u0259 kaa tan\u0254n, yili\u0253a nu k\u0259le maawiy\u0259 p\u0259l\u0259 da t\u0254\u0254i gaa \u0272ei y\u025bn\u025byii hu k\u025bp\u0259l\u0259 kaal\u0254 tan\u0254n; di k\u025bm\u025bni a nukan \u014baa \u0253\u0259 g\u025b\u025b hw\u0259k\u025bli w\u025blik\u025bmaa \u0259 l\u0254 di luwai.", + "text": "Kpelle, Guinea Nukan gele kaa pələ kaa tanɔn, yiliɓa nu kəle maawiyə pələ da tɔɔi gaa ɲei yɛnɛyii hu kɛpələ kaalɔ tanɔn; di kɛmɛni a nukan ŋaa ɓə gɛɛ hwəkɛli wɛlikɛmaa ə lɔ di luwai.", "metadata": { "languages": [ "som", @@ -5294,7 +5294,7 @@ { "type": "NarrativeText", "element_id": "6322dea6cfe74f4e5e0272752dccffb4", - "text": "Krio \u025bvrib\u0254di b\u0254n fri \u025bn g\u025bt in yon rayt, n\u0254n wan n\u0254 pas in k\u0254mpin. Wi \u0254l ebul f\u0254 tink \u025bn f\u025bn\u0254t wetin rayt \u025bn r\u0254\u014b pantap dat wi f\u0254 sabi aw f\u0254 liv l\u025bk wan big famili.", + "text": "Krio ɛvribɔdi bɔn fri ɛn gɛt in yon rayt, nɔn wan nɔ pas in kɔmpin. Wi ɔl ebul fɔ tink ɛn fɛnɔt wetin rayt ɛn rɔŋ pantap dat wi fɔ sabi aw fɔ liv lɛk wan big famili.", "metadata": { "languages": [ "ind", @@ -5317,7 +5317,7 @@ { "type": "NarrativeText", "element_id": "e4653071cb4a8a4f59ca7f62a50afbb4", - "text": "Kulango, Bouna Igooyoo p\u025b\u025b h\u028bn taa. B\u0254 p\u025b\u025b jabaga b\u0254r\u0254. H\u0254 ya g\u028b\u028bn\u2019n b\u0254\u0254 h\u025b p\u025b\u025b, h\u0254 h\u025b gus\u025bg\u025b\u2019n.", + "text": "Kulango, Bouna Igooyoo pɛɛ hʋn taa. Bɔ pɛɛ jabaga bɔrɔ. 
Hɔ ya gʋʋn’n bɔɔ hɛ pɛɛ, hɔ hɛ gusɛgɛ’n.", "metadata": { "languages": [ "tgl", @@ -5339,7 +5339,7 @@ { "type": "NarrativeText", "element_id": "df4b88e2493c88f7b478eaece77dfdb7", - "text": "Kurdish, Central Hem\u00fb mirov azad \u00fb di weqar \u00fb mafan de wekhev t\u00ean dinyay\u00ea. Ew xwed\u00ee hi\u015f \u00fb \u015fu\u00fbr in \u00fb div\u00ea li hember hev bi zihniyeteke bratiy\u00ea bilivin.", + "text": "Kurdish, Central Hemû mirov azad û di weqar û mafan de wekhev tên dinyayê. Ew xwedî hiş û şuûr in û divê li hember hev bi zihniyeteke bratiyê bilivin.", "metadata": { "languages": [ "tur", @@ -5362,7 +5362,7 @@ { "type": "NarrativeText", "element_id": "26a7611f793432bd8ce6f6cb35470ad5", - "text": "Kurdish, Northern Hem\u00fb mirov azad \u00fb di weqar \u00fb mafan de wekhev t\u00ean dinyay\u00ea. Ew xwed\u00ee hi\u015f \u00fb \u015fu\u00fbr in \u00fb div\u00ea li hember hev bi zihniyeteke bratiy\u00ea bilivin.", + "text": "Kurdish, Northern Hemû mirov azad û di weqar û mafan de wekhev tên dinyayê. Ew xwedî hiş û şuûr in û divê li hember hev bi zihniyeteke bratiyê bilivin.", "metadata": { "languages": [ "nld", @@ -5386,7 +5386,7 @@ { "type": "NarrativeText", "element_id": "0eaf9123417f2794584c7cfd20e10aee", - "text": "Ladin D\u00f6tes les porsones nasc l\u00ebdies y cun la medema dignit\u00e9 y i medemi d\u00ebr\u0107. Ares \u00e0 na rajun y na cosci\u00ebnza y m\u00ebss s\u2019incunt\u00e8 \u00f6na cun l\u2019atra te n spirit de fraternit\u00e9.", + "text": "Ladin Dötes les porsones nasc lëdies y cun la medema dignité y i medemi dërć. 
Ares à na rajun y na cosciënza y mëss s’incuntè öna cun l’atra te n spirit de fraternité.", "metadata": { "languages": [ "spa", @@ -5429,7 +5429,7 @@ { "type": "NarrativeText", "element_id": "5590b8f08d34a13d98afa307c3a0db0a", - "text": "Lamnso' \u00c1 dz\u0259\u0300\u0259\u0301 wir dz\u0259\u0300m r\u00e9\u014br\u00e9\u014b f\u00f3 ghv\u0259m w\u00f9n \u00e0 f\u00f3 gh\u00e0y, \u00e1 yo\u2019 dz\u0259\u0300\u0259\u0301 wir ms\u00f2\u014b ji kw\u00e0n. W\u00ecr dz\u0259\u0300m k\u0300m k f\u00f3mo woo f\u00f3 kw\u00e0\u2019t\u00ec w\u00f9n \u00e0 f\u00f3 vifii, a w\u00f9 k\u00e9r f\u00f3 a yi\u00ec e w\u00f9m\u00f2\u2019 woo w\u00edr moo f\u0259\u0301r v\u0259.", + "text": "Lamnso' Á dzə̀ə́ wir dzə̀m réŋréŋ fó ghvəm wùn à fó ghày, á yo’ dzə̀ə́ wir msòŋ ji kwàn. Wìr dzə̀m k̀m k fómo woo fó kwà’tì wùn à fó vifii, a wù kér fó a yiì e wùmò’ woo wír moo fə́r və.", "metadata": { "languages": [ "vie", @@ -5452,7 +5452,7 @@ { "type": "Title", "element_id": "ae451bf94c5e07470540741833822372", - "text": "Lao \u0ea1\u0eb0\u0e99\u0eb8\u0e94\u0ec0\u0e81\u0eb5\u0e94\u0ea1\u0eb2\u0ea1\u0eb5\u0eaa\u0eb4\u0e94\u0ec0\u0eaa\u0ea5\u0eb5\u0e9e\u0eb2\u0e9a \u0ec1\u0ea5\u0eb0 \u0eaa\u0eb0\u0ec0\u0edd\u0eb5\u0edc\u0ec9\u0eb2\u0e81\u0eb1\u0e99\u0ec3\u0e99\u0e97\u0eb2\u0e87\u0e81\u0ebd\u0e94\u0e95\u0eb4\u0eaa\u0eb1\u0e81 \u0ec1\u0ea5\u0eb0 \u0e97\u0eb2\u0e87\u0eaa\u0eb4\u0e94\u0e94\u0ec9\u0ea7\u0e8d\u0ea1\u0eb0\u0e99\u0eb8\u0e94\u0ea1\u0eb5\u0eaa\u0eb0\u0e95\u0eb4\u0eaa\u0eb3\u0e9b\u0eb1\u0e94\u0e8a\u0eb1\u0e99\u0e8d\u0eb0(\u0eae\u0eb9\u0ec9\u0e94\u0eb5\u0eae\u0eb9\u0ec9\u0e8a\u0ebb\u0ec8\u0ea7)\u0ec1\u0ea5\u0eb0\u0ea1\u0eb5\u0ea1\u0eb0\u0ec2\u0e99\u0e97\u0eb3\u0e88\u0eb7\u0ec8\u0e87\u0e95\u0ec9\u0ead\u0e87\u0e9b\u0eb0\u0e9e\u0eb6\u0e94\u0e95\u0ebb\u0e99\u0e95\u0ecd\u0ec8\u0e81\u0eb1\u0e99\u0ec3\u0e99\u0e97\u0eb2\u0e87\u0e9e\u0eb5\u0ec8\u0e99\u0ec9\u0ead\u0e87.", + "text": "Lao ມະນຸດເກີດມາມີສິດເສລີພາບ ແລະ ສະເໝີໜ້າກັນໃນທາງກຽດຕິສັກ ແລະ 
ທາງສິດດ້ວຍມະນຸດມີສະຕິສຳປັດຊັນຍະ(ຮູ້ດີຮູ້ຊົ່ວ)ແລະມີມະໂນທຳຈື່ງຕ້ອງປະພຶດຕົນຕໍ່ກັນໃນທາງພີ່ນ້ອງ.", "metadata": { "filetype": "text/plain", "data_source": { @@ -5514,7 +5514,7 @@ { "type": "NarrativeText", "element_id": "6cddab55572e83cd679bab750a745b46", - "text": "Latvian Visi cilv\u0113ki piedzimst br\u012bvi un vienl\u012bdz\u012bgi sav\u0101 pa\u0161cie\u0146\u0101 un ties\u012bb\u0101s. Vi\u0146i ir apvelt\u012bti ar sapr\u0101tu un sirdsapzi\u0146u, un vi\u0146iem j\u0101izturas citam pret citu br\u0101l\u012bbas gar\u0101.", + "text": "Latvian Visi cilvēki piedzimst brīvi un vienlīdzīgi savā pašcieņā un tiesībās. Viņi ir apveltīti ar saprātu un sirdsapziņu, un viņiem jāizturas citam pret citu brālības garā.", "metadata": { "languages": [ "lav" @@ -5535,7 +5535,7 @@ { "type": "NarrativeText", "element_id": "84c7cce831ebebafd545d3767089bc8f", - "text": "Latvian (2) Visi cilv\u0113ki piedzimst br\u012bvi un vienl\u012bdz\u012bgi cie\u0146\u0101 un ties\u012bb\u0101s. Vi\u0146iem ir dots sapr\u0101ts un sirdsapzi\u0146a, un vi\u0146iem citam pret citu j\u0101izturas br\u0101l\u012bbas gar\u0101.", + "text": "Latvian (2) Visi cilvēki piedzimst brīvi un vienlīdzīgi cieņā un tiesībās. Viņiem ir dots saprāts un sirdsapziņa, un viņiem citam pret citu jāizturas brālības garā.", "metadata": { "languages": [ "lav" @@ -5556,7 +5556,7 @@ { "type": "NarrativeText", "element_id": "c431b1dcba75dca04cdeaaa5388f19c0", - "text": "Ligurian Tutte e personn-e nascian libere e p\u00e6ge in dignit\u00e6 e driti. Son dot\u00e6 de raxon e coscensa e gh\u2019an da ag\u00ee l\u2019unn-a verso l\u2019atra inte \u2019n spirito de fradelansa.", + "text": "Ligurian Tutte e personn-e nascian libere e pæge in dignitæ e driti. 
Son dotæ de raxon e coscensa e gh’an da agî l’unn-a verso l’atra inte ’n spirito de fradelansa.", "metadata": { "languages": [ "ita" @@ -5577,7 +5577,7 @@ { "type": "NarrativeText", "element_id": "693ef7caa32675b109893e37846d9f13", - "text": "Limba, West-Central Biya-m\u025bti fooma be kiyo ka kuyanka\u014b i\u014b kas\u025bmb\u025b m\u025bn\u025b in ka yiki. Bind\u025b ki\u014b ba niy\u0254 in masim\u0254k\u0254, maka yiina wo ka hu w\u025bndi yande.", + "text": "Limba, West-Central Biya-mɛti fooma be kiyo ka kuyankaŋ iŋ kasɛmbɛ mɛnɛ in ka yiki. Bindɛ kiŋ ba niyɔ in masimɔkɔ, maka yiina wo ka hu wɛndi yande.", "metadata": { "languages": [ "swa" @@ -5620,7 +5620,7 @@ { "type": "NarrativeText", "element_id": "6fcb989c6e738221bc467859b15c2d51", - "text": "Lingala (tones) Bato ny\u0254\u0301ns\u0254 na mbo\u0301tama bazali\u0301 ns\u0254\u0301mi\u0301 mpe\u0301 bako\u0301ka\u0301ni\u0301 na lim\u025bmya mpe\u0301 makoki\u0301. Bazali\u0301 na may\u025b\u0301l\u025b mpe\u0301 basenge\u0301li\u0301 kova\u0301nda na bondeko o ka\u0301ti na bango\u0301.", + "text": "Lingala (tones) Bato nyɔ́nsɔ na mbótama bazalí nsɔ́mí mpé bakókání na limɛmya mpé makokí. Bazalí na mayɛ́lɛ mpé basengélí kovánda na bondeko o káti na bangó.", "metadata": { "languages": [ "tgl", @@ -5643,7 +5643,7 @@ { "type": "NarrativeText", "element_id": "353adb6fb432616b715be3966a6d79bd", - "text": "Lithuanian Visi \u017emon\u0117s gimsta laisvi ir lyg\u016bs savo orumu ir teis\u0117mis. Jiems suteiktas protas ir s\u0105\u017ein\u0117 ir jie turi elgtis vienas kito at\u017evilgiu kaip broliai.", + "text": "Lithuanian Visi žmonės gimsta laisvi ir lygūs savo orumu ir teisėmis. 
Jiems suteiktas protas ir sąžinė ir jie turi elgtis vienas kito atžvilgiu kaip broliai.", "metadata": { "languages": [ "lit" @@ -5664,7 +5664,7 @@ { "type": "NarrativeText", "element_id": "3e4f829a968d5f615b4245e85dc21d08", - "text": "Lobi Teehuu s\u028bn\u0254 n ther \u025b\u025b n\u0269\u0269 b\u028bn\u0254 wa n do deea\u0294 s\u0269 w\u028b n makha sam\u0269n\u0269 na n\u00e0 h\u028b t\u0269n\u025bpar r\u00e0. Thangba ti y\u025br \u00e0 p\u025b y\u025br j\u0269\u0269r n\u00e0 f\u0269lw\u025b s\u0269 a teena waan f\u028bkha omkhaa.", + "text": "Lobi Teehuu sʋnɔ n ther ɛɛ nɩɩ bʋnɔ wa n do deeaʔ sɩ wʋ n makha samɩnɩ na nà hʋ tɩnɛpar rà. Thangba ti yɛr à pɛ yɛr jɩɩr nà fɩlwɛ sɩ a teena waan fʋkha omkhaa.", "metadata": { "languages": [ "som" @@ -5770,7 +5770,7 @@ { "type": "NarrativeText", "element_id": "3f8cca735e9bb8ee68adff123b7ebdda", - "text": "Luxembourgeois All M\u00ebnsch k\u00ebnnt fr\u00e4i a mat deer selwechter Dignit\u00e9it an dene selwechte Rechter op d'Welt. Jiddereen huet s\u00e4i Verstand a s\u00e4i Gew\u00ebsse krut an soll an engem Geescht vu Bridderlechkeet denen anere g\u00e9intiwwer handelen.", + "text": "Luxembourgeois All Mënsch kënnt fräi a mat deer selwechter Dignitéit an dene selwechte Rechter op d'Welt. Jiddereen huet säi Verstand a säi Gewësse krut an soll an engem Geescht vu Bridderlechkeet denen anere géintiwwer handelen.", "metadata": { "languages": [ "nld", @@ -5792,7 +5792,7 @@ { "type": "NarrativeText", "element_id": "1a2cc3d892dc79a4b68cc59db7a69ea1", - "text": "Macedonian \u0421\u0438\u0442\u0435 \u0447\u043e\u0432\u0435\u0447\u043a\u0438 \u0441\u0443\u0448\u0442\u0435\u0441\u0442\u0432\u0430 \u0441\u0435 \u0440\u0430\u0453\u0430\u0430\u0442 \u0441\u043b\u043e\u0431\u043e\u0434\u043d\u0438 \u0438 \u0435\u0434\u043d\u0430\u043a\u0432\u0438 \u043f\u043e \u0434\u043e\u0441\u0442\u043e\u0438\u043d\u0441\u0442\u0432\u043e \u0438 \u043f\u0440\u0430\u0432\u0430. 
\u0422\u0438\u0435 \u0441\u0435 \u043e\u0431\u0434\u0430\u0440\u0435\u043d\u0438 \u0441\u043e \u0440\u0430\u0437\u0443\u043c \u0438 \u0441\u043e\u0432\u0435\u0441\u0442 \u0438 \u0442\u0440\u0435\u0431\u0430 \u0434\u0430 \u0441\u0435 \u043e\u0434\u043d\u0435\u0441\u0443\u0432\u0430\u0430\u0442 \u0435\u0434\u0435\u043d \u043a\u043e\u043d \u0434\u0440\u0443\u0433 \u0432\u043e \u0434\u0443\u0445\u043e\u0442 \u043d\u0430 \u043e\u043f\u0448\u0442\u043e \u0447\u043e\u0432\u0435\u0447\u043a\u0430\u0442\u0430 \u043f\u0440\u0438\u043f\u0430\u0434\u043d\u043e\u0441\u0442.", + "text": "Macedonian Сите човечки суштества се раѓаат слободни и еднакви по достоинство и права. Тие се обдарени со разум и совест и треба да се однесуваат еден кон друг во духот на општо човечката припадност.", "metadata": { "languages": [ "mkd" @@ -5834,7 +5834,7 @@ { "type": "UncategorizedText", "element_id": "2e4fdb7fcd2748cce07840226331c829", - "text": "Magahi \u0938\u092c \u0932\u094b\u0917 \u0906\u091c\u093e\u0926\u0947 \u091c\u0928\u094d\u092e \u0932\u0947\u092c \u0939\u0908 \u0924\u0925\u093e \u0938\u092c \u0915\u0947 \u092c\u0930\u093e\u092c\u0930\u0947 \u0938\u092e\u094d\u092e\u093e\u0928 \u0914\u0930 \u0905\u0927\u093f\u0915\u093e\u0930 \u0939\u0907\u0964 \u0939\u0941\u0928\u0916\u094b \u0915\u0947 \u092a\u093e\u0938 \u0938\u092e\u091d-\u092c\u0942\u091d \u0914\u0930 \u0905\u0902\u0924:\u0915\u0930\u0923 \u0915\u0947 \u0906\u0935\u093e\u091c \u0939\u094b\u092c \u0939\u0908\u0964 \u0914\u0930 \u0939\u0941\u0928\u0915\u093e \u0926\u094b\u0938\u0930\u094b \u0915\u0947 \u0938\u093e\u0925 \u092d\u093e\u0908\u091a\u093e\u0930\u093e \u0915\u0947 \u0935\u094d\u092f\u0935\u0939\u093e\u0930 \u0915\u0930\u0947 \u092a\u0921\u093c \u0939\u0908\u0964", + "text": "Magahi सब लोग आजादे जन्म लेब हई तथा सब के बराबरे सम्मान और अधिकार हइ। हुनखो के पास समझ-बूझ और अंत:करण के आवाज होब हई। और हुनका दोसरो के साथ भाईचारा के व्यवहार करे पड़ हई।", "metadata": { "languages": [ "hin" @@ -5855,7 +5855,7 @@ { "type": 
"UncategorizedText", "element_id": "d691df62a8af33ae0b9c152a092e32a9", - "text": "Maithili \u0938\u092d \u092e\u093e\u0928\u0935 \u091c\u0928\u094d\u092e\u0924\u0903 \u0938\u094d\u0935\u0924\u0928\u094d\u0924\u094d\u0930 \u0905\u091b\u093f \u0924\u0925\u093e \u0917\u0930\u093f\u092e\u093e \u0906\u02bc \u0905\u0927\u093f\u0915\u093e\u0930\u092e\u0947 \u0938\u092e\u093e\u0928 \u0905\u091b\u093f\u0964 \u0938\u092d\u0915\u0947\u0901 \u0905\u092a\u0928\u2013\u0905\u092a\u0928 \u092c\u0941\u0926\u094d\u0927\u093f \u0906\u02bc \u0935\u093f\u0935\u0947\u0915 \u091b\u0948\u0915 \u0906\u0913\u0930 \u0938\u092d\u0915\u0947\u0901 \u090f\u0915 \u0926\u094b\u0938\u0930\u093e\u0915 \u092a\u094d\u0930\u0924\u093f \u0938\u094c\u0939\u093e\u0930\u094d\u0926\u092a\u0942\u0930\u094d\u0923 \u0935\u094d\u092f\u0935\u0939\u093e\u0930 \u0915\u0930\u092c\u093e\u0915 \u091a\u093e\u0939\u0940\u0964", + "text": "Maithili सभ मानव जन्मतः स्वतन्त्र अछि तथा गरिमा आʼ अधिकारमे समान अछि। सभकेँ अपन–अपन बुद्धि आʼ विवेक छैक आओर सभकेँ एक दोसराक प्रति सौहार्दपूर्ण व्यवहार करबाक चाही।", "metadata": { "languages": [ "hin", @@ -5877,7 +5877,7 @@ { "type": "NarrativeText", "element_id": "d73cc566475e568433ff76c1fb6af485", - "text": "Makhuwa Atthu othene aniyaria oolikana ni owilamula moota ontthunaya okhala, variyari v\u2019edignidade ni edireito. Akhalanne esaria ni otthokelela, ahaana akhalasaka othene saya vamurettele.", + "text": "Makhuwa Atthu othene aniyaria oolikana ni owilamula moota ontthunaya okhala, variyari v’edignidade ni edireito. Akhalanne esaria ni otthokelela, ahaana akhalasaka othene saya vamurettele.", "metadata": { "languages": [ "swa", @@ -5900,7 +5900,7 @@ { "type": "NarrativeText", "element_id": "166af43c7950017574b550ca090a6ff8", - "text": "Makonde Vanu vohevohe vaidile n\u2019chilambo valendene. Vanijaliwa ulimala vene. Pavele vanu pave na ulongo.", + "text": "Makonde Vanu vohevohe vaidile n’chilambo valendene. Vanijaliwa ulimala vene. 
Pavele vanu pave na ulongo.", "metadata": { "languages": [ "est", @@ -5945,7 +5945,7 @@ { "type": "NarrativeText", "element_id": "e74053233c7584ace3ddb4357ac894b7", - "text": "Malay (Arabic) \u0633\u0645\u0648\u0627 \u0645\u0623\u0646\u0633\u064a \u062f\u0644\u0627\u0647\u064a\u0631\u0643\u0646 \u0628\u064a\u0628\u0633 \u062f\u0627\u0646 \u0633\u0627\u0645\u0631\u0627\u062a \u062f\u0631\u064a \u0633\u06ac\u064a \u0643\u0645\u0648\u0644\u064a\u0623\u0646 \u062f\u0627\u0646 \u062d\u0642\u0662. \u0645\u0631\u064a\u0643 \u0645\u0645\u06a4\u0648\u06bd\u0627\u064a \u06a4\u0645\u064a\u0643\u064a\u0631\u0646 \u062f\u0627\u0646 \u06a4\u0631\u0627\u0633\u0623\u0646 \u0647\u0627\u062a\u064a \u062f\u0627\u0646 \u0647\u0646\u062f\u0642\u0644\u0647 \u0628\u0631\u062a\u064a\u0646\u062f\u0642 \u062f \u0627\u0646\u062a\u0627\u0631\u0627 \u0633\u0627\u062a\u0648 \u0633\u0627\u0645 \u0644\u0627\u0626\u0646 \u062f\u06a0\u0646 \u0633\u0645\u0627\u06a0\u062a \u06a4\u0631\u0633\u0627\u0648\u062f\u0627\u0631\u0623\u0646.", + "text": "Malay (Arabic) سموا مأنسي دلاهيركن بيبس دان سامرات دري سڬي كموليأن دان حق٢. مريك ممڤوڽاي ڤميكيرن دان ڤراسأن هاتي دان هندقله برتيندق د انتارا ساتو سام لائن دڠن سماڠت ڤرساودارأن.", "metadata": { "languages": [ "ara", @@ -5988,7 +5988,7 @@ { "type": "NarrativeText", "element_id": "563cefb3266bb81ad240fb3d631fb5b0", - "text": "Malayalam \u0d2e\u0d28\u0d41\u0d37\u0d4d\u0d2f\u0d30\u0d46\u0d32\u0d4d\u0d32\u0d3e\u0d35\u0d30\u0d41\u0d02 \u0d24\u0d41\u0d32\u0d4d\u0d2f\u0d3e\u0d35\u0d15\u0d3e\u0d36\u0d19\u0d4d\u0d19\u0d33\u0d4b\u0d1f\u0d41\u0d02 \u0d05\u0d28\u0d4d\u0d24\u0d38\u0d4d\u0d38\u0d4b\u0d1f\u0d41\u0d02 \u0d38\u0d4d\u0d35\u0d3e\u0d24\u0d28\u0d4d\u0d24\u0d4d\u0d30\u0d4d\u0d2f\u0d24\u0d4d\u0d24\u0d4b\u0d1f\u0d41\u0d02\u0d15\u0d42\u0d1f\u0d3f \u0d1c\u0d28\u0d3f\u0d1a\u0d4d\u0d1a\u0d3f\u0d1f\u0d4d\u0d1f\u0d41\u0d33\u0d4d\u0d33\u0d35\u0d30\u0d3e\u0d23\u0d4d\u200c. 
\u0d05\u0d28\u0d4d\u0d2f\u0d4b\u0d28\u0d4d\u0d2f\u0d02 \u0d2d\u0d4d\u0d30\u0d3e\u0d24\u0d43\u0d2d\u0d3e\u0d35\u0d24\u0d4d\u0d24\u0d4b\u0d1f\u0d46 \u0d2a\u0d46\u0d30\u0d41\u0d2e\u0d3e\u0d31\u0d41\u0d35\u0d3e\u0d28\u0d3e\u0d23\u0d4d\u200c \u0d2e\u0d28\u0d41\u0d37\u0d4d\u0d2f\u0d28\u0d4d\u0d28\u0d41 \u0d35\u0d3f\u0d35\u0d47\u0d15\u0d2c\u0d41\u0d26\u0d4d\u0d27\u0d3f\u0d2f\u0d41\u0d02 \u0d2e\u0d28\u0d38\u0d4d\u0d38\u0d3e\u0d15\u0d4d\u0d37\u0d3f\u0d2f\u0d41\u0d02 \u0d38\u0d3f\u0d26\u0d4d\u0d27\u0d2e\u0d3e\u0d2f\u0d3f\u0d30\u0d3f\u0d15\u0d4d\u0d15\u0d41\u0d28\u0d4d\u0d28\u0d24\u0d4d\u200c.", + "text": "Malayalam മനുഷ്യരെല്ലാവരും തുല്യാവകാശങ്ങളോടും അന്തസ്സോടും സ്വാതന്ത്ര്യത്തോടുംകൂടി ജനിച്ചിട്ടുള്ളവരാണ്‌. അന്യോന്യം ഭ്രാതൃഭാവത്തോടെ പെരുമാറുവാനാണ്‌ മനുഷ്യന്നു വിവേകബുദ്ധിയും മനസ്സാക്ഷിയും സിദ്ധമായിരിക്കുന്നത്‌.", "metadata": { "languages": [ "mal" @@ -6009,7 +6009,7 @@ { "type": "NarrativeText", "element_id": "a1c5471ea369ac3ba44f2829262f62aa", - "text": "Malayalam \u0d2e\u0d28\u0d41\u0d37\u0d4d\u0d2f\u0d30\u0d46\u0d32\u0d4d\u0d32\u0d3e\u0d35\u0d30\u0d41\u0d02 \u0d24\u0d41\u0d32\u0d4d\u0d2f\u0d3e\u0d35\u0d15\u0d3e\u0d36\u0d19\u0d4d\u0d19\u0d33\u0d4b\u0d1f\u0d41\u0d02 \u0d05\u0d28\u0d4d\u0d24\u0d38\u0d4d\u0d38\u0d4b\u0d1f\u0d41\u0d02 \u0d38\u0d4d\u0d35\u0d3e\u0d24\u0d28\u0d4d\u0d24\u0d4d\u0d30\u0d4d\u0d2f\u0d24\u0d4d\u0d24\u0d4b\u0d1f\u0d41\u0d02\u0d15\u0d42\u0d1f\u0d3f \u0d1c\u0d28\u0d3f\u0d1a\u0d4d\u0d1a\u0d3f\u0d1f\u0d4d\u0d1f\u0d41\u0d33\u0d4d\u0d33\u0d35\u0d30\u0d3e\u0d23\u0d4d\u200c. 
\u0d05\u0d28\u0d4d\u0d2f\u0d4b\u0d28\u0d4d\u0d2f\u0d02 \u0d2d\u0d4d\u0d30\u0d3e\u0d24\u0d43\u0d2d\u0d3e\u0d35\u0d24\u0d4d\u0d24\u0d4b\u0d1f\u0d46 \u0d2a\u0d46\u0d30\u0d41\u0d2e\u0d3e\u0d31\u0d41\u0d35\u0d3e\u0d28\u0d3e\u0d23\u0d4d\u200c \u0d2e\u0d28\u0d41\u0d37\u0d4d\u0d2f\u0d28\u0d4d\u0d28\u0d41 \u0d35\u0d3f\u0d35\u0d47\u0d15\u0d2c\u0d41\u0d26\u0d4d\u0d27\u0d3f\u0d2f\u0d41\u0d02 \u0d2e\u0d28\u0d38\u0d4d\u0d38\u0d3e\u0d15\u0d4d\u0d37\u0d3f\u0d2f\u0d41\u0d02 \u0d38\u0d3f\u0d26\u0d4d\u0d27\u0d2e\u0d3e\u0d2f\u0d3f\u0d30\u0d3f\u0d15\u0d4d\u0d15\u0d41\u0d28\u0d4d\u0d28\u0d24\u0d4d\u200c.", + "text": "Malayalam മനുഷ്യരെല്ലാവരും തുല്യാവകാശങ്ങളോടും അന്തസ്സോടും സ്വാതന്ത്ര്യത്തോടുംകൂടി ജനിച്ചിട്ടുള്ളവരാണ്‌. അന്യോന്യം ഭ്രാതൃഭാവത്തോടെ പെരുമാറുവാനാണ്‌ മനുഷ്യന്നു വിവേകബുദ്ധിയും മനസ്സാക്ഷിയും സിദ്ധമായിരിക്കുന്നത്‌.", "metadata": { "languages": [ "mal" @@ -6030,7 +6030,7 @@ { "type": "NarrativeText", "element_id": "abe9340337f1806d7c7bb1e55e23819f", - "text": "Maldivian \u0780\u07aa\u0783\u07a8\u0780\u07a7 \u0787\u07a8\u0782\u07b0\u0790\u07a7\u0782\u07aa\u0782\u07b0\u0788\u07ac\u0790\u07b0 \u078b\u07aa\u0782\u07a8\u0794\u07ac\u0787\u07a6\u0781\u07b0 \u0787\u07aa\u078a\u07a6\u0782\u07b0\u0788\u07a6\u0782\u07a9\u060c \u0789\u07a8\u0782\u07a8\u0788\u07a6\u0782\u07b0\u0786\u07a6\u0789\u07aa\u078e\u07a6\u0787\u07a8\u060c \u0780\u07a6\u0789\u07a6\u0780\u07a6\u0789\u07a6 \u0799\u07a6\u0787\u07b0\u07a4\u07aa\u078c\u07a6\u0786\u07a6\u0786\u07a7\u0787\u07ac\u0786\u07aa\u060c \u0780\u07a6\u0789\u07a6\u0780\u07a6\u0789\u07a6 \u078b\u07a6\u0783\u07a6\u0796\u07a6\u0787\u07ac\u0787\u07b0\u078e\u07a6\u0787\u07a8 \u0786\u07a6\u0789\u07ad\u0780\u07a8\u078c\u07ac\u0788\u07a8\u078e\u07ac\u0782\u07b0\u0788\u07a7 \u0784\u07a6\u0787\u07ac\u0787\u07b0\u078e\u07ac \u078e\u07ae\u078c\u07aa\u078e\u07a6\u0787\u07ac\u0788\u07ac. 
\u0780\u07ac\u0794\u07ae \u0788\u07a8\u0790\u07b0\u0782\u07aa\u0789\u07a7\u0787\u07a8\u060c \u0780\u07ac\u0794\u07ae\u0784\u07aa\u0787\u07b0\u078b\u07a9\u078e\u07ac \u0784\u07a7\u0783\u07aa \u0787\u07ac\u0789\u07a9\u0780\u07aa\u0782\u07b0\u0782\u07a6\u0781\u07b0 \u078d\u07a8\u0784\u07a8\u078e\u07ac\u0782\u07b0\u0788\u07ac\u0787\u07ac\u0788\u07ac. \u0787\u07a6\u078b\u07a8 \u0787\u07ac\u0786\u07a6\u0786\u07aa \u0787\u07a6\u0782\u07ac\u0786\u07a6\u0786\u07a7\u0789\u07ac\u078b\u07aa \u0787\u07ac\u0789\u07a9\u0780\u07aa\u0782\u07b0 \u0789\u07aa\u07a2\u07a7\u0789\u07a6\u078d\u07a7\u078c\u07b0 \u0786\u07aa\u0783\u07a6\u0782\u07b0\u0788\u07a7\u0782\u07a9\u060c \u0787\u07aa\u079a\u07aa\u0787\u07b0\u0788\u07a6\u078c\u07b0\u078c\u07ac\u0783\u07a8\u0786\u07a6\u0789\u07aa\u078e\u07ac \u0783\u07ab\u0799\u07ac\u0787\u07b0\u078e\u07a6\u0787\u07ac\u0788\u07ac.", + "text": "Maldivian ހުރިހާ އިންސާނުންވެސް ދުނިޔެއަށް އުފަންވަނީ، މިނިވަންކަމުގައި، ހަމަހަމަ ޙައްޤުތަކަކާއެކު، ހަމަހަމަ ދަރަޖައެއްގައި ކަމޭހިތެވިގެންވާ ބައެއްގެ ގޮތުގައެވެ. ހެޔޮ ވިސްނުމާއި، ހެޔޮބުއްދީގެ ބާރު އެމީހުންނަށް ލިބިގެންވެއެވެ. އަދި އެކަކު އަނެކަކާމެދު އެމީހުން މުޢާމަލާތް ކުރަންވާނީ، އުޚުއްވަތްތެރިކަމުގެ ރޫޙެއްގައެވެ.", "metadata": { "languages": [ "ara" @@ -6051,7 +6051,7 @@ { "type": "NarrativeText", "element_id": "c3f212c4f2a219b94139b577bd336587", - "text": "Maltese Il-bnedmin kollha jitwieldu \u0127ielsa u ugwali fid-dinjit\u00e0 u d-drittijiet. Huma mog\u0127nija bir-ra\u0121uni u bil-kuxjenza u g\u0127andhom i\u0121ibu ru\u0127hom ma\u2019 xulxin bi spirtu ta\u2019 a\u0127wa.", + "text": "Maltese Il-bnedmin kollha jitwieldu ħielsa u ugwali fid-dinjità u d-drittijiet. 
Huma mogħnija bir-raġuni u bil-kuxjenza u għandhom iġibu ruħhom ma’ xulxin bi spirtu ta’ aħwa.", "metadata": { "languages": [ "hrv", @@ -6094,7 +6094,7 @@ { "type": "NarrativeText", "element_id": "53014d120e3ef288a2152a64e8cc5fae", - "text": "Maninkakan, Eastern Adamadennu b\u025b\u025b s\u0254d\u0254n\u0272a kakan, h\u0254r\u0254ya d\u0254, fabaden\u0272a d\u0254 ani sariya ta fan d\u0254. Hankili ni s\u0254n\u0254m\u025b ye alu b\u025b\u025b ma, a kakan wo d\u0254 alu ye bakelen\u0272a sila lataaman alu \u0272\u0254\u0254n t\u025b.", + "text": "Maninkakan, Eastern Adamadennu bɛɛ sɔdɔnɲa kakan, hɔrɔya dɔ, fabadenɲa dɔ ani sariya ta fan dɔ. Hankili ni sɔnɔmɛ ye alu bɛɛ ma, a kakan wo dɔ alu ye bakelenɲa sila lataaman alu ɲɔɔn tɛ.", "metadata": { "languages": [ "ind", @@ -6160,7 +6160,7 @@ { "type": "UncategorizedText", "element_id": "dae3f973f6bbdd3401ce4aa3e297b361", - "text": "Mapudungun Kom pu mogence kisuzuam mvlekey, kom cegeygvn, logkogeygvn ka piwkegeygvn, nieygvn kimvn fey mew mvley ta\u00f1i yamniewael ka epu\u00f1pvle kejuwael egvn.", + "text": "Mapudungun Kom pu mogence kisuzuam mvlekey, kom cegeygvn, logkogeygvn ka piwkegeygvn, nieygvn kimvn fey mew mvley tañi yamniewael ka epuñpvle kejuwael egvn.", "metadata": { "languages": [ "ind", @@ -6183,7 +6183,7 @@ { "type": "NarrativeText", "element_id": "ecca335c6a309f063e4df0ad38eecd27", - "text": "Marathi \u0938\u0930\u094d\u0935 \u092e\u093e\u0928\u0935\u0940 \u0935\u094d\u092f\u0915\u094d\u0924\u093f \u091c\u0928\u094d\u092e\u0924\u0903\u091a \u0938\u094d\u0935\u0924\u0902\u0924\u094d\u0930 \u0906\u0939\u0947\u0924 \u0935 \u0924\u094d\u092f\u093e\u0902\u0928\u093e \u0938\u092e\u093e\u0928 \u092a\u094d\u0930\u0924\u093f\u0937\u094d\u0920\u093e \u0935 \u0938\u092e\u093e\u0928 \u0905\u0927\u093f\u0915\u093e\u0930 \u0906\u0939\u0947\u0924. 
\u0924\u094d\u092f\u093e\u0902\u0928\u093e \u0935\u093f\u091a\u093e\u0930\u0936\u0915\u094d\u0924\u093f \u0935 \u0938\u0926\u0938\u0935\u093f\u0926\u094d\u0935\u0947\u0915\u092c\u0941\u0926\u094d\u0927\u093f \u0932\u093e\u092d\u0932\u0947\u0932\u0940 \u0906\u0939\u0947. \u0935 \u0924\u094d\u092f\u093e\u0902\u0928\u0940 \u090f\u0915\u092e\u0947\u0915\u093e\u0902\u0936\u0940 \u092c\u0902\u0927\u0941\u0924\u094d\u092f\u093e\u091a\u094d\u092f\u093e \u092d\u093e\u0935\u0928\u0947\u0928\u0947 \u0906\u091a\u0930\u0923 \u0915\u0930\u093e\u0935\u0947.", + "text": "Marathi सर्व मानवी व्यक्ति जन्मतःच स्वतंत्र आहेत व त्यांना समान प्रतिष्ठा व समान अधिकार आहेत. त्यांना विचारशक्ति व सदसविद्वेकबुद्धि लाभलेली आहे. व त्यांनी एकमेकांशी बंधुत्याच्या भावनेने आचरण करावे.", "metadata": { "languages": [ "mar" @@ -6225,7 +6225,7 @@ { "type": "NarrativeText", "element_id": "3a69fb7fe5d36459edf30ffa8f0fb0bc", - "text": "Mats\u00e9s Chidon tishaido yec matses abitedimbo b\u00ebdamboec isnanac b\u00ebdambo ictsiash. Chieshnanac icsambo ictsiash. Abitedimbo b\u00ebdamboec tabadac b\u00ebdambo ictsiash. Shubu abents\u00ebcquid\u00ebn tabadac birnboec abitedi tabadac b\u00ebdambo ictsiash - quequin chuipan\u00ebdash nidaid abitedino\u00ebsh cho-choquidon.", + "text": "Matsés Chidon tishaido yec matses abitedimbo bëdamboec isnanac bëdambo ictsiash. Chieshnanac icsambo ictsiash. Abitedimbo bëdamboec tabadac bëdambo ictsiash. 
Shubu abentsëcquidën tabadac birnboec abitedi tabadac bëdambo ictsiash - quequin chuipanëdash nidaid abitedinoësh cho-choquidon.", "metadata": { "languages": [ "eng", @@ -6247,7 +6247,7 @@ { "type": "NarrativeText", "element_id": "9c3467ac29002d9da69f15b063e13924", - "text": "Maya, Yucat\u00e1n Tul\u00e1akal w\u00edinik ku s\u00edijil j\u00e1alk\u02bcab yetel keet u tsiikul yetel Najmal Sijnalil, beytun xan na\u02bcata\u02bcan sijnalil yetel no\u02bcoja\u02bcanil u tuukulo\u02bc, k\u02bca\u02bcabet u bisikuba bey l\u00e1aktzilil yetel tul\u00e1akal u baatzile\u02bc.", + "text": "Maya, Yucatán Tuláakal wíinik ku síijil jáalkʼab yetel keet u tsiikul yetel Najmal Sijnalil, beytun xan naʼataʼan sijnalil yetel noʼojaʼanil u tuukuloʼ, kʼaʼabet u bisikuba bey láaktzilil yetel tuláakal u baatzileʼ.", "metadata": { "languages": [ "hun", @@ -6271,7 +6271,7 @@ { "type": "UncategorizedText", "element_id": "7947c1a7d2c92cd1fea5311d4d9241ba", - "text": "Mazahua Central Texe yo nte\u0331'e\u0331 chjetrjoji, angezeji ximi xo'oji \u00f1eje k'inchiji, nesta ra ngara na jo'o k'o dyaja e nte\u0331'e\u0331.", + "text": "Mazahua Central Texe yo nte̱'e̱ chjetrjoji, angezeji ximi xo'oji ñeje k'inchiji, nesta ra ngara na jo'o k'o dyaja e nte̱'e̱.", "metadata": { "languages": [ "hrv", @@ -6294,7 +6294,7 @@ { "type": "NarrativeText", "element_id": "ded8e8298bf9edcaae477d35c01be283", - "text": "Mazatec, Ixcatl\u00e1n Nga ndindie xuta ngatsen de\u2019e ko ngondsejen ngatjin-kjua nga xchandinkon nt\u2019a ngondsejen ngatjin kokjin-tokon,kotjinkjua nga takie engajan skuendinkon xkjin.", + "text": "Mazatec, Ixcatlán Nga ndindie xuta ngatsen de’e ko ngondsejen ngatjin-kjua nga xchandinkon nt’a ngondsejen ngatjin kokjin-tokon,kotjinkjua nga takie engajan skuendinkon xkjin.", "metadata": { "languages": [ "sqi", @@ -6337,7 +6337,7 @@ { "type": "NarrativeText", "element_id": "407b0080d05f944ba83f5c3e722bde13", - "text": "Mbundu (009) Mutu uoso uoso a mu vuala ni ufolo ni kutena kumoxi mu kijingu ni mu 
ubinganu. Mu kilembu kia kubanga ni mu ubanzelu, Atena u\u00ea kubanga ioso kua akua mu muxima ua tululuka mba upange.", + "text": "Mbundu (009) Mutu uoso uoso a mu vuala ni ufolo ni kutena kumoxi mu kijingu ni mu ubinganu. Mu kilembu kia kubanga ni mu ubanzelu, Atena uê kubanga ioso kua akua mu muxima ua tululuka mba upange.", "metadata": { "languages": [ "swa" @@ -6358,7 +6358,7 @@ { "type": "NarrativeText", "element_id": "d76da3518499aeb0e43b4c133556d135", - "text": "Mende Numuvuisia Kp\u025bl\u025b\u025b ta ti le t\u025b y\u025b nduw\u0254 ya hu, tao ti nuvuu yei k\u025b\u025b ti l\u0254nyi maa h\u025bwung\u0254. Kiiya k\u025b\u025b hindaluahu g\u0254\u0254la a y\u025bl\u0254 ti hun. Fale mahoung\u0254 ti ti ny\u0254ny\u0254hu hoi kia ndeegaa.", + "text": "Mende Numuvuisia Kpɛlɛɛ ta ti le tɛ yɛ nduwɔ ya hu, tao ti nuvuu yei kɛɛ ti lɔnyi maa hɛwungɔ. Kiiya kɛɛ hindaluahu gɔɔla a yɛlɔ ti hun. Fale mahoungɔ ti ti nyɔnyɔhu hoi kia ndeegaa.", "metadata": { "languages": [ "swa", @@ -6380,7 +6380,7 @@ { "type": "NarrativeText", "element_id": "ac3c7d9dea662f8ba1dfb383045ce903", - "text": "Micmac Msit mimajulnu\u2019k weskwijinu\u2019ltijik alsumsultijik aqq newte\u2019 tett wkpimte\u2019tmut aqq koqwajo\u2019taqnn wejkul\u2019aqmititl.", + "text": "Micmac Msit mimajulnu’k weskwijinu’ltijik alsumsultijik aqq newte’ tett wkpimte’tmut aqq koqwajo’taqnn wejkul’aqmititl.", "metadata": { "languages": [ "est", @@ -6425,7 +6425,7 @@ { "type": "NarrativeText", "element_id": "208949d3fb140dd9413f78a99feda832", - "text": "M\u00edskito Upla sut ba kulkanka lakara, airaitka nanira bara pri, sin, aikuki, baku takisa. Bamna sins laka bri baku, lukanka bain pri baku aimuihni lakara, pana pana tabaikan kaiasa.", + "text": "Mískito Upla sut ba kulkanka lakara, airaitka nanira bara pri, sin, aikuki, baku takisa. 
Bamna sins laka bri baku, lukanka bain pri baku aimuihni lakara, pana pana tabaikan kaiasa.", "metadata": { "languages": [ "ind" @@ -6446,7 +6446,7 @@ { "type": "NarrativeText", "element_id": "db840a4da82f82310ee839cd22112f22", - "text": "Mixe, Totontepec Tum akijpxa xa ve\u2019e jayu kye\u2019ex, ve\u2019em ax j\u00f6\u2019n tyukidaakj\u00fcva tijaty m\u00ebkin; ve\u2019empa axj\u00f6\u2019n j\u00e4 jy\u00f6\u00f6jtykin di yaknaxy, jats oy myujaty\u00f6\u00f6\u2019t\u00ebjk di m\u00eb\u00ebt nayjavaj\u00fct.", + "text": "Mixe, Totontepec Tum akijpxa xa ve’e jayu kye’ex, ve’em ax jö’n tyukidaakjüva tijaty mëkin; ve’empa axjö’n jä jyööjtykin di yaknaxy, jats oy myujatyöö’tëjk di mëët nayjavajüt.", "metadata": { "languages": [ "fin" @@ -6467,7 +6467,7 @@ { "type": "NarrativeText", "element_id": "bbe9fa33187b976f4032c34c6ca2fabf", - "text": "Mixtec, Metlat\u00f3noc Taka ma \u00f1ayi nguiakoi \u00f1ayivi \u00f1atu na ja'a tnu'u ja kusa'a ndeva'\u00f1a-i, su'uva kajito va'a\u00f1a-i, yuka ku ja jini\u00f1u'u ja kukototna-i.", + "text": "Mixtec, Metlatónoc Taka ma ñayi nguiakoi ñayivi ñatu na ja'a tnu'u ja kusa'a ndeva'ña-i, su'uva kajito va'aña-i, yuka ku ja jiniñu'u ja kukototna-i.", "metadata": { "languages": [ "hrv", @@ -6489,7 +6489,7 @@ { "type": "NarrativeText", "element_id": "03b6cefe8d16c5c896f974b268a52302", - "text": "Mizo Mi zawng zawng hi zal\u00eana piang kan ni a, zahawmna leh dikna chanvoah intluk tl\u00e2ng vek kan ni. Chhia leh tha hriatna f\u00eem neia siam kan nih avangin kan mihring puite chungah inunauna thinlung kan pu tlat tur a ni.", + "text": "Mizo Mi zawng zawng hi zalêna piang kan ni a, zahawmna leh dikna chanvoah intluk tlâng vek kan ni. 
Chhia leh tha hriatna fîm neia siam kan nih avangin kan mihring puite chungah inunauna thinlung kan pu tlat tur a ni.", "metadata": { "languages": [ "ind", @@ -6512,7 +6512,7 @@ { "type": "NarrativeText", "element_id": "ec6cdd4d644ddfaafbb05d9216ebbd7c", - "text": "Moba Nifoi kul maal yendu buam po i, k b yudand yen b yiko-nba bi\u025b ja. B m\u0254g maalm g ban yal g \u014ban, g bi\u025b baa bu yen lieb naataann n nin\u014b i.", + "text": "Moba Nifoi kul maal yendu buam po i, k b yudand yen b yiko-nba biɛ ja. B mɔg maalm g ban yal g ŋan, g biɛ baa bu yen lieb naataann n ninŋ i.", "metadata": { "languages": [ "ind", @@ -6535,7 +6535,7 @@ { "type": "UncategorizedText", "element_id": "0d21e19f00c8cb7264e83c01c0f02161", - "text": "Mon \u1019\u105e\u102d\u101f\u103a\u1002\u1019\u1060\u102d\u102f\u105a\u103a \u1021\u102d\u102f\u103f\u102e\u102f\u1010\u1021\u103a\u101d\u103d\u1036 \u1005\u1014\u1030\u101e\u1060\u1038\u1010\u102d\u1010\u103a \u1014\u1030\u1002\u101d\u103a\u1002\u105e\u1034 \u1012\u103e\u103a\u1019\u105e\u102d\u101f\u103a\u101e\u1060\u1038\u1015\u103d\u1038\u1021\u102d\u102f\u1010\u103a\u1010\u102f\u1032 \u1021\u1001\u1031\u102b\u105a\u103a\u1021\u101b\u102c \u1000\u1031\u102f\u102c\u1036 \u101e\u102d\u1000\u1039\u1001\u102c\u1019\u105e\u102d\u101f\u103a\u1010\u1021\u103a \u1010\u102f\u1015\u103a \u101e\u105f\u101f\u103a\u101b\u104b \u1019\u105e\u102d\u101f\u103a\u1010\u1021\u103a\u1002\u103e\u103a \u1014\u103d\u1036\u1000\u1035\u102f\u1013\u101b\u103a\u1005\u105a\u103a\u1001\u103c\u105a\u103a\u1000\u1031\u102f\u102c\u1036 \u101e\u1019\u1039\u1010\u102e\u100a\u102c\u100f\u103a \u1013\u101d\u103a\u1015\u102b\u103a\u1015\u1032\u102b \u1001\u102d\u102f\u101f\u103a\u1015\u101b\u1031\u1036\u1014\u103d\u1036\u1010\u102f\u1032 \u100a\u1038\u1019\u103d\u1032 \u1000\u1031\u102f\u102c\u1036 \u100a\u1038\u1019\u103d\u1032 \u1011\u1031\u1000\u103a\u1000\u1035\u102f \u101e\u1039\u1012\u1038\u1012\u1039\u1002\u1031\u1010\u103a\u1017\u1000\u103a 
\u1006\u1000\u103a\u1006\u1031\u102c\u1036\u100a\u1038\u101e\u1039\u1000\u1021\u103a \u1014\u1005\u102d\u102f\u1010\u103a\u1013\u102c\u1010\u103a\u1000\u1031\u102c\u1036\u1012\u1031\u1036\u1021\u101b\u1031\u104b", + "text": "Mon မၞိဟ်ဂမၠိုၚ် အိုဿီုတအ်ဝွံ စနူသၠးတိတ် နူဂဝ်ဂၞဴ ဒှ်မၞိဟ်သၠးပွးအိုတ်တုဲ အခေါၚ်အရာ ကေုာံ သိက္ခာမၞိဟ်တအ် တုပ် သၟဟ်ရ။ မၞိဟ်တအ်ဂှ် နွံကဵုဓရ်စၚ်ခြၚ်ကေုာံ သမ္တီညာဏ် ဓဝ်ပါ်ပဲါ ခိုဟ်ပရေံနွံတုဲ ညးမွဲ ကေုာံ ညးမွဲ ထေက်ကဵု သ္ဒးဒ္ဂေတ်ဗက် ဆက်ဆောံညးသ္ကအ် နစိုတ်ဓာတ်ကောံဒေံအရေ။", "metadata": { "filetype": "text/plain", "data_source": { @@ -6553,7 +6553,7 @@ { "type": "NarrativeText", "element_id": "a36553665277971db5d4c68908f99088", - "text": "Mongolian, Halh (Cyrillic) \u0425\u04af\u043d \u0431\u04af\u0440 \u0442\u04e9\u0440\u0436 \u043c\u044d\u043d\u0434\u043b\u044d\u0445\u044d\u0434 \u044d\u0440\u0445 \u0447\u04e9\u043b\u04e9\u04e9\u0442\u044d\u0439, \u0430\u0434\u0438\u043b\u0445\u0430\u043d \u043d\u044d\u0440 \u0442\u04e9\u0440\u0442\u044d\u0439, \u0438\u0436\u0438\u043b \u044d\u0440\u0445\u0442\u044d\u0439 \u0431\u0430\u0439\u0434\u0430\u0433. \u041e\u044e\u0443\u043d \u0443\u0445\u0430\u0430\u043d, \u043d\u0430\u043d\u0434\u0438\u043d \u0447\u0430\u043d\u0430\u0440 \u0437\u0430\u044f\u0430\u0441\u0430\u043d \u0445\u04af\u043d \u0433\u044d\u0433\u0447 \u04e9\u04e9\u0440 \u0445\u043e\u043e\u0440\u043e\u043d\u0434\u043e\u043e \u0430\u0445\u0430\u043d \u0434\u04af\u04af\u0433\u0438\u0439\u043d \u04af\u0437\u044d\u043b \u0441\u0430\u043d\u0430\u0430\u0433\u0430\u0430\u0440 \u0445\u0430\u0440\u044c\u0446\u0430\u0445 \u0443\u0447\u0438\u0440\u0442\u0430\u0439.", + "text": "Mongolian, Halh (Cyrillic) Хүн бүр төрж мэндлэхэд эрх чөлөөтэй, адилхан нэр төртэй, ижил эрхтэй байдаг. 
Оюун ухаан, нандин чанар заяасан хүн гэгч өөр хоорондоо ахан дүүгийн үзэл санаагаар харьцах учиртай.", "metadata": { "languages": [ "rus" @@ -6595,7 +6595,7 @@ { "type": "UncategorizedText", "element_id": "ffd087e56c47b9405e77d2f08dca7d1e", - "text": "\u182c\u1826\u182e\u1826\u1828 \u182a\u1826\u1837 \u1832\u1825\u1837\u1825\u1835\u1826 \u182e\u1821\u1828\u1833\u1821\u182f\u1821\u182c\u1826 \u1821\u1837\u182c\u1821 \u1834\u1822\u182f\u1825\u182d\u1821\u202f\u1832\u1821\u1822\u1802 \u1820\u1833\u1820\u182f\u1822\u182c\u1820\u1828 \u1828\u1821\u1837\u180e\u1821 \u1832\u1825\u1837\u1825\u202f\u1832\u1821\u1822\u1802 \u1822\u1835\u1822\u182f \u1821\u1837\u182c\u1821\u202f\u1832\u1821\u1822 \u182a\u1820\u1822\u1820\u182d\u1803 \u1823\u1836\u1824\u1828 \u1824\u182c\u1820\u182d\u1820\u1828\u1802 \u1828\u1820\u1828\u1833\u1822\u1828 \u1834\u1822\u1828\u1820\u1837 \u1835\u1820\u1836\u1820\u182d\u1820\u1830\u1820\u1828 \u182c\u1826\u182e\u1826\u1828 \u182c\u1821\u182d\u1834\u1822 \u1825\u182d\u1821\u1837\u180e\u1821 \u182c\u1823\u182d\u1823\u1837\u1823\u1828\u1833\u1823\u180e\u1828 \u1820\u182c\u1820\u1828 \u1833\u1821\u182d\u1826\u1826\u202f\u1822\u1828 \u1826\u1835\u1822\u182f \u1830\u1820\u1828\u1820\u182d\u1820\u202f\u1825\u1820\u1837 \u182c\u1820\u1837\u1822\u1834\u1820\u182c\u1825 \u1824\u1834\u1822\u1837\u202f\u1832\u1820\u1822\u1803", + "text": "ᠬᠦᠮᠦᠨ ᠪᠦᠷ ᠲᠥᠷᠥᠵᠦ ᠮᠡᠨᠳᠡᠯᠡᠬᠦ ᠡᠷᠬᠡ ᠴᠢᠯᠥᠭᠡ ᠲᠡᠢ᠂ ᠠᠳᠠᠯᠢᠬᠠᠨ ᠨᠡᠷ᠎ᠡ ᠲᠥᠷᠥ ᠲᠡᠢ᠂ ᠢᠵᠢᠯ ᠡᠷᠬᠡ ᠲᠡᠢ ᠪᠠᠢᠠᠭ᠃ ᠣᠶᠤᠨ ᠤᠬᠠᠭᠠᠨ᠂ ᠨᠠᠨᠳᠢᠨ ᠴᠢᠨᠠᠷ ᠵᠠᠶᠠᠭᠠᠰᠠᠨ ᠬᠦᠮᠦᠨ ᠬᠡᠭᠴᠢ ᠥᠭᠡᠷ᠎ᠡ ᠬᠣᠭᠣᠷᠣᠨᠳᠣ᠎ᠨ ᠠᠬᠠᠨ ᠳᠡᠭᠦᠦ ᠢᠨ ᠦᠵᠢᠯ ᠰᠠᠨᠠᠭᠠ ᠥᠠᠷ ᠬᠠᠷᠢᠴᠠᠬᠥ ᠤᠴᠢᠷ ᠲᠠᠢ᠃", "metadata": { "filetype": "text/plain", "data_source": { @@ -6613,7 +6613,7 @@ { "type": "NarrativeText", "element_id": "3d0a59b543e077c2f0c391add9b38a89", - "text": "Montenegrin Sva ljudska bi\u0107a ra\u0111aju se slobodna i jednaka u dostojanstvu i pravima. 
Ona su obdarena razumom i savje\u0161\u0107u i jedni prema drugima treba da postupaju u duhu bratstva.", + "text": "Montenegrin Sva ljudska bića rađaju se slobodna i jednaka u dostojanstvu i pravima. Ona su obdarena razumom i savješću i jedni prema drugima treba da postupaju u duhu bratstva.", "metadata": { "languages": [ "hrv" @@ -6634,7 +6634,7 @@ { "type": "NarrativeText", "element_id": "86eff2400c116e5d00b9f1b3e17e0d7f", - "text": "M\u00f2or\u00e9 Ninsaalb\u00e3 f\u00e3a s\u00e3 n doge, ned f\u00e3a so a menga, ned pa rogd n yaa yamb ye, neb\u00e3 f\u00e3a zema taab b yel-segd\u0269 la b burk\u0129ndlem w\u025b\u025bnge\u0303. Neb\u00e3 f\u00e3a tara yam la tagsgo, ned f\u00e3a togame n v\u0269\u0269nd ne a to saam-biir p\u028age\u0303.", + "text": "Mòoré Ninsaalbã fãa sã n doge, ned fãa so a menga, ned pa rogd n yaa yamb ye, nebã fãa zema taab b yel-segdɩ la b burkĩndlem wɛɛngẽ. Nebã fãa tara yam la tagsgo, ned fãa togame n vɩɩnd ne a to saam-biir pʊgẽ.", "metadata": { "languages": [ "som", @@ -6657,7 +6657,7 @@ { "type": "NarrativeText", "element_id": "91eb2842523b8e930ee6199a0098fa14", - "text": "Moro Le\u0111a pre\u0111 lal\u01dd\u014b\u01ddnia l\u00ebb\u01ddr\u00ebinialo na l\u01dd\u027d\u01ddwa\u1e6fo e\u014ben \u014b\u01dd\u0111amia na e\u014ben pre\u0111 i\u014bi \u014b\u01ddrca\u0111a\u1e6fo \u1e6fa le\u0111a al\u01ddfi\u0111i. L\u00ebn\u014bulu pre\u0111 lanan\u00ebinu \u0111\u01ddnaca \u0111ame \u027det\u01dd\u027deto na ara g\u01dd\u014b\u01ddra \u014ben\u014ban\u1e6fa al\u01dd\u027d\u01ddwa\u0111a\u1e6fe alam\u01dd\u0111ai\u0111e b\u01dd\u027dan usilaga g\u01dd\u014b\u01ddl\u01dd\u014b\u01ddnia na g\u01dd\u014borba.", + "text": "Moro Leđa pređ lalǝŋǝnia lëbǝrëinialo na lǝɽǝwaṯo eŋen ŋǝđamia na eŋen pređ iŋi ŋǝrcađaṯo ṯa leđa alǝfiđi. 
Lënŋulu pređ lananëinu đǝnaca đame ɽetǝɽeto na ara gǝŋǝra ŋenŋanṯa alǝɽǝwađaṯe alamǝđaiđe bǝɽan usilaga gǝŋǝlǝŋǝnia na gǝŋorba.", "metadata": { "languages": [ "hrv" @@ -6700,7 +6700,7 @@ { "type": "NarrativeText", "element_id": "25ab4cdce4c3199b55a4bd49864e981b", - "text": "Naga, Ao Meimchir ajak temeten aser tashi kasa n\u00fcji nung asor. Parnok dak bilemtetts\u00fc shisats\u00fc aser tangatetba kasa ag\u00fcja aliba jagi k\u00fclem adianu rongnung tanela ka nung lungjema alits\u00fcla.", + "text": "Naga, Ao Meimchir ajak temeten aser tashi kasa nüji nung asor. Parnok dak bilemtettsü shisatsü aser tangatetba kasa agüja aliba jagi külem adianu rongnung tanela ka nung lungjema alitsüla.", "metadata": { "languages": [ "ind", @@ -6744,7 +6744,7 @@ { "type": "NarrativeText", "element_id": "9376ea8b7100165bb8bd466c00f5bdcc", - "text": "Nanai \u0425\u044d\u043c\u0442\u0443 \u043d\u0430\u0438\u0306\u0441\u0430\u043b \u0433\u0438\u043f\u0430\u043b\u0438\u043d, \u043c\u044d\u043d\u044d \u0433\u044d\u0431\u0443\u0434\u0438\u044d\u0440\u0438, \u043f\u0440\u0430\u0432\u043e\u0441\u0430\u043b\u0434\u0438\u0430\u0440\u0438 \u044d\u043c\u0443\u0442\u0443 \u0431\u0430\u043b\u0434\u0438\u0447\u0438. \u041d\u0435\u0308\u0430\u043d\u0447\u0438 \u043c\u0443\u0440\u0443\u04c8\u043a\u0443, \u0434\u044d\u0440\u044d\u043b\u043a\u0443, \u0434\u0438\u0430 \u0434\u0438\u0430\u0432\u0430\u0440\u0438 \u0430-\u043d\u044d\u0443-\u043c\u044d\u0442 \u0431\u043e\u0434\u043e\u043c\u0430\u0440\u0438 \u0442\u0430\u0433\u0438\u043b\u0430\u0438\u0306\u0447\u0438.", + "text": "Nanai Хэмту найсал гипалин, мэнэ гэбудиэри, правосалдиари эмуту балдичи. 
Нёанчи муруӈку, дэрэлку, диа диавари а-нэу-мэт бодомари тагилайчи.", "metadata": { "languages": [ "rus" @@ -6765,7 +6765,7 @@ { "type": "NarrativeText", "element_id": "201308d749f47555d03c5087f304457b", - "text": "Navajo Bila\u02bcashda\u02bcii t\u02bc\u00e1\u00e1 a\u0142tsoh yin\u00edk\u02bcehgo bidizhch\u012fh d\u00f3\u00f3 ahee\u0142t\u02bceego \u00edl\u012f\u0301\u012f\u0301go bee baah\u00f3ch\u012f\u02bc. E\u00ed\u00ed h\u00e1n\u00ed\u02bc d\u00f3\u00f3 h\u00e1n\u00edtshakees hwiihdaasya\u02bc e\u00ed\u00ed binahj\u012f\u0301\u02bc ahidin\u00ed\u0142n\u00e1hgo \u00e1l\u00edleek\u02bcehgo k\u02bc\u00e9 bee ahi\u0142 niidl\u012f\u0301.", + "text": "Navajo Bilaʼashdaʼii tʼáá ałtsoh yiníkʼehgo bidizhchįh dóó aheełtʼeego ílį́į́go bee baahóchįʼ. Eíí háníʼ dóó hánítshakees hwiihdaasyaʼ eíí binahjį́ʼ ahidiníłnáhgo álíleekʼehgo kʼé bee ahił niidlį́.", "metadata": { "languages": [ "som", @@ -6831,7 +6831,7 @@ { "type": "NarrativeText", "element_id": "a0cad811bb49185b6fdb66fb2060c59a", - "text": "Nenets \u0415\u0442 \u0445\u0438\u0431\u044f\u0440\u0438 \u043d\u0435\u043d\u044d\u0446\u044c \u0441\u043e\u044f\u043c\u0430\u0440\u0438\u0430\u043d\u0442\u0430 \u0445\u0443\u0440\u043a\u0430\u0440\u0438 \u043f\u0440\u0430\u0432\u0430\u0434\u0430 \u0442\u043d\u044f\u0432\u0430, \u04c8\u043e\u0431\u043e\u0439 \u043d\u0435\u043d\u044d\u0446\u044f \u043d\u0438\u0434\u0443 \u043d\u0438\u0441\u044c \u0442\u043e\u043a\u0430\u043b\u0431\u0430, \u04c8\u044b\u0431\u0442\u0430\u043c\u0431\u0430 \u0438\u043b\u0435\u0432\u0430\u0442\u0443 \u0442\u0430\u0440\u0430.", + "text": "Nenets Ет хибяри ненэць соямарианта хуркари правада тнява, ӈобой ненэця ниду нись токалба, ӈыбтамба илевату тара.", "metadata": { "languages": [ "rus", @@ -6853,7 +6853,7 @@ { "type": "UncategorizedText", "element_id": "80851f8727cbd5baeb6611ada10ff1f9", - "text": "Nepali \u0938\u092c\u0948 \u0935\u094d\u092f\u0915\u094d\u0924\u093f \u0939\u0930\u0942 \u091c\u0928\u094d\u092e\u091c\u093e\u0924 
\u0938\u094d\u0935\u0924\u0928\u094d\u0924\u094d\u0930 \u0939\u0941\u0928 \u0924\u0940 \u0938\u092c\u0948\u0915\u094b \u0938\u092e\u093e\u0928 \u0905\u0927\u093f\u0915\u093e\u0930 \u0930 \u092e\u0939\u0924\u094d\u0935 \u091b\u0964 \u0928\u093f\u091c\u0939\u0930\u0942\u092e\u093e \u0935\u093f\u091a\u093e\u0930 \u0936\u0915\u094d\u0924\u093f \u0930 \u0938\u0926\u094d\u0927\u093f\u091a\u093e\u0930 \u092d\u090f\u0915\u094b\u0932\u0947 \u0928\u093f\u091c\u0939\u0930\u0942\u0932\u0947 \u0906\u092a\u0938\u092e\u093e \u092d\u093e\u0924\u0943\u0924\u094d\u0935\u0915\u094b \u092d\u093e\u0935\u0928\u093e \u092c\u093e\u091f \u0935\u094d\u092f\u0935\u0939\u093e\u0930 \u0917\u0930\u094d\u0928\u0941 \u092a\u0930\u094d\u091b\u0964", + "text": "Nepali सबै व्यक्ति हरू जन्मजात स्वतन्त्र हुन ती सबैको समान अधिकार र महत्व छ। निजहरूमा विचार शक्ति र सद्धिचार भएकोले निजहरूले आपसमा भातृत्वको भावना बाट व्यवहार गर्नु पर्छ।", "metadata": { "languages": [ "nep" @@ -6874,7 +6874,7 @@ { "type": "NarrativeText", "element_id": "23ce504c8239c6964f02399ff1fcb1bf", - "text": "Nganasan \u0411\u04d9\u043d\u0434\u0435\u201d \u04c8\u0430\u043d\u0430\u0441\u0430\u043d\u04d9\u201d \u04c8\u04d9\u0442\u0443\u043a\u04d9\u043d\u0434\u044b\u201d \u043d\u0435\u043d\u0434\u044f\u201d\u0442\u0443\u043e\u201d \u04c8\u043e\u043d\u04d9 \u0445\u043e\u043d\u0441\u044b \u0445\u0435\u043b\u0438\u0434\u0435\u201d \u04c8\u0438\u043b\u0435 \u043c\u04d9\u043d\u04d9\u0439 (\u043f\u0440\u0430\u0432\u0430\u0439). \u0421\u044b\u0442\u044b\u04c8 \u0445\u043e\u043d\u0434\u044b\u201d \u04c8\u0438\u043b\u0435 \u04c8\u043e\u043d\u0434\u0430 \u04c8\u043e\u043d\u04d9 \u0441\u044f\u0440\u0443, \u0434\u04af\u0437\u044b\u0442\u04d9\u043d\u0434\u044b\u04c8 \u0438\u0445\u04af\u0442\u04af\u04c8 \u043d\u044f\u0433\u04d9\u04d9\u201d \u0441\u04af\u04e9\u0430\u0440\u0443\u0441\u04d9\u201d.", + "text": "Nganasan Бәнде” ӈанасанә” ӈәтукәнды” нендя”туо” ӈонә хонсы хелиде” ӈиле мәнәй (правай). 
Сытыӈ хонды” ӈиле ӈонда ӈонә сяру, дүзытәндыӈ ихүтүӈ нягәә” сүөарусә”.", "metadata": { "languages": [ "rus" @@ -6917,7 +6917,7 @@ { "type": "NarrativeText", "element_id": "9164d07351a9366edfae5357e2ab807c", - "text": "Nomatsiguenga Antagaisati matsiguenga ibogaigu\u00eb matsiguengasonorl. Aisati icantaigaca. Teni iromerataiguengani. Antagaisati iquengaigui aisati ig\u00f3iguiro ora caninaro aisati ig\u00f3iguiro ora te onganinate. Iroro caninataque omagaro matsiguenga iraniacaninataigueri ira basiniati matsiguenga aisati ingantaiguer\u00ed ora caninaro.", + "text": "Nomatsiguenga Antagaisati matsiguenga ibogaiguë matsiguengasonorl. Aisati icantaigaca. Teni iromerataiguengani. Antagaisati iquengaigui aisati igóiguiro ora caninaro aisati igóiguiro ora te onganinate. Iroro caninataque omagaro matsiguenga iraniacaninataigueri ira basiniati matsiguenga aisati ingantaiguerí ora caninaro.", "metadata": { "languages": [ "tgl", @@ -6939,7 +6939,7 @@ { "type": "NarrativeText", "element_id": "a2d52f93737464a25abcd5d12c771b98", - "text": "Norwegian, Bokm\u00e5l Alle mennesker er f\u00f8dt frie og med samme menneskeverd og menneskerettigheter. De er utstyrt med fornuft og samvittighet og b\u00f8r handle mot hverandre i brorskapets \u00e5nd.", + "text": "Norwegian, Bokmål Alle mennesker er født frie og med samme menneskeverd og menneskerettigheter. De er utstyrt med fornuft og samvittighet og bør handle mot hverandre i brorskapets ånd.", "metadata": { "languages": [ "nor" @@ -6960,7 +6960,7 @@ { "type": "NarrativeText", "element_id": "0de9dab37169c4ded9b7f75bedf80c7f", - "text": "Norwegian, Nynorsk Alle menneske er f\u00f8dde til fridom og med same menneskeverd og menneskerettar. Dei har f\u00e5tt fornuft og samvit og skal leve med kvarandre som br\u00f8r.", + "text": "Norwegian, Nynorsk Alle menneske er fødde til fridom og med same menneskeverd og menneskerettar. 
Dei har fått fornuft og samvit og skal leve med kvarandre som brør.", "metadata": { "languages": [ "nor" @@ -7002,7 +7002,7 @@ { "type": "Title", "element_id": "dcfcf466590e9daa75e86df759c90a23", - "text": "\ua2bf\ua0b7\ua0c5\ua13f\ua428\ua425\uff0c\ua305\ua14d\ua002\ua3fd\ua42f\ua488\ua0c5\ua425\ua310\u3002\ua2bf\ua287\ua26a\ua346\ua30b\ua180\ua068\ua24c\ua44c\ua425\uff0c\ua137\ua00b\ua068\ua09b\ua2a8\ua16b\ua0c0\ua0c5\ua425\ua121\ua45f\u3002", + "text": "ꊿꂷꃅꄿꐨꐥ,ꌅꅍꀂꏽꐯꒈꃅꐥꌐ。ꊿꊇꉪꍆꌋꆀꁨꉌꑌꐥ,ꄷꀋꁨꂛꊨꅫꃀꃅꐥꄡꑟ。", "metadata": { "languages": [ "zho" @@ -7023,7 +7023,7 @@ { "type": "NarrativeText", "element_id": "68861af146d56db218a932271da013ea", - "text": "Nyamwezi Banhu bose bubyalagwa biyagalulile, n\u2019ikujo haki zilenganelile.", + "text": "Nyamwezi Banhu bose bubyalagwa biyagalulile, n’ikujo haki zilenganelile.", "metadata": { "languages": [ "swa" @@ -7129,7 +7129,7 @@ { "type": "NarrativeText", "element_id": "8bb5a449ca76c9652411df83a16d36a5", - "text": "Nzema Menli muala di b\u025b ti anwo na eza noko b\u025bs\u025b w\u0254 dibil\u025b nee adenlenyianl\u025b nu. B\u025bl\u025b ndwenlenwo nee adwenle, yem\u0254ti \u0254w\u0254 k\u025b b\u025bkile adiemay\u025bl\u025b b\u025bmaa b\u025b nwo ngoko.", + "text": "Nzema Menli muala di bɛ ti anwo na eza noko bɛsɛ wɔ dibilɛ nee adenlenyianlɛ nu. Bɛlɛ ndwenlenwo nee adwenle, yemɔti ɔwɔ kɛ bɛkile adiemayɛlɛ bɛmaa bɛ nwo ngoko.", "metadata": { "languages": [ "tur", @@ -7153,7 +7153,7 @@ { "type": "NarrativeText", "element_id": "945f5e12a8c939707776f2152604ea76", - "text": "Occitan T\u00f3uti lis uman naisson libre. Soun egau p\u00e8rla digneta e li dre. An t\u00f3uti uno resoun e uno counsci\u00e8nci. Se d\u00e8von tenifreirenau lis un 'm\u00e9 lis autre.", + "text": "Occitan Tóuti lis uman naisson libre. Soun egau pèrla digneta e li dre. An tóuti uno resoun e uno counsciènci. 
Se dèvon tenifreirenau lis un 'mé lis autre.", "metadata": { "languages": [ "fra", @@ -7175,7 +7175,7 @@ { "type": "NarrativeText", "element_id": "de85ed5a407a19c2c1c89211693d8861", - "text": "Occitan (Auvergnat) Ta la proussouna neisson lieura mo\u00e9 parira p\u00e0 d\u00efness\u00e0 mai dret. Son charjada de razou mo\u00e9 de cousiens\u00e0 mai lhu fau arj\u00ee entreme\u00ee lha bei n'eime de freiress\u00e0.", + "text": "Occitan (Auvergnat) Ta la proussouna neisson lieura moé parira pà dïnessà mai dret. Son charjada de razou moé de cousiensà mai lhu fau arjî entremeî lha bei n'eime de freiressà.", "metadata": { "languages": [ "fra" @@ -7196,7 +7196,7 @@ { "type": "NarrativeText", "element_id": "6260219bc4a42037e7d6f0418b7284c5", - "text": "Occitan (Francoproven\u00e7al, Fribourg) Tot\u00e8 l\u00e8 dzin vinyon ou mondo libro \u00e8 par\u00ea in dinyit\u00e2 \u00e8 in dr\u00ea. Chon dot\u00e2 d\u00e8 r\u00e9jon \u00e8 d\u00e8 konhyinthe \u00e8 d\u00eavon ch\u00e8 konport\u00e2 l\u00e8 j\u2019on-l\u00e8 j\u2019\u00f4tro din on \u00e8chpri d\u00e8 frat\u00e8rnit\u00e2.", + "text": "Occitan (Francoprovençal, Fribourg) Totè lè dzin vinyon ou mondo libro è parê in dinyitâ è in drê. Chon dotâ dè réjon è dè konhyinthe è dêvon chè konportâ lè j’on-lè j’ôtro din on èchpri dè fratèrnitâ.", "metadata": { "languages": [ "ita" @@ -7217,7 +7217,7 @@ { "type": "NarrativeText", "element_id": "b47382b7a0e0afd209aa7e1993565391", - "text": "Occitan (Francoproven\u00e7al, Savoie) Tu luz \u00f2m\u00f2 vinyon u mondo, libr\u00f2, tu t\u00f2ton p\u00e8 le\u00fb dinyit\u00f2 \u00e8 le\u00fb dr\u00e8ye. Y\u2019on tu d\u2019\u00e9m\u00f2 \u00e8 d\u00e8 konhyinhi \u00e8 i d\u00e8von f\u00e8- mouh\u00f2 d\u00e8 frat\u00e8rnit\u00f2 aou\u00e8y luz \u00f2tri.", + "text": "Occitan (Francoprovençal, Savoie) Tu luz òmò vinyon u mondo, librò, tu tòton pè leû dinyitò è leû drèye. 
Y’on tu d’émò è dè konhyinhi è i dèvon fè- mouhò dè fratèrnitò aouèy luz òtri.", "metadata": { "languages": [ "ita", @@ -7239,7 +7239,7 @@ { "type": "NarrativeText", "element_id": "da6df9434bcea33fdb84c07309f23605", - "text": "Occitan (Francoproven\u00e7al, Valais) Tui l\u00e8 j\u00eatre humain n\u00e9chon libro \u00e8 pary in degnet\u00e2 \u00e9 in drou\u00ea. Chon reijon\u00e2bl\u00f3 \u00e8 d\u00e8 counchieince \u00e8 deivouon \u00e2zic l\u00e8 j\u2019oun vi j\u2019avi di j\u2019\u00e2tr\u00f3 in p\u00e8r oun espri d\u00e8 frat\u00e8rnit\u00e2", + "text": "Occitan (Francoprovençal, Valais) Tui lè jêtre humain néchon libro è pary in degnetâ é in drouê. Chon reijonâbló è dè counchieince è deivouon âzic lè j’oun vi j’avi di j’âtró in pèr oun espri dè fratèrnitâ", "metadata": { "languages": [ "fra", @@ -7261,7 +7261,7 @@ { "type": "NarrativeText", "element_id": "4be88083cf737cac6ec1b39afb2513c5", - "text": "Occitan (Francoproven\u00e7al, Vaud) T\u00ee l\u00e8 z\u2019\u00eetre humain v\u00eegnant \u00e2o mondo libro et par\u00e2i dein la dignit\u00e2 et l\u00e8 dr\u00e2i. L\u2019ant re\u00e7u r\u00e9son et concheince et d\u00e2ivant vivre l\u00e8 z\u2019on avou\u00e9 l\u00e8 z\u2019autro quemet se sant fr\u00e2re et ch\u00e8ra.", + "text": "Occitan (Francoprovençal, Vaud) Tî lè z’ître humain vîgnant âo mondo libro et parâi dein la dignitâ et lè drâi. L’ant reçu réson et concheince et dâivant vivre lè z’on avoué lè z’autro quemet se sant frâre et chèra.", "metadata": { "languages": [ "fra" @@ -7282,7 +7282,7 @@ { "type": "NarrativeText", "element_id": "ca97829bba2e332be352861c0d0e0c70", - "text": "Occitan (Languedocien) Totes los \u00e8ssers umans naisson liures e egals en dignitat e en dreches. Son dotats de rason e de consci\u00e9ncia e se devon comportar los unes amb los autres dins un esperit de fraternitat.", + "text": "Occitan (Languedocien) Totes los èssers umans naisson liures e egals en dignitat e en dreches. 
Son dotats de rason e de consciéncia e se devon comportar los unes amb los autres dins un esperit de fraternitat.", "metadata": { "languages": [ "cat", @@ -7305,7 +7305,7 @@ { "type": "NarrativeText", "element_id": "2c541386adb644071a67fa19c80d221f", - "text": "Ojibwa, Northwestern \u146d\u1472\u14c7\u140c\u14c0\u14d0 \u1472\u1431\u14aa\u144e\u14ef\u1417\u1466 \u14c2\u1455\u140e\u146d\u1417\u1483 \u144e\u142f\u14c2\u14a5\u144e\u14f1\u140e\u14c2\u1483 \u14a5\u14c7 \u1455\u1431\u1455 \u146d\u148b\u1403\u14c0\u1455\u146f\u14ef\u140e\u14d0 \u1472\u1526 \u144c\u1438\u146b\u1455\u146f\u14ef\u140e\u14d0. \u1405\u1455\u1526\u14c7\u1417 \u14a5\u1472\u140e\u140e\u14d0 \u1472\u1526 \u14c2\u1444\u1472\u140e\u14d0 \u14a5\u14c7\u1417 \u1455\u1525 \u148b\u1403\u1511\u1472\u14c7\u1417\u1438\u144e\u1417\u1438\u14d0 \u140a\u1490\u146f \u14a5\u14c4\u140e\u148b\u140e\u144e\u140e\u14c2\u1483.", + "text": "Ojibwa, Northwestern ᑭᑲᓇᐌᓀᓐ ᑲᐱᒪᑎᓯᐗᑦ ᓂᑕᐎᑭᐗᒃ ᑎᐯᓂᒥᑎᓱᐎᓂᒃ ᒥᓇ ᑕᐱᑕ ᑭᒋᐃᓀᑕᑯᓯᐎᓐ ᑲᔦ ᑌᐸᑫᑕᑯᓯᐎᓐ. ᐅᑕᔦᓇᐗ ᒥᑲᐎᐎᓐ ᑲᔦ ᓂᑄᑲᐎᓐ ᒥᓇᐗ ᑕᔥ ᒋᐃᔑᑲᓇᐗᐸᑎᐗᐸᓐ ᐊᒐᑯ ᒥᓄᐎᒋᐎᑎᐎᓂᒃ.", "metadata": { "filetype": "text/plain", "data_source": { @@ -7347,7 +7347,7 @@ { "type": "NarrativeText", "element_id": "838854e8c37bc2424bd4b8b4324da0a4", - "text": "Orok \u0427\u0438\u043f\u0430\u0304\u043b\u0438 \u0433\u0443\u0440\u0443\u043d\u043d\u0435\u0304 \u0431\u0430\u043b\u04e1\u0438\u0447\u0438 \u0433\u044d\u0432\u0443\u043c\u044d, \u043e\u043c\u043e\u0442\u0442\u043e \u043c\u044d\u0304\u043d\u044d \u043c\u04e9\u0440\u04e9\u043d\u04e1\u0438, \u043c\u044d\u0304\u043d\u044d \u0434\u043e\u0440\u043e\u043d\u04e1\u0438. \u041d\u043e\u0304\u0447\u0438 \u0438\u0434\u044d\u043b\u0443, \u0438\u0440\u043a\u0430\u043b\u0443, \u043c\u044d\u0304\u043d\u044d \u043c\u044d\u0304\u043d\u04e1\u0438 \u043d\u0430\u0304\u0434\u0430\u043a\u0442\u0430\u04c8\u0430\u0447\u0438 \u0431\u0458\u04e3\u0447\u0438.", + "text": "Orok Чипа̄ли гурунне̄ балӡичи гэвумэ, омотто мэ̄нэ мөрөнӡи, мэ̄нэ доронӡи. 
Но̄чи идэлу, иркалу, мэ̄нэ мэ̄нӡи на̄дактаӈачи бјӣчи.", "metadata": { "languages": [ "rus" @@ -7410,7 +7410,7 @@ { "type": "NarrativeText", "element_id": "61b9c386f4d7f982e217e8a0973deae9", - "text": "Osetin \u0410\u0434\u04d5\u0439\u043c\u04d5\u0433\u0442\u04d5 \u0441\u0435 '\u043f\u043f\u04d5\u0442 \u0434\u04d5\u0440 \u0440\u0430\u0439\u0433\u0443\u044b\u0440\u044b\u043d\u0446 \u0441\u04d5\u0440\u0438\u0431\u0430\u0440\u04d5\u0439 \u04d5\u043c\u04d5 \u04d5\u043c\u0445\u0443\u044b\u0437\u043e\u043d\u04d5\u0439 \u0441\u04d5 \u0431\u0430\u0440\u0442\u044b. \u0423\u044b\u0434\u043e\u043d \u04d5\u0445\u0445\u04d5\u0441\u0442 \u0441\u0442\u044b \u0437\u043e\u043d\u0434 \u04d5\u043c\u04d5 \u043d\u0430\u043c\u044b\u0441\u04d5\u0439, \u04d5\u043c\u04d5 \u043a\u04d5\u0440\u04d5\u0434\u0437\u0438\u0439\u04d5\u043d \u0445\u044a\u0443\u0430\u043c\u04d5 \u0443\u043e\u0439 \u04d5\u0444\u0441\u044b\u043c\u04d5\u0440\u0442\u044b \u0445\u0443\u044b\u0437\u04d5\u043d.", + "text": "Osetin Адӕймӕгтӕ се 'ппӕт дӕр райгуырынц сӕрибарӕй ӕмӕ ӕмхуызонӕй сӕ барты. Уыдон ӕххӕст сты зонд ӕмӕ намысӕй, ӕмӕ кӕрӕдзийӕн хъуамӕ уой ӕфсымӕрты хуызӕн.", "metadata": { "languages": [ "rus" @@ -7431,7 +7431,7 @@ { "type": "NarrativeText", "element_id": "f829c47775b5845587447d35b6b41e40", - "text": "Otomi, Mezquital Gotho nu kja'ni i mu\u0331i ra zoo i gotho ro kuchti, i tu'ni nu ro \u00f1a pad\u00e4 bini i da budi, da mu\u0331i ra zoo koyu gotho yu kja'ni i yo kuadi.", + "text": "Otomi, Mezquital Gotho nu kja'ni i mu̱i ra zoo i gotho ro kuchti, i tu'ni nu ro ña padä bini i da budi, da mu̱i ra zoo koyu gotho yu kja'ni i yo kuadi.", "metadata": { "languages": [ "hrv", @@ -7478,7 +7478,7 @@ { "type": "NarrativeText", "element_id": "dd2ab495e062b9a11fe24355a3c1319e", - "text": "P\u00e1ez Ya'nwe'wewa'te' maa nasapa ha'dacehk hi'pku up'hi', w\u00ebtte u'huwa'hi'pta', eena' eena' f'i'zewa' hi'pta', \u00fcus hi'pta' d'ik'the hi'pta' naapa'kate. 
Sa' h'ukaysa \u00fcus hi'pcehktha'w sa' pyakhna'we f'i'ze hi'ptha'w.", + "text": "Páez Ya'nwe'wewa'te' maa nasapa ha'dacehk hi'pku up'hi', wëtte u'huwa'hi'pta', eena' eena' f'i'zewa' hi'pta', üus hi'pta' d'ik'the hi'pta' naapa'kate. Sa' h'ukaysa üus hi'pcehktha'w sa' pyakhna'we f'i'ze hi'ptha'w.", "metadata": { "languages": [ "swa" @@ -7544,7 +7544,7 @@ { "type": "UncategorizedText", "element_id": "068d755c0e132506c2d31786a7ed4b32", - "text": "Panjabi, Eastern \u0a38\u0a3e\u0a30\u0a3e \u0a2e\u0a28\u0a41\u0a71\u0a16\u0a40 \u0a2a\u0a30\u0a3f\u0a35\u0a3e\u0a30 \u0a06\u0a2a\u0a23\u0a40 \u0a2e\u0a39\u0a3f\u0a2e\u0a3e, \u0a36\u0a3e\u0a28 \u0a05\u0a24\u0a47 \u0a39\u0a71\u0a15\u0a3e\u0a02 \u0a26\u0a47 \u0a2a\u0a71\u0a16\u0a4b\u0a02 \u0a1c\u0a28\u0a2e \u0a24\u0a4b\u0a02 \u0a39\u0a40 \u0a06\u0a5b\u0a3e\u0a26 \u0a39\u0a48 \u0a05\u0a24\u0a47 \u0a38\u0a41\u0a24\u0a47 \u0a38\u0a3f\u0a71\u0a27 \u0a38\u0a3e\u0a30\u0a47 \u0a32\u0a4b\u0a15 \u0a2c\u0a30\u0a3e\u0a2c\u0a30 \u0a39\u0a28 \u0964 \u0a09\u0a28\u0a4d\u0a39\u0a3e\u0a02 \u0a38\u0a2d\u0a28\u0a3e \u0a28\u0a42\u0a70 \u0a24\u0a30\u0a15 \u0a05\u0a24\u0a47 \u0a5b\u0a2e\u0a40\u0a30 \u0a26\u0a40 \u0a38\u0a4c\u0a17\u0a3e\u0a24 \u0a2e\u0a3f\u0a32\u0a40 \u0a39\u0a4b\u0a08 \u0a39\u0a48 \u0a05\u0a24\u0a47 \u0a09\u0a28\u0a4d\u0a39\u0a3e\u0a02 \u0a28\u0a42\u0a70 \u0a2d\u0a30\u0a3e\u0a24\u0a30\u0a40\u0a2d\u0a3e\u0a35 \u0a26\u0a40 \u0a2d\u0a3e\u0a35\u0a28\u0a3e \u0a30\u0a16\u0a26\u0a3f\u0a06\u0a02 \u0a06\u0a2a\u0a38 \u0a35\u0a3f\u0a1a \u0a35\u0a3f\u0a1a\u0a30\u0a23\u0a3e \u0a1a\u0a3e\u0a39\u0a40\u0a26\u0a3e \u0a39\u0a48 \u0964", + "text": "Panjabi, Eastern ਸਾਰਾ ਮਨੁੱਖੀ ਪਰਿਵਾਰ ਆਪਣੀ ਮਹਿਮਾ, ਸ਼ਾਨ ਅਤੇ ਹੱਕਾਂ ਦੇ ਪੱਖੋਂ ਜਨਮ ਤੋਂ ਹੀ ਆਜ਼ਾਦ ਹੈ ਅਤੇ ਸੁਤੇ ਸਿੱਧ ਸਾਰੇ ਲੋਕ ਬਰਾਬਰ ਹਨ । ਉਨ੍ਹਾਂ ਸਭਨਾ ਨੂੰ ਤਰਕ ਅਤੇ ਜ਼ਮੀਰ ਦੀ ਸੌਗਾਤ ਮਿਲੀ ਹੋਈ ਹੈ ਅਤੇ ਉਨ੍ਹਾਂ ਨੂੰ ਭਰਾਤਰੀਭਾਵ ਦੀ ਭਾਵਨਾ ਰਖਦਿਆਂ ਆਪਸ ਵਿਚ ਵਿਚਰਣਾ ਚਾਹੀਦਾ ਹੈ ।", "metadata": { "languages": [ "pan" @@ -7565,7 +7565,7 @@ { "type": "UncategorizedText", "element_id": "e81229801afdd767a6ca59c9877783bc", - "text": "Panjabi, 
Western \u0633\u0627\u0631\u06d2 \u0627\u0646\u0633\u0627\u0646 \u0622\u0632\u0627\u062f \u062a\u06d2 \u062d\u0642\u0648\u0642 \u062a\u06d2 \u0639\u0632\u062a \u062f\u06d2 \u0644\u062d\u0627\u0638 \u0646\u0627\u0644 \u0628\u0631\u0627\u0628\u0631 \u067e\u06cc\u062f\u0627 \u06c1\u0648\u0646\u062f\u06d2 \u0646\u06cc\u06ba \u06d4 \u06d4 \u0627\u0648\u06c1 \u0639\u0642\u0644 \u0633\u0645\u062c\u06be \u062a\u06d2 \u0686\u0646\u06af\u06d2 \u0645\u0646\u062f\u06d2 \u062f\u06cc \u067e\u0686\u06be\u0627\u0646 \u062a\u06d2 \u0627\u062d\u0633\u0627\u0633 \u0631\u06a9\u06be\u062f\u06d2 \u0646\u06d2 \u0627\u06cc\u0633 \u0648\u0627\u0633\u0637\u06d2 \u0627\u0648\u06c1\u0646\u0627\u06ba \u0646\u0648\u06ba \u0627\u06a9 \u062f\u0648\u062c\u06d2 \u0646\u0627\u0644 \u0628\u06be\u0627\u0626\u06cc \u0686\u0627\u0631\u06d2 \u0648\u0627\u0644\u0627 \u0633\u0644\u0648\u06a9 \u06a9\u0631\u0646\u0627 \u0686\u0627\u06c1\u06cc \u062f\u0627 \u0627\u06d2 \u06d4 \u06d4", + "text": "Panjabi, Western سارے انسان آزاد تے حقوق تے عزت دے لحاظ نال برابر پیدا ہوندے نیں ۔ ۔ اوہ عقل سمجھ تے چنگے مندے دی پچھان تے احساس رکھدے نے ایس واسطے اوہناں نوں اک دوجے نال بھائی چارے والا سلوک کرنا چاہی دا اے ۔ ۔", "metadata": { "languages": [ "urd" @@ -7607,7 +7607,7 @@ { "type": "UncategorizedText", "element_id": "a2c1dda9330915ecdfba4af7c21da5c0", - "text": "Pashto, Northern \u062f \u0628\u0634\u0631 \u067c\u0648\u0644 \u0627\u0641\u0631\u0627\u062f \u0627\u0632\u0627\u062f \u0646\u0693\u06cd \u062a\u0647 \u0631\u0627\u0681\u064a \u0627\u0648 \u062f \u062d\u064a\u062b\u064a\u062a \u0627\u0648 \u062f \u062d\u0642\u0648\u0642\u0648 \u0644\u0647 \u067e\u0644\u0648\u0647 \u0633\u0631\u0647 \u0628\u0631\u0627\u0628\u0631 \u062f\u064a\u06d4 \u067c\u0648\u0644 \u062f \u0639\u0642\u0644 \u0627\u0648 \u0648\u062c\u062f\u0627\u0646 \u062e\u0627\u0648\u0646\u062f\u0627\u0646 \u062f\u064a \u0627\u0648 \u0628\u0627\u064a\u062f \u064a\u0648 \u0644\u0647 \u0628\u0644 \u0633\u0631\u0647 \u062f \u0648\u0631\u0648\u0631\u06cd 
\u067e\u0647 \u0631\u0648\u062d\u064a\u0647 \u0633\u0631\u0647 \u0686\u0644\u0646\u0646\u062f \u06a9\u0693\u064a\u06d4", + "text": "Pashto, Northern د بشر ټول افراد ازاد نړۍ ته راځي او د حيثيت او د حقوقو له پلوه سره برابر دي۔ ټول د عقل او وجدان خاوندان دي او بايد يو له بل سره د ورورۍ په روحيه سره چلنند کړي۔", "metadata": { "languages": [ "fas" @@ -7628,7 +7628,7 @@ { "type": "NarrativeText", "element_id": "7e9ad6a402b6252e85be01ffafa1eb5e", - "text": "Picard Tos l\u00e8s-omes vin\u00e8t \u00e5 monde l\u00eebes \u00e8t \u00e9g\u00e5ls po \u00e7ou qu'\u00e8st d' le\u00fb dignit\u00e9 \u00e8t d' le\u00fbs dre\u00fbts. Le\u00fb re\u030azon \u00e8t le\u00fb consyince elz\u00ee fe\u030at on d'vw\u00e9r di s'kid\u00fbre inte di z\u00e8le come d\u00e8s fr\u00e8s", + "text": "Picard Tos lès-omes vinèt å monde lîbes èt égåls po çou qu'èst d' leû dignité èt d' leûs dreûts. Leû re̊zon èt leû consyince elzî fe̊t on d'vwér di s'kidûre inte di zèle come dès frès", "metadata": { "languages": [ "fra" @@ -7757,7 +7757,7 @@ { "type": "NarrativeText", "element_id": "cad1fbc2c59a2ab610912476278d0204", - "text": "Polish Wszyscy ludzie rodz\u0105 si\u0119 wolni i r\u00f3wni pod wzgl\u0119dem swej godno\u015bci i swych praw. S\u0105 oni obdarzeni rozumem i sumieniem i powinni post\u0119powa\u0107 wobec innych w duchu braterstwa.", + "text": "Polish Wszyscy ludzie rodzą się wolni i równi pod względem swej godności i swych praw. Są oni obdarzeni rozumem i sumieniem i powinni postępować wobec innych w duchu braterstwa.", "metadata": { "languages": [ "pol" @@ -7778,7 +7778,7 @@ { "type": "NarrativeText", "element_id": "07022bc1c3bb5010208399375dc1b813", - "text": "Portuguese (Brazil) Todos os seres humanos nascem livres e iguais em dignidade e direitos. S\u00e3o dotados de raz\u00e3o e consci\u00eancia e devem agir em rela\u00e7\u00e3o uns aos outros com esp\u00edrito de fraternidade.", + "text": "Portuguese (Brazil) Todos os seres humanos nascem livres e iguais em dignidade e direitos. 
São dotados de razão e consciência e devem agir em relação uns aos outros com espírito de fraternidade.", "metadata": { "languages": [ "por" @@ -7799,7 +7799,7 @@ { "type": "NarrativeText", "element_id": "7925a3ec12f3766bebb236e3ec5bdc60", - "text": "Portuguese (Portugal) Todos os seres humanos nascem livres e iguais em dignidade e em direitos. Dotados de raz\u00e3o e de consci\u00eancia, devem agir uns para com os outros em esp\u00edrito de fraternidade.", + "text": "Portuguese (Portugal) Todos os seres humanos nascem livres e iguais em dignidade e em direitos. Dotados de razão e de consciência, devem agir uns para com os outros em espírito de fraternidade.", "metadata": { "languages": [ "por" @@ -7841,7 +7841,7 @@ { "type": "NarrativeText", "element_id": "dc4348bae7eccbd8e30af1763958fee9", - "text": "Pular (Adlam) \ud83a\udd0b\ud83a\udd32\ud83a\udd46\ud83a\udd22\ud83a\udd25\ud83a\udd22 \ud83a\udd22\ud83a\udd44\ud83a\udd23\ud83a\udd2b\ud83a\udd45\ud83a\udd36\ud83a\udd2d \ud83a\udd2c\ud83a\udd2e\ud83a\udd2c \ud83a\udd28\ud83a\udd2e\ud83a\udd3c\ud83a\udd2d\u060c \ud83a\udd32'\ud83a\udd23\ud83a\udd2d\ud83a\udd25\ud83a\udd2f\ud83a\udd2d\ud83a\udd23\ud83a\udd2d \ud83a\udd2b \ud83a\udd36\ud83a\udd2d\ud83a\udd26\ud83a\udd2d\ud83a\udd32\ud83a\udd22\ud83a\udd32\ud83a\udd46\ud83a\udd23\ud83a\udd2b \ud83a\udd3c\ud83a\udd2e \ud83a\udd26\ud83a\udd22\ud83a\udd32\ud83a\udd46\ud83a\udd3a\ud83a\udd2b \ud83a\udd38\ud83a\udd22\ud83a\udd33\ud83a\udd46\ud83a\udd2b\ud83a\udd45\ud83a\udd36\ud83a\udd2d. 
\ud83a\udd09\ud83a\udd29\ud83a\udd2b \ud83a\udd32'\ud83a\udd3a\ud83a\udd2e\ud83a\udd45\ud83a\udd23\ud83a\udd2d \ud83a\udd25\ud83a\udd2d\ud83a\udd45\ud83a\udd36\ud83a\udd2e \ud83a\udd2b \ud83a\udd38\ud83a\udd22\ud83a\udd33\ud83a\udd46\ud83a\udd2d\ud83a\udd24\ud83a\udd22\ud83a\udd32\ud83a\udd3c\ud83a\udd22\ud83a\udd44\ud83a\udd3a\ud83a\udd22\ud83a\udd24 \ud83a\udd2b\ud83a\udd3c\ud83a\udd2b \ud83a\udd2b\ud83a\udd29\ud83a\udd2b \ud83a\udd28\ud83a\udd2e\ud83a\udd3c\ud83a\udd2d \ud83a\udd38\ud83a\udd35\ud83a\udd45\ud83a\udd2c\ud83a\udd2e \ud83a\udd32'\ud83a\udd23\ud83a\udd2d\ud83a\udd2a\ud83a\udd23\ud83a\udd2b \ud83a\udd2b \ud83a\udd32'\ud83a\udd23\ud83a\udd2b\ud83a\udd2a \ud83a\udd29 \ud83a\udd2d\ud83a\udd34\ud83a\udd32\ud83a\udd3a\ud83a\udd35\ud83a\udd34\ud83a\udd35\ud83a\udd25\ud83a\udd46\ud83a\udd22\ud83a\udd44\ud83a\udd3a\ud83a\udd35.", + "text": "Pular (Adlam) 𞤋𞤲𞥆𞤢𞤥𞤢 𞤢𞥄𞤣𞤫𞥅𞤶𞤭 𞤬𞤮𞤬 𞤨𞤮𞤼𞤭، 𞤲'𞤣𞤭𞤥𞤯𞤭𞤣𞤭 𞤫 𞤶𞤭𞤦𞤭𞤲𞤢𞤲𞥆𞤣𞤫 𞤼𞤮 𞤦𞤢𞤲𞥆𞤺𞤫 𞤸𞤢𞤳𞥆𞤫𞥅𞤶𞤭. 𞤉𞤩𞤫 𞤲'𞤺𞤮𞥅𞤣𞤭 𞤥𞤭𞥅𞤶𞤮 𞤫 𞤸𞤢𞤳𞥆𞤭𞤤𞤢𞤲𞤼𞤢𞥄𞤺𞤢𞤤 𞤫𞤼𞤫 𞤫𞤩𞤫 𞤨𞤮𞤼𞤭 𞤸𞤵𞥅𞤬𞤮 𞤲'𞤣𞤭𞤪𞤣𞤫 𞤫 𞤲'𞤣𞤫𞤪 𞤩 𞤭𞤴𞤲𞤺𞤵𞤴𞤵𞤥𞥆𞤢𞥄𞤺𞤵.", "metadata": { "languages": [ "ara" @@ -7862,7 +7862,7 @@ { "type": "NarrativeText", "element_id": "9c7d0e713be2017eba040780765856df", - "text": "Purepecha Iamendu k'uiripuecha janguarhiparini ka majku jarhati ka jurhimbekuecha jingoni kueraa\u014basondikso ka, juajtakuarhis\u00efndiks\u00ef ambakiti eratsekua ka kaxumbikua, jatsistiks\u00ef eskaks\u00ef sesi arhijperaaka.", + "text": "Purepecha Iamendu k'uiripuecha janguarhiparini ka majku jarhati ka jurhimbekuecha jingoni kueraaŋasondikso ka, juajtakuarhisïndiksï ambakiti eratsekua ka kaxumbikua, jatsistiksï eskaksï sesi arhijperaaka.", "metadata": { "languages": [ "est", @@ -7909,7 +7909,7 @@ { "type": "NarrativeText", "element_id": "e7cb3a61bb828a46ce008b4251df5ef3", - "text": "Quechua, Ambo-Pasco Lapan runa kay pachach'u yurin libri kawananpaq, lapanchinuy iwal respetasha kananpaqmi, mana pipis jarup\u00e4nanpaq, lapanpis iwal yarpach'akuy yach'aqmi, alita mana alita tantiyar kawananpaq. 
Chaynuy runa masinwan juknin jukninwan kuyanakur kap\u00e4kuchun", + "text": "Quechua, Ambo-Pasco Lapan runa kay pachach'u yurin libri kawananpaq, lapanchinuy iwal respetasha kananpaqmi, mana pipis jarupänanpaq, lapanpis iwal yarpach'akuy yach'aqmi, alita mana alita tantiyar kawananpaq. Chaynuy runa masinwan juknin jukninwan kuyanakur kapäkuchun", "metadata": { "languages": [ "tgl", @@ -7931,7 +7931,7 @@ { "type": "NarrativeText", "element_id": "7af8d8dd7e7418eed6057bb221448506", - "text": "Quechua, Arequipa-La Uni\u00f3n Kanmi derechonchiskuna llapanchispa, nacesqanchismanta. Kantaqmi llapanchispa runa kayninchis. Manan runa kanchu manay derechoyoq. Huk runaq derecho hukpawan kaqllan kan. Kanmi derechonchis llapanchispa allin kawsay libre tiyananchispaq. Llapan runaqpan kan yuyayninchis yachanapaq. Llapanchis kasun llapa runa masinchiskunawan munanakunapaq, huk ayllu hina.", + "text": "Quechua, Arequipa-La Unión Kanmi derechonchiskuna llapanchispa, nacesqanchismanta. Kantaqmi llapanchispa runa kayninchis. Manan runa kanchu manay derechoyoq. Huk runaq derecho hukpawan kaqllan kan. Kanmi derechonchis llapanchispa allin kawsay libre tiyananchispaq. Llapan runaqpan kan yuyayninchis yachanapaq. Llapanchis kasun llapa runa masinchiskunawan munanakunapaq, huk ayllu hina.", "metadata": { "languages": [ "tgl", @@ -8020,7 +8020,7 @@ { "type": "NarrativeText", "element_id": "7838a28da590ff7bb2ea5c7a48ba93fc", - "text": "Quechua, Huamal\u00edes-Dos de Mayo Hu\u00e1nuco Lapan runakunapis yurikuyan librimi y wakinkaqkunanaw rispitashqa, mana jarukushqa kay\u00e4nanpaq. Saynawmi runakunaqa yuriyan shumaq yarpayyuq, alitapis mana alitapis reqiykar y seqay kuyap\u00e4kuyyuq. Saymi runakuna ali kawakuy\u00e4nan jukninwan jukninwanpis.", + "text": "Quechua, Huamalíes-Dos de Mayo Huánuco Lapan runakunapis yurikuyan librimi y wakinkaqkunanaw rispitashqa, mana jarukushqa kayänanpaq. Saynawmi runakunaqa yuriyan shumaq yarpayyuq, alitapis mana alitapis reqiykar y seqay kuyapäkuyyuq. 
Saymi runakuna ali kawakuyänan jukninwan jukninwanpis.", "metadata": { "languages": [ "swa", @@ -8043,7 +8043,7 @@ { "type": "NarrativeText", "element_id": "08720fc9c770f44e38435bc27b49867d", - "text": "Quechua, Huaylas Ancash Meyqan nunapis manam pipa sirweqnin nuna kananpaqtsu yurikushqa. I nuna karninmi meyqan nunapis juk l\u00e1yatsu kayanman der\u00ebchunkunachowpis. I yarpachakiyta yacharninmi i allita mana allita shonqonkunachow m\u00e1kurninmi nunakuna jukninta wiyanakur kayanman.", + "text": "Quechua, Huaylas Ancash Meyqan nunapis manam pipa sirweqnin nuna kananpaqtsu yurikushqa. I nuna karninmi meyqan nunapis juk láyatsu kayanman derëchunkunachowpis. I yarpachakiyta yacharninmi i allita mana allita shonqonkunachow mákurninmi nunakuna jukninta wiyanakur kayanman.", "metadata": { "languages": [ "ind", @@ -8065,7 +8065,7 @@ { "type": "NarrativeText", "element_id": "34a8df5528e399552e033b89176957b0", - "text": "Quechua, Margos-Yarowilca-Lauricocha Lapantsikunapis Iibrimi yurishqantsi. B\u00e4lintsimi y der\u00ebchuntsikunapis wakinkaqkunanoqlapami. Yarpaynintsikunapis kaykanmi runa mayintsikunawan juk wawqinoq kuyanakur kawap\u00e4kunantsipaq.", + "text": "Quechua, Margos-Yarowilca-Lauricocha Lapantsikunapis Iibrimi yurishqantsi. Bälintsimi y derëchuntsikunapis wakinkaqkunanoqlapami. Yarpaynintsikunapis kaykanmi runa mayintsikunawan juk wawqinoq kuyanakur kawapäkunantsipaq.", "metadata": { "languages": [ "ind", @@ -8112,7 +8112,7 @@ { "type": "NarrativeText", "element_id": "ecc5d074ce9be67e187d19b4aabf87c5", - "text": "Quechua, North Jun\u00edn Lapan runas kay pachachru nasimun juk rantisha runanuy mana pitas sirbinanpaqmi, alipa rikasha kananpaqmi, washasha kananpaqmi. Lapan runakunas nasipaakamun yarpayniyoqmi naatan tantiyayniyoqmi ima lutanta rurapaakurursi tantiyakunanpaq. 
Lapan runakunas kawapaakunaman juk wawqenuylam.", + "text": "Quechua, North Junín Lapan runas kay pachachru nasimun juk rantisha runanuy mana pitas sirbinanpaqmi, alipa rikasha kananpaqmi, washasha kananpaqmi. Lapan runakunas nasipaakamun yarpayniyoqmi naatan tantiyayniyoqmi ima lutanta rurapaakurursi tantiyakunanpaq. Lapan runakunas kawapaakunaman juk wawqenuylam.", "metadata": { "languages": [ "tgl", @@ -8159,7 +8159,7 @@ { "type": "UncategorizedText", "element_id": "654791ed821f84e420d3742634a53e7c", - "text": "Quechua (Unified Quichua, old Hispanic orthography) Tucuy runacuna quishpirihu\u00e1n hui\u00f1\u00e1n, pactacunahuampes, pay pura, umahu\u00e1n, ayahu\u00e1n chay shucuna shina, chaymantami shuclla shina causangacuna.", + "text": "Quechua (Unified Quichua, old Hispanic orthography) Tucuy runacuna quishpirihuán huiñán, pactacunahuampes, pay pura, umahuán, ayahuán chay shucuna shina, chaymantami shuclla shina causangacuna.", "metadata": { "languages": [ "spa", @@ -8204,7 +8204,7 @@ { "type": "NarrativeText", "element_id": "0f3dc8a63ddcf8d858d8e543a4eb8428", - "text": "Rarotongan Kua anau rangatira ia te tangata katoatoa ma te aiteite i te au tikaanga e te tu ngateitei tiratiratu.\u00a0 Kua ki ia ratou e te mero kimi ravenga e te akavangakau e kia akono tetai i tetai, i roto i te vaerua piri anga taeake.", + "text": "Rarotongan Kua anau rangatira ia te tangata katoatoa ma te aiteite i te au tikaanga e te tu ngateitei tiratiratu.  Kua ki ia ratou e te mero kimi ravenga e te akavangakau e kia akono tetai i tetai, i roto i te vaerua piri anga taeake.", "metadata": { "languages": [ "ind", @@ -8226,7 +8226,7 @@ { "type": "NarrativeText", "element_id": "f0f216272ee0f7e11e21eb4ca1752777", - "text": "Romagnolo Tot j ess\u00e8ri um\u00e8n i n\u00e0s l\u00e9bri e cumpagn in dignit\u00e0 e dir\u00e9t. 
Lou i \u00e8 dutid ad rasoun e ad cuscinza e i \u00e0 da oper\u00e8, ognun ti cunfrunt at ch'j ilt, sa sentimint ad fratel\u00e8nza.", + "text": "Romagnolo Tot j essèri umèn i nàs lébri e cumpagn in dignità e dirét. Lou i è dutid ad rasoun e ad cuscinza e i à da operè, ognun ti cunfrunt at ch'j ilt, sa sentimint ad fratelènza.", "metadata": { "languages": [ "ita", @@ -8248,7 +8248,7 @@ { "type": "NarrativeText", "element_id": "a84b6ff398b4f815054e7b47107ce163", - "text": "Romani, Balkan Savorre manu\u015ba biand\u00f5n meste thaj barabar k-o demnipen aj k-e hakaja. Si len godi aj somzanipen thaj si len te tr\u0105den pen jekh karing o aver and-o vogi e phralimnasqoro.", + "text": "Romani, Balkan Savorre manuśa biandõn meste thaj barabar k-o demnipen aj k-e hakaja. Si len godi aj somzanipen thaj si len te trąden pen jekh karing o aver and-o vogi e phralimnasqoro.", "metadata": { "languages": [ "slv", @@ -8270,7 +8270,7 @@ { "type": "NarrativeText", "element_id": "dd72113ef6db4b69482adf28078a6090", - "text": "Romani, Balkan (1) Sa e manu\u0161ikane strukture bijand\u017eona tromane thaj jekhutne ko digniteti thaj \u010dapipa. Von si baxtarde em barvale gndaja thaj god\u017eaja thaj trubun jekh avereja te kherjakeren ko vod\u017ei pralipaja.", + "text": "Romani, Balkan (1) Sa e manušikane strukture bijandžona tromane thaj jekhutne ko digniteti thaj čapipa. Von si baxtarde em barvale gndaja thaj godžaja thaj trubun jekh avereja te kherjakeren ko vodži pralipaja.", "metadata": { "languages": [ "slv" @@ -8291,7 +8291,7 @@ { "type": "NarrativeText", "element_id": "d1d78e5ce9c3fe2071093b3f74f8f9b8", - "text": "Romanian (1953) Toate fiin\u021bele umane se nasc libere \u0219i egale \u00een demnitate \u0219i \u00een drepturi. 
Ele s\u00eent \u00eenzestrate cu ra\u021biune \u0219i con\u0219tiin\u021b\u0103 \u0219i trebuie s\u0103 se comporte unele fa\u021b\u0103 de altele \u00een spiritul fraternit\u0103\u021bii.", + "text": "Romanian (1953) Toate ființele umane se nasc libere și egale în demnitate și în drepturi. Ele sînt înzestrate cu rațiune și conștiință și trebuie să se comporte unele față de altele în spiritul fraternității.", "metadata": { "languages": [ "ron" @@ -8312,7 +8312,7 @@ { "type": "NarrativeText", "element_id": "ffd7f486f85cc12fffdee64c8dc1c47c", - "text": "Romanian (1993) Toate fiin\u021bele umane se nasc libere \u0219i egale \u00een demnitate \u0219i \u00een drepturi. Ele sunt \u00eenzestrate cu ra\u021biune \u0219i con\u0219tiin\u021b\u0103 \u0219i trebuie s\u0103 se comporte unele fa\u021b\u0103 de altele \u00een spiritul fraternit\u0103\u021bii.", + "text": "Romanian (1993) Toate ființele umane se nasc libere și egale în demnitate și în drepturi. Ele sunt înzestrate cu rațiune și conștiință și trebuie să se comporte unele față de altele în spiritul fraternității.", "metadata": { "languages": [ "ron" @@ -8333,7 +8333,7 @@ { "type": "NarrativeText", "element_id": "81db31b50da57a040bad82d9af2297df", - "text": "Romanian (2006) Toate fiin\u021bele umane se nasc libere \u0219i egale \u00een demnitate \u0219i \u00een drepturi. Ele sunt \u00eenzestrate cu ra\u021biune \u0219i con\u0219tiin\u021b\u0103 \u0219i trebuie s\u0103 se comporte unele fa\u021b\u0103 de altele \u00een spiritul fraternit\u0103\u021bii.", + "text": "Romanian (2006) Toate ființele umane se nasc libere și egale în demnitate și în drepturi. Ele sunt înzestrate cu rațiune și conștiință și trebuie să se comporte unele față de altele în spiritul fraternității.", "metadata": { "languages": [ "ron" @@ -8354,7 +8354,7 @@ { "type": "NarrativeText", "element_id": "cadc80db78bd586f5f18217272cfdb17", - "text": "Romansch Tuots umans naschan libers ed eguals in dignit\u00e0 e drets. 
Els sun dotats cun intellet e conscienza e dessan agir tanter per in uin spiert da fraternit\u00e0.", + "text": "Romansch Tuots umans naschan libers ed eguals in dignità e drets. Els sun dotats cun intellet e conscienza e dessan agir tanter per in uin spiert da fraternità.", "metadata": { "languages": [ "cat", @@ -8376,7 +8376,7 @@ { "type": "NarrativeText", "element_id": "4295c14118d555a1bd3be37701a4578e", - "text": "Romansch (Grischun) Tut ils umans naschan libers ed eguals en dignitad ed en dretgs. Els \u00e8n dotads cun raschun e conscienza e duain agir in vers l\u2019auter en spiert da fraternitad.", + "text": "Romansch (Grischun) Tut ils umans naschan libers ed eguals en dignitad ed en dretgs. Els èn dotads cun raschun e conscienza e duain agir in vers l’auter en spiert da fraternitad.", "metadata": { "languages": [ "deu", @@ -8398,7 +8398,7 @@ { "type": "NarrativeText", "element_id": "d7c3646cc8bf5af91fa007bcdc86ad53", - "text": "Romansch (Puter) Tuot ils umauns naschan libers ed eguels in dignited ed in drets. Els sun dotos cun radschun e conscienza e dessan agir \u00fcn invers l\u2019oter in spiert da fraternited.", + "text": "Romansch (Puter) Tuot ils umauns naschan libers ed eguels in dignited ed in drets. Els sun dotos cun radschun e conscienza e dessan agir ün invers l’oter in spiert da fraternited.", "metadata": { "languages": [ "deu", @@ -8421,7 +8421,7 @@ { "type": "NarrativeText", "element_id": "a0daace15fe9f49d73fcdd9e3b86f001", - "text": "Romansch (Surmiran) Tot igls carstgangs neschan libers ed eguals an dignitad ed an dretgs. Els \u00e8n dotos cun raschung e schientscha e duessan ager l\u2019egn vers l\u2019oter an spiert da fraternitad.", + "text": "Romansch (Surmiran) Tot igls carstgangs neschan libers ed eguals an dignitad ed an dretgs. 
Els èn dotos cun raschung e schientscha e duessan ager l’egn vers l’oter an spiert da fraternitad.", "metadata": { "languages": [ "cat", @@ -8444,7 +8444,7 @@ { "type": "NarrativeText", "element_id": "57126ecde8022743581d3932507d8b63", - "text": "Romansch (Sursilvan) Tut ils humans neschan libers ed eguals en dignitad ed en dretgs. Els ein dotai cun raschun e cunscienzia e duein agir in viers l\u2019auter en sp\u00e9rt da fraternitad.", + "text": "Romansch (Sursilvan) Tut ils humans neschan libers ed eguals en dignitad ed en dretgs. Els ein dotai cun raschun e cunscienzia e duein agir in viers l’auter en spért da fraternitad.", "metadata": { "languages": [ "deu", @@ -8469,7 +8469,7 @@ { "type": "NarrativeText", "element_id": "82fb166f28096b77e6b865ce44135e16", - "text": "Romansch (Sutsilvan) Tut igls humans neschan libers ad eguals an dignitad ad an dretgs. Els en dotos cun rasch\u00f9n a cunzienzia a den agir egn anviers l\u2019oter an spiert da fraternitad.", + "text": "Romansch (Sutsilvan) Tut igls humans neschan libers ad eguals an dignitad ad an dretgs. Els en dotos cun raschùn a cunzienzia a den agir egn anviers l’oter an spiert da fraternitad.", "metadata": { "languages": [ "cat", @@ -8491,7 +8491,7 @@ { "type": "NarrativeText", "element_id": "53246b60d8dbe52f7f323cfe27507738", - "text": "Romansch (Vallader) Tuot ils umans naschan libers ed eguals in dignit\u00e0 ed in drets. Els sun dotats cun radschun e conscienza e dessan agir \u00fcn invers l\u2019oter in \u00fcn spiert da fraternit\u00e0.", + "text": "Romansch (Vallader) Tuot ils umans naschan libers ed eguals in dignità ed in drets. 
Els sun dotats cun radschun e conscienza e dessan agir ün invers l’oter in ün spiert da fraternità.", "metadata": { "languages": [ "cat", @@ -8534,7 +8534,7 @@ { "type": "NarrativeText", "element_id": "7b1fe5da3cfa2322dd960a870a966d3a", - "text": "Russian \u0412\u0441\u0435 \u043b\u044e\u0434\u0438 \u0440\u043e\u0436\u0434\u0430\u044e\u0442\u0441\u044f \u0441\u0432\u043e\u0431\u043e\u0434\u043d\u044b\u043c\u0438 \u0438 \u0440\u0430\u0432\u043d\u044b\u043c\u0438 \u0432 \u0441\u0432\u043e\u0435\u043c \u0434\u043e\u0441\u0442\u043e\u0438\u043d\u0441\u0442\u0432\u0435 \u0438 \u043f\u0440\u0430\u0432\u0430\u0445. \u041e\u043d\u0438 \u043d\u0430\u0434\u0435\u043b\u0435\u043d\u044b \u0440\u0430\u0437\u0443\u043c\u043e\u043c \u0438 \u0441\u043e\u0432\u0435\u0441\u0442\u044c\u044e \u0438 \u0434\u043e\u043b\u0436\u043d\u044b \u043f\u043e\u0441\u0442\u0443\u043f\u0430\u0442\u044c \u0432 \u043e\u0442\u043d\u043e\u0448\u0435\u043d\u0438\u0438 \u0434\u0440\u0443\u0433 \u0434\u0440\u0443\u0433\u0430 \u0432 \u0434\u0443\u0445\u0435 \u0431\u0440\u0430\u0442\u0441\u0442\u0432\u0430.", + "text": "Russian Все люди рождаются свободными и равными в своем достоинстве и правах. Они наделены разумом и совестью и должны поступать в отношении друг друга в духе братства.", "metadata": { "languages": [ "rus" @@ -8576,7 +8576,7 @@ { "type": "NarrativeText", "element_id": "48332b010fe58bc794e833308da30575", - "text": "Saami, North Buot olbmot leat rieg\u00e1dan friddjan ja olmmo\u0161\u00e1rvvu ja olmmo\u0161vuoigatvuo\u0111aid d\u00e1fus. Sii leat jierbmala\u0161 olbmot geain lea oamedovdu ja sii g\u00e1lgga\u0161e leat dego vielja\u010dagat.", + "text": "Saami, North Buot olbmot leat riegádan friddjan ja olmmošárvvu ja olmmošvuoigatvuođaid dáfus. 
Sii leat jierbmalaš olbmot geain lea oamedovdu ja sii gálggaše leat dego vieljačagat.", "metadata": { "languages": [ "est", @@ -8598,7 +8598,7 @@ { "type": "UncategorizedText", "element_id": "373656c2cab80370dd2768316c8a725e", - "text": "Salar Heme kishler h\u00fcr der, haysiyet ma haklarde adil der, mantik ma vicdan var, kardeshlikden davraneshge.", + "text": "Salar Heme kishler hür der, haysiyet ma haklarde adil der, mantik ma vicdan var, kardeshlikden davraneshge.", "metadata": { "languages": [ "tur" @@ -8642,7 +8642,7 @@ { "type": "NarrativeText", "element_id": "ddfa143fc42a89f1e4f7b99ce0028962", - "text": "Sango Ad\u00fc \u00e2zo k\u00fb\u00ea yamba, ng\u00e2 \u00e2la l\u00eengbi ter\u00ea na l\u00eag\u00eb t\u00ee n\u00ebng\u00f6-ter\u00ea na t\u00ee \u00e2ngang\u00fc. Ala k\u00fb\u00ea awara ndar\u00e4 na b\u00f6r\u00f6-li s\u00ef \u00e2la l\u00eengbi t\u00ee dut\u00ef na \u00e2mb\u00e2 t\u00ee \u00e2la g\u00ef na l\u00eang\u00f6 s\u00f6ng\u00f6.", + "text": "Sango Adü âzo kûê yamba, ngâ âla lîngbi terê na lêgë tî nëngö-terê na tî ângangü. 
Ala kûê awara ndarä na börö-li sï âla lîngbi tî dutï na âmbâ tî âla gï na lêngö söngö.", "metadata": { "languages": [ "tgl", @@ -8664,7 +8664,7 @@ { "type": "UncategorizedText", "element_id": "ba8456690a521bd0fb0bb757c188f302", - "text": "Sanskrit \u0938\u0930\u094d\u0935\u0947 \u092e\u093e\u0928\u0935\u093e\u0903 \u0938\u094d\u0935\u0924\u0928\u094d\u0924\u094d\u0930\u093e\u0903 \u0938\u092e\u0941\u0924\u094d\u092a\u0928\u094d\u0928\u093e\u0903 \u0935\u0930\u094d\u0924\u0928\u094d\u0924\u0947 \u0905\u092a\u093f \u091a, \u0917\u094c\u0930\u0935\u0926\u0943\u0936\u093e \u0905\u0927\u093f\u0915\u093e\u0930\u0926\u0943\u0936\u093e \u091a \u0938\u092e\u093e\u0928\u093e\u0903 \u090f\u0935 \u0935\u0930\u094d\u0924\u0928\u094d\u0924\u0947\u0964 \u090f\u0924\u0947 \u0938\u0930\u094d\u0935\u0947 \u091a\u0947\u0924\u0928\u093e-\u0924\u0930\u094d\u0915-\u0936\u0915\u094d\u0924\u093f\u092d\u094d\u092f\u093e\u0902 \u0938\u0941\u0938\u092e\u094d\u092a\u0928\u094d\u0928\u093e\u0903 \u0938\u0928\u094d\u0924\u093f\u0964 \u0905\u092a\u093f \u091a, \u0938\u0930\u094d\u0935\u0947\u093d\u092a\u093f \u092c\u0928\u094d\u0927\u0941\u0924\u094d\u0935-\u092d\u093e\u0935\u0928\u092f\u093e \u092a\u0930\u0938\u094d\u092a\u0930\u0902 \u0935\u094d\u092f\u0935\u0939\u0930\u0928\u094d\u0924\u0941\u0964", + "text": "Sanskrit सर्वे मानवाः स्वतन्त्राः समुत्पन्नाः वर्तन्ते अपि च, गौरवदृशा अधिकारदृशा च समानाः एव वर्तन्ते। एते सर्वे चेतना-तर्क-शक्तिभ्यां सुसम्पन्नाः सन्ति। अपि च, सर्वेऽपि बन्धुत्व-भावनया परस्परं व्यवहरन्तु।", "metadata": { "languages": [ "hin" @@ -8685,7 +8685,7 @@ { "type": "NarrativeText", "element_id": "7013f596e8a99afdd7965ac753815ad9", - "text": "Sanskrit (Grantha) \ud804\udf38\ud804\udf30\ud804\udf4d\ud804\udf35\ud804\udf47 \ud804\udf2e\ud804\udf3e\ud804\udf28\ud804\udf35\ud804\udf3e\ud804\udf03 \ud804\udf38\ud804\udf4d\ud804\udf35\ud804\udf24\ud804\udf28\ud804\udf4d\ud804\udf24\ud804\udf4d\ud804\udf30\ud804\udf3e\ud804\udf03 
\ud804\udf38\ud804\udf2e\ud804\udf41\ud804\udf24\ud804\udf4d\ud804\udf2a\ud804\udf28\ud804\udf4d\ud804\udf28\ud804\udf3e\ud804\udf03 \ud804\udf35\ud804\udf30\ud804\udf4d\ud804\udf24\ud804\udf28\ud804\udf4d\ud804\udf24\ud804\udf47 \ud804\udf05\ud804\udf2a\ud804\udf3f \ud804\udf1a, \ud804\udf17\ud804\udf4c\ud804\udf30\ud804\udf35\ud804\udf26\ud804\udf43\ud804\udf36\ud804\udf3e \ud804\udf05\ud804\udf27\ud804\udf3f\ud804\udf15\ud804\udf3e\ud804\udf30\ud804\udf26\ud804\udf43\ud804\udf36\ud804\udf3e \ud804\udf1a \ud804\udf38\ud804\udf2e\ud804\udf3e\ud804\udf28\ud804\udf3e\ud804\udf03 \ud804\udf0f\ud804\udf35 \ud804\udf35\ud804\udf30\ud804\udf4d\ud804\udf24\ud804\udf28\ud804\udf4d\ud804\udf24\ud804\udf47\u0964 \ud804\udf0f\ud804\udf24\ud804\udf47 \ud804\udf38\ud804\udf30\ud804\udf4d\ud804\udf35\ud804\udf47 \ud804\udf1a\ud804\udf47\ud804\udf24\ud804\udf28\ud804\udf3e-\ud804\udf24\ud804\udf30\ud804\udf4d\ud804\udf15-\ud804\udf36\ud804\udf15\ud804\udf4d\ud804\udf24\ud804\udf3f\ud804\udf2d\ud804\udf4d\ud804\udf2f\ud804\udf3e\ud804\udf02 \ud804\udf38\ud804\udf41\ud804\udf38\ud804\udf2e\ud804\udf4d\ud804\udf2a\ud804\udf28\ud804\udf4d\ud804\udf28\ud804\udf3e\ud804\udf03 \ud804\udf38\ud804\udf28\ud804\udf4d\ud804\udf24\ud804\udf3f\u0964 \ud804\udf05\ud804\udf2a\ud804\udf3f \ud804\udf1a, \ud804\udf38\ud804\udf30\ud804\udf4d\ud804\udf35\ud804\udf47\ud804\udf3d\ud804\udf2a\ud804\udf3f \ud804\udf2c\ud804\udf28\ud804\udf4d\ud804\udf27\ud804\udf41\ud804\udf24\ud804\udf4d\ud804\udf35-\ud804\udf2d\ud804\udf3e\ud804\udf35\ud804\udf28\ud804\udf2f\ud804\udf3e \ud804\udf2a\ud804\udf30\ud804\udf38\ud804\udf4d\ud804\udf2a\ud804\udf30\ud804\udf02 \ud804\udf35\ud804\udf4d\ud804\udf2f\ud804\udf35\ud804\udf39\ud804\udf30\ud804\udf28\ud804\udf4d\ud804\udf24\ud804\udf41\u0964", + "text": "Sanskrit (Grantha) 𑌸𑌰𑍍𑌵𑍇 𑌮𑌾𑌨𑌵𑌾𑌃 𑌸𑍍𑌵𑌤𑌨𑍍𑌤𑍍𑌰𑌾𑌃 𑌸𑌮𑍁𑌤𑍍𑌪𑌨𑍍𑌨𑌾𑌃 𑌵𑌰𑍍𑌤𑌨𑍍𑌤𑍇 𑌅𑌪𑌿 𑌚, 𑌗𑍌𑌰𑌵𑌦𑍃𑌶𑌾 𑌅𑌧𑌿𑌕𑌾𑌰𑌦𑍃𑌶𑌾 𑌚 𑌸𑌮𑌾𑌨𑌾𑌃 𑌏𑌵 𑌵𑌰𑍍𑌤𑌨𑍍𑌤𑍇। 𑌏𑌤𑍇 𑌸𑌰𑍍𑌵𑍇 𑌚𑍇𑌤𑌨𑌾-𑌤𑌰𑍍𑌕-𑌶𑌕𑍍𑌤𑌿𑌭𑍍𑌯𑌾𑌂 𑌸𑍁𑌸𑌮𑍍𑌪𑌨𑍍𑌨𑌾𑌃 𑌸𑌨𑍍𑌤𑌿। 𑌅𑌪𑌿 𑌚, 𑌸𑌰𑍍𑌵𑍇𑌽𑌪𑌿 𑌬𑌨𑍍𑌧𑍁𑌤𑍍𑌵-𑌭𑌾𑌵𑌨𑌯𑌾 𑌪𑌰𑌸𑍍𑌪𑌰𑌂 
𑌵𑍍𑌯𑌵𑌹𑌰𑌨𑍍𑌤𑍁।", "metadata": { "languages": [ "nep" @@ -8706,7 +8706,7 @@ { "type": "NarrativeText", "element_id": "d9dd825f97644f9be308505d418e9ea9", - "text": "S\u00e3otomense Tudu ngu\u00ea di mundu ca nanc\u00ea livli e igual ni dignidade e ni dir\u00eatu. Punda nen ca pens\u00e1 e nen t\u00ea cunxensa, sel\u00e1 nen f\u00e9 tudu cu\u00e1 cu ten\u00e7\u00f3n de lum\u00f3n.", + "text": "Sãotomense Tudu nguê di mundu ca nancê livli e igual ni dignidade e ni dirêtu. Punda nen ca pensá e nen tê cunxensa, selá nen fé tudu cuá cu tençón de lumón.", "metadata": { "languages": [ "por", @@ -8728,7 +8728,7 @@ { "type": "NarrativeText", "element_id": "ea94e46fedb24cbbc337bb5d30608ead", - "text": "Sardinian, Logudorese Totu sos \u00e8sseres umanos naschint l\u00ecberos e eguales in dinnidade e in deretos. Issos tenent sa resone e sa cuss\u00e8ntzia e depent operare s'unu cun s'\u00e0teru cun isp\u00ecritu de fraternidade.", + "text": "Sardinian, Logudorese Totu sos èsseres umanos naschint lìberos e eguales in dinnidade e in deretos. Issos tenent sa resone e sa cussèntzia e depent operare s'unu cun s'àteru cun ispìritu de fraternidade.", "metadata": { "languages": [ "cat", @@ -8750,7 +8750,7 @@ { "type": "NarrativeText", "element_id": "135f949e79e915feb11563f40072624d", - "text": "Saxon, Low All de Minschen s\u00fcnd frie un gliek an W\u00fc\u00fcrd un Rechten baren. Se hebbt Vernunft un een Geweten un se sch\u00fcllt sik Br\u00f6der sien.", + "text": "Saxon, Low All de Minschen sünd frie un gliek an Wüürd un Rechten baren. 
Se hebbt Vernunft un een Geweten un se schüllt sik Bröder sien.", "metadata": { "languages": [ "deu" @@ -8792,7 +8792,7 @@ { "type": "NarrativeText", "element_id": "49685f2659217462214b13c3594d1423", - "text": "Secoya Si'apai aide'oy\u00eb kua'ye peoye kui'ne siay\u00eb'k\u00eb maka pa'iye kui'ne tutupaye koni, jaje kuasase's\u00ebtepi kuaju'i'ne peoye \u00f1ese saiye pa'iji ko\u0331kaij\u00eb yek\u00eb paireje.", + "text": "Secoya Si'apai aide'oyë kua'ye peoye kui'ne siayë'kë maka pa'iye kui'ne tutupaye koni, jaje kuasase'sëtepi kuaju'i'ne peoye ñese saiye pa'iji ko̱kaijë yekë paireje.", "metadata": { "languages": [ "sqi", @@ -8814,7 +8814,7 @@ { "type": "UncategorizedText", "element_id": "e0ca8f739a2a274e0e30bcd509b308e2", - "text": "Seraiki \u0633\u0627\u0631\u06d2 \u0627\u0646\u0633\u0627\u0646 \u0627\u0632\u0627\u062f\u0627 \u062a\u06d2 \u062d\u0642\u0648\u0642 \u062a\u06d2 \u0639\u0632\u062a \u062f\u06d2 \u0627\u0639\u062a\u0628\u0627\u0631 \u0646\u0627\u0644 \u06c1\u06a9\u0648 \u0684\u0626\u06d2 \u067e\u06cc\u062f\u0627 \u062a\u06be\u06cc\u0646\u062f\u0646 \u06d4 \u0642\u062f\u0631\u062a \u0648\u0644\u0648\u06ba \u0627\u0646\u06c1\u0627\u06ba \u06a9\u0648\u06ba \u0639\u0642\u0644 \u062a\u06d2 \u0633\u0645\u062c\u06be \u0639\u0637\u0627 \u062a\u06be\u06cc\u0646\u062f\u06cc \u0627\u06d2 \u06d4 \u06c1\u06cc\u06ba \u06a9\u06cc\u062a\u06d2 \u06c1\u06a9 \u068b\u0648\u062c\u06be\u06d2 \u0646\u0627\u0644 \u0628\u06be\u0631\u067e\u06cc \u062f\u0627\u0633\u0644\u0648\u06a9 \u06a9\u0631\u06bb\u0627 \u0686\u0627\u06c1\u06cc \u062f\u0627 \u0627\u06d2 \u06d4", + "text": "Seraiki سارے انسان ازادا تے حقوق تے عزت دے اعتبار نال ہکو ڄئے پیدا تھیندن ۔ قدرت ولوں انہاں کوں عقل تے سمجھ عطا تھیندی اے ۔ ہیں کیتے ہک ڋوجھے نال بھرپی داسلوک کرڻا چاہی دا اے ۔", "metadata": { "languages": [ "urd" @@ -8835,7 +8835,7 @@ { "type": "NarrativeText", "element_id": "f855b701f2717951ee7041f505936e9e", - "text": "Serbian (Cyrillic) \u0421\u0432\u0430 \u0459\u0443\u0434\u0441\u043a\u0430 
\u0431\u0438\u045b\u0430 \u0440\u0430\u0452\u0430\u0458\u0443 \u0441\u0435 \u0441\u043b\u043e\u0431\u043e\u0434\u043d\u0430 \u0438 \u0458\u0435\u0434\u043d\u0430\u043a\u0430 \u0443 \u0434\u043e\u0441\u0442\u043e\u0458\u0430\u043d\u0441\u0442\u0432\u0443 \u0438 \u043f\u0440\u0430\u0432\u0438\u043c\u0430. \u041e\u043d\u0430 \u0441\u0443 \u043e\u0431\u0434\u0430\u0440\u0435\u043d\u0430 \u0440\u0430\u0437\u0443\u043c\u043e\u043c \u0438 \u0441\u0432\u0435\u0448\u045b\u0443 \u0438 \u0442\u0440\u0435\u0431\u0430 \u0458\u0435\u0434\u043d\u0438 \u043f\u0440\u0435\u043c\u0430 \u0434\u0440\u0443\u0433\u0438\u043c\u0430 \u0434\u0430 \u043f\u043e\u0441\u0442\u0443\u043f\u0430\u0458\u0443 \u0443 \u0434\u0443\u0445\u0443 \u0431\u0440\u0430\u0442\u0441\u0442\u0432\u0430.", + "text": "Serbian (Cyrillic) Сва људска бића рађају се слободна и једнака у достојанству и правима. Она су обдарена разумом и свешћу и треба једни према другима да поступају у духу братства.", "metadata": { "languages": [ "mkd" @@ -8856,7 +8856,7 @@ { "type": "NarrativeText", "element_id": "1e1d32ffc1c937e2dc9b3b4e6b8a1453", - "text": "Serbian (Latin) Sva ljudska bi\u0107a ra\u0111aju se slobodna i jednaka u dostojanstvu i pravima. Ona su obdarena razumom i sve\u0161\u0107u i treba jedni prema drugima da postupaju u duhu bratstva.", + "text": "Serbian (Latin) Sva ljudska bića rađaju se slobodna i jednaka u dostojanstvu i pravima. 
Ona su obdarena razumom i svešću i treba jedni prema drugima da postupaju u duhu bratstva.", "metadata": { "languages": [ "hrv" @@ -8877,7 +8877,7 @@ { "type": "NarrativeText", "element_id": "6a973a162a71cdf61973afc03d10bb08", - "text": "Serer-Sine Wiin we naa \u00f1oowaa na adna, den fop mbodu no ke war na oxnu refna na den a jega o ngalaat umpi yiif um, le mbarin o me\u01adtootaa baa mbaag o \u00f1oow den fop no fog.", + "text": "Serer-Sine Wiin we naa ñoowaa na adna, den fop mbodu no ke war na oxnu refna na den a jega o ngalaat umpi yiif um, le mbarin o meƭtootaa baa mbaag o ñoow den fop no fog.", "metadata": { "languages": [ "som", @@ -8900,7 +8900,7 @@ { "type": "NarrativeText", "element_id": "201296ccbaf34300a62d4a087915bf84", - "text": "Seselwa Creole French Nou tou imen nou\u2019n ne dan laliberte ek legalite, dan nou dignite ek nou bann drwa. Nou tou nou annan kapasite pou rezonnen, e fodre nou azir anver lezot avek en lespri fraternel.", + "text": "Seselwa Creole French Nou tou imen nou’n ne dan laliberte ek legalite, dan nou dignite ek nou bann drwa. 
Nou tou nou annan kapasite pou rezonnen, e fodre nou azir anver lezot avek en lespri fraternel.", "metadata": { "languages": [ "fra" @@ -8921,7 +8921,7 @@ { "type": "UncategorizedText", "element_id": "f602d39c8cf6ba79e59adce09af30f26", - "text": "Shan \u1075\u1030\u107c\u103a\u1038\u1075\u1030\u108a\u1075\u1031\u1083\u1089\u107c\u1086\u1089 \u1015\u1035\u107c\u103a\u1022\u107c\u103a\u1075\u102d\u1030\u1010\u103a\u1087\u1019\u1083\u1038\u101c\u1030\u107a\u103a\u1088\u1075\u102f\u1004\u103a\u1087\u1019\u102f\u107c\u103a\u1022\u107c\u103a\u101c\u103d\u1010\u103a\u1088\u101c\u1085\u101d\u103a\u1038\u107d\u1035\u1004\u103a\u1087\u1015\u1035\u1004\u103a\u1038\u1075\u107c\u103a \u101c\u1084\u1088 \u101e\u102f\u107c\u103a\u1087\u101c\u1086\u1088\u1022\u107c\u103a \u101c\u103d\u1010\u103a\u1088\u101c\u1085\u101d\u103a\u1038\u107d\u1035\u1004\u103a\u1087 \u1015\u1035\u1004\u103a\u1038\u1075\u107c\u103a\u104b \u1076\u101d\u103a\u107c\u1086\u1089 \u1019\u102e\u1038\u107a\u1062\u107c\u103a\u1087\u1022\u107c\u103a\u1019\u1031\u1083\u1011\u1010\u103a\u1038\u101e\u1062\u1004\u103a \u101c\u1084\u1088 \u1078\u1082\u103a\u1022\u107c\u103a\u1081\u1030\u1089\u1078\u1075\u103a\u1038\u107e\u102d\u1004\u103a\u1088\u1010\u102d\u102f\u101d\u103a\u1038\u1075\u1019\u103a \u107c\u107c\u103a\u1089\u101c\u1084\u1088 \u1011\u102f\u1075\u103a\u1087\u101d\u1086\u1089\u1078\u1082\u103a\u1015\u102e\u1088\u1022\u103d\u1075\u103a\u1087 \u107c\u103d\u1004\u103a\u1089\u1076\u1086\u1087\u1075\u107c\u103a\u101e\u1031 \u1010\u102d\u1010\u103a\u1038\u1010\u1031\u1083\u1087\u1075\u107c\u103a\u104b", + "text": "Shan ၵူၼ်းၵူႊၵေႃႉၼႆႉ ပဵၼ်ဢၼ်ၵိူတ်ႇမႃးလူၺ်ႈၵုင်ႇမုၼ်ဢၼ်လွတ်ႈလႅဝ်းၽဵင်ႇပဵင်းၵၼ် လႄႈ သုၼ်ႇလႆႈဢၼ် လွတ်ႈလႅဝ်းၽဵင်ႇ ပဵင်းၵၼ်။ ၶဝ်ၼႆႉ မီးၺၢၼ်ႇဢၼ်မေႃထတ်းသၢင် လႄႈ ၸႂ်ဢၼ်ႁူႉၸၵ်းၾိင်ႈတိုဝ်းၵမ် ၼၼ်ႉလႄႈ ထုၵ်ႇဝႆႉၸႂ်ပီႈဢွၵ်ႇ ၼွင်ႉၶႆႇၵၼ်သေ တိတ်းတေႃႇၵၼ်။", "metadata": { "filetype": "text/plain", "data_source": { @@ -8962,7 +8962,7 @@ { "type": "NarrativeText", "element_id": "20e37b3914fade183f3e76b200daccbd", - "text": 
"Shilluk Dhanh\u00f8 b\u00ebne ba anyw\u00f8l\u00f8 e path ki b\u00e4ng, ge p\u00e4r ki yij b\u00eb\u00ebd\u00f8 geki dy\u00ebr\u00f8. g\u00efn-a dwaddi kiper gen y\u00ef gen da rumi ki b\u00eb\u00ebd\u00f8 m\u00f8 g\u00f6\u00f6g gen ki py\u00ebw akyel ga nyim\u00ebgg.", + "text": "Shilluk Dhanhø bëne ba anywølø e path ki bäng, ge pär ki yij bëëdø geki dyërø. gïn-a dwaddi kiper gen yï gen da rumi ki bëëdø mø göög gen ki pyëw akyel ga nyimëgg.", "metadata": { "languages": [ "ind", @@ -8986,7 +8986,7 @@ { "type": "NarrativeText", "element_id": "9be888269d99ba5b9d4200b2a6d65346", - "text": "Shipibo-Conibo Jat\u00edbi joninra huetsa jonibaon yoiai ninc\u00e1resti iqui, jahueraquibi jaconmai iamaquin; jainoash jahuen queena jacon jahu\u00e9quibo ati jahuequescamabi iqui, tsonbira amayamatima iqui. Jaticashbira jascara aresti jacon shinanya iti jahuequescamabi iqui, jahuequescarainoash picota joni inonbi. Huestiora huestiorabora jahu\u00e9qui ati shinanya iqui; jainshon onanribique jahueratoqui jacon iqui jainoash jaconma iqui ishon. Ja copira huetsa jonibires inonbi non jato jaconharesti iqui, non huetsabi non acai quescaaquin.", + "text": "Shipibo-Conibo Jatíbi joninra huetsa jonibaon yoiai nincáresti iqui, jahueraquibi jaconmai iamaquin; jainoash jahuen queena jacon jahuéquibo ati jahuequescamabi iqui, tsonbira amayamatima iqui. Jaticashbira jascara aresti jacon shinanya iti jahuequescamabi iqui, jahuequescarainoash picota joni inonbi. Huestiora huestiorabora jahuéqui ati shinanya iqui; jainshon onanribique jahueratoqui jacon iqui jainoash jaconma iqui ishon. 
Ja copira huetsa jonibires inonbi non jato jaconharesti iqui, non huetsabi non acai quescaaquin.", "metadata": { "languages": [ "cat", @@ -9030,7 +9030,7 @@ { "type": "NarrativeText", "element_id": "98765accca3aa276e32acc6ddb665f01", - "text": "Shor \u041f\u0430\u0440\u0447\u044b\u043d \u043a\u0438\u0436\u0438, \u043f\u043e \u0447\u0430\u0440\u044b\u049b\u049b\u0430 \u0442\u0443\u0493\u0447\u0430\u0434\u044b\u043f, \u0442\u0435\u04a3, \u043f\u043e\u0448 \u0442\u0443\u0493\u0447\u0430. \u041a\u0438\u0436\u0438\u043b\u0435\u0440 \u0441\u0430\u0493\u044b\u0448\u0442\u044b\u0493, \u0430\u049b\u0442\u044b\u0493 \u0442\u0443\u0493\u0447\u0430\u043b\u0430\u0440, \u043a\u0438\u0436\u0438\u043b\u0435\u0440\u0433\u0435 \u043f\u0430\u0448\u049b\u0430 \u043a\u0438\u0436\u0438\u043b\u0435\u0440\u0431\u0435 \u0430\u0440\u0493\u044b\u0448\u0442\u0430\u043d\u044b\u0448\u0442\u0430\u0440\u0493\u0430 \u043a\u0435\u0440\u0435\u043a.", + "text": "Shor Парчын кижи, по чарыққа туғчадып, тең, пош туғча. Кижилер сағыштығ, ақтығ туғчалар, кижилерге пашқа кижилербе арғыштаныштарға керек.", "metadata": { "languages": [ "rus" @@ -9051,7 +9051,7 @@ { "type": "NarrativeText", "element_id": "06b44e2713d2ab9cbfdbffecc788465a", - "text": "Shuar Aents yaj\u00e1 nunkanam ak\u00ednia asamtaish, metekrak ainiaji. Tumasha ni chichamenka tuke amiktin a\u00edniawai. Ni iniakmamuri, n\u00ed chichamejaituke aniakmamsar chichakartin a\u00edniawai. Tuma asamtai aents mash nekawar, penker metekrak, nuamtak war\u00e1 warat shiir pujusarmi tusar a\u00e1rma awai.", + "text": "Shuar Aents yajá nunkanam akínia asamtaish, metekrak ainiaji. Tumasha ni chichamenka tuke amiktin aíniawai. Ni iniakmamuri, ní chichamejaituke aniakmamsar chichakartin aíniawai. 
Tuma asamtai aents mash nekawar, penker metekrak, nuamtak wará warat shiir pujusarmi tusar aárma awai.", "metadata": { "languages": [ "ind", @@ -9073,7 +9073,7 @@ { "type": "NarrativeText", "element_id": "8e0cb1b65226a998ba0e2831e44dbe49", - "text": "Sidamo Manchi beetti kalaqamunni wolaphinoho. Ayirrinyunninna qoossotennino taaloho. Huwatanno tiiano kalaqamunni ba\u2019raarinoha ikkasinni mittu wolu ledo rodiimmate ayyaaninni hee\u2019ra noosi.", + "text": "Sidamo Manchi beetti kalaqamunni wolaphinoho. Ayirrinyunninna qoossotennino taaloho. Huwatanno tiiano kalaqamunni ba’raarinoha ikkasinni mittu wolu ledo rodiimmate ayyaaninni hee’ra noosi.", "metadata": { "languages": [ "fin", @@ -9096,7 +9096,7 @@ { "type": "NarrativeText", "element_id": "1129172b2baa1c40a3ab800d0d28f02b", - "text": "Sinhala \u0dc3\u0dd2\u0dba\u0dbd\u0dd4 \u0db8\u0db1\u0dd4\u0dc2\u0dca\u200d\u0dba\u0dba\u0ddd \u0db1\u0dd2\u0daf\u0dc4\u0dc3\u0dca\u0dc0 \u0d8b\u0db4\u0dad \u0dbd\u0db6\u0dcf \u0d87\u0dad. \u0d9c\u0dbb\u0dd4\u0dad\u0dca\u0dc0\u0dba\u0dd9\u0db1\u0dca \u0dc4\u0dcf \u0d85\u0dba\u0dd2\u0dad\u0dd2\u0dc0\u0dcf\u0dc3\u0dd2\u0d9a\u0db8\u0dca\u0dc0\u0dbd\u0dd2\u0db1\u0dca \u0dc3\u0db8\u0dcf\u0db1 \u0dc0\u0dd9\u0dad\u0dd2. \u0dba\u0dd4\u0d9a\u0dca\u0dad\u0dd2 \u0d85\u0dba\u0dd4\u0d9a\u0dca\u0dad\u0dd2 \u0db4\u0dd2\u0dc5\u0dd2\u0db6\u0db3 \u0dc4\u0dd0\u0d9f\u0dd3\u0db8\u0dd9\u0db1\u0dca \u0dc4\u0dcf \u0dc4\u0dd8\u0daf\u0dba \u0dc3\u0dcf\u0d9a\u0dca\u0dc2\u0dd2\u0dba\u0dd9\u0db1\u0dca \u0dba\u0dd4\u0dad\u0dca \u0d94\u0dc0\u0dd4\u0db1\u0dca, \u0d94\u0dc0\u0dd4\u0db1\u0ddc\u0dc0\u0dd4\u0db1\u0dca\u0da7 \u0dc3\u0dd0\u0dc5\u0d9a\u0dd2\u0dba \u0dba\u0dd4\u0dad\u0dca\u0dad\u0dda \u0dc3\u0dc4\u0ddd\u0daf\u0dbb\u0dad\u0dca\u0dc0\u0dba \u0db4\u0dd2\u0dc5\u0dd2\u0db6\u0db3 \u0dc4\u0dd0\u0d9f\u0dd3\u0db8\u0dd9\u0db1\u0dd2.", + "text": "Sinhala සියලු මනුෂ්‍යයෝ නිදහස්ව උපත ලබා ඇත. ගරුත්වයෙන් හා අයිතිවාසිකම්වලින් සමාන වෙති. 
යුක්ති අයුක්ති පිළිබඳ හැඟීමෙන් හා හෘදය සාක්ෂියෙන් යුත් ඔවුන්, ඔවුනොවුන්ට සැළකිය යුත්තේ සහෝදරත්වය පිළිබඳ හැඟීමෙනි.", "metadata": { "filetype": "text/plain", "data_source": { @@ -9114,7 +9114,7 @@ { "type": "NarrativeText", "element_id": "7f18ad35feab9b6f20b97d87856143c8", - "text": "Siona Sia'bai\u0331 aideo'y\u00eb goa'ye beoye gu\u0331i'ne sia'y\u00eb'qu\u00eb maca bai'ye gu\u0331i'ne qu\u00ebco baye co\u0331ni, ja\u0331je\u0331 goachase's\u00ebte goa'ju\u0331i'\u00f1e beoye \u00f1ese saiye bai'ji co\u0331caij\u00eb yequ\u00eb bai\u0331reje.", + "text": "Siona Sia'bai̱ aideo'yë goa'ye beoye gu̱i'ne sia'yë'quë maca bai'ye gu̱i'ne quëco baye co̱ni, ja̱je̱ goachase'sëte goa'ju̱i'ñe beoye ñese saiye bai'ji co̱caijë yequë bai̱reje.", "metadata": { "languages": [ "sqi", @@ -9137,7 +9137,7 @@ { "type": "NarrativeText", "element_id": "c82f4633a9724d1de7dfe866d1429080", - "text": "Slovak V\u0161etci \u013eudia sa rodia slobodn\u00ed a sebe rovn\u00ed , \u010do sa t\u00fdka ich dostojnosti a pr\u00e1v. S\u00fa obdaren\u00ed rozumom a maj\u00fa navz\u00e1jom jedna\u0165 v bratskom duchu.", + "text": "Slovak Všetci ľudia sa rodia slobodní a sebe rovní , čo sa týka ich dostojnosti a práv. Sú obdarení rozumom a majú navzájom jednať v bratskom duchu.", "metadata": { "languages": [ "slk" @@ -9200,7 +9200,7 @@ { "type": "NarrativeText", "element_id": "5d86d8cbc9dda45558ccf60a3974e66a", - "text": "Soninke Haadama renme su saareyen \u014ba an na du-kitten \u00f1a, an nta sere komaaxu, an do soron su yan yekka dorontaaxu do taqu. Haqilen, wa sere su, a do soro kuttu nan siri terene doome kappalengaaxu kanma.", + "text": "Soninke Haadama renme su saareyen ŋa an na du-kitten ña, an nta sere komaaxu, an do soron su yan yekka dorontaaxu do taqu. 
Haqilen, wa sere su, a do soro kuttu nan siri terene doome kappalengaaxu kanma.", "metadata": { "languages": [ "som", @@ -9223,7 +9223,7 @@ { "type": "NarrativeText", "element_id": "2254a39b8eef4c825a973c26eb9364c9", - "text": "Sorbian, Upper W\u0161itcy \u010d\u0142owjekojo su wot naroda swobodni a su jenacy po dostojnos\u0107i a prawach. Woni su z rozumom a sw\u011bdomjom wobdarjeni a maja mjezsobu w duchu bratrowstwa wobchad\u017ae\u0107.", + "text": "Sorbian, Upper Wšitcy čłowjekojo su wot naroda swobodni a su jenacy po dostojnosći a prawach. Woni su z rozumom a swědomjom wobdarjeni a maja mjezsobu w duchu bratrowstwa wobchadźeć.", "metadata": { "languages": [ "pol", @@ -9245,7 +9245,7 @@ { "type": "NarrativeText", "element_id": "f6b37545577a2f9471636b40acbc5bf3", - "text": "Sotho, Northern Batho ka moka ba belegwe ba lokologile le gona ba na le seriti sa go lekana le ditokelo. Ba filwe monagano le letswalo mme ba swanet\u0161e go swarana ka moya wa bana ba mpa.", + "text": "Sotho, Northern Batho ka moka ba belegwe ba lokologile le gona ba na le seriti sa go lekana le ditokelo. 
Ba filwe monagano le letswalo mme ba swanetše go swarana ka moya wa bana ba mpa.", "metadata": { "languages": [ "tgl", @@ -9292,7 +9292,7 @@ { "type": "UncategorizedText", "element_id": "51733b425e93924dbea419a28d2ee3d2", - "text": "South Azerbaijani Tu\u0308m insanlar hu\u0308r do\u0308g\u0306arlar, hak ve onur bak\u0131m\u0131ndan es\u0327it do\u0308g\u0306arlar, onlar ak\u0131l ve vicdana sahiptirler ve birbirlerine kars\u0327\u0131 kardes\u0327lik ruhu ic\u0327inde davranmal\u0131lar.", + "text": "South Azerbaijani Tüm insanlar hür döğarlar, hak ve onur bakımından eşit döğarlar, onlar akıl ve vicdana sahiptirler ve birbirlerine karşı kardeşlik ruhu içinde davranmalılar.", "metadata": { "languages": [ "tur" @@ -9313,7 +9313,7 @@ { "type": "NarrativeText", "element_id": "7c2e8d871037d3d152d88dc5510cb236", - "text": "Spanish Todos los seres humanos nacen libres e iguales en dignidad y derechos y, dotados como est\u00e1n de raz\u00f3n y conciencia, deben comportarse fraternalmente los unos con los otros.", + "text": "Spanish Todos los seres humanos nacen libres e iguales en dignidad y derechos y, dotados como están de razón y conciencia, deben comportarse fraternalmente los unos con los otros.", "metadata": { "languages": [ "spa" @@ -9334,7 +9334,7 @@ { "type": "NarrativeText", "element_id": "816bdd2e0af6f8cc514fe60150f4714b", - "text": "Spanish (resolution) Todos los seres humanos nacen libres e iguales en dignidad y derechos y, dotados como est\u00e1n de raz\u00f3n y conciencia, deben comportarse fraternalmente los unos con los otros.", + "text": "Spanish (resolution) Todos los seres humanos nacen libres e iguales en dignidad y derechos y, dotados como están de razón y conciencia, deben comportarse fraternalmente los unos con los otros.", "metadata": { "languages": [ "spa" @@ -9398,7 +9398,7 @@ { "type": "NarrativeText", "element_id": "cf93d32f84284c9d205953f2720290ba", - "text": "Susu Adamadie birin barixin\u025b e lan y\u025bt\u025bralui kui, y\u025bt\u025b 
kolonyi nun y\u025bt\u025b suxu kima. Fondoe nun faxamui na e b\u025b boresuxu kima bariboreya fanyi kui.", + "text": "Susu Adamadie birin barixinɛ e lan yɛtɛralui kui, yɛtɛ kolonyi nun yɛtɛ suxu kima. Fondoe nun faxamui na e bɛ boresuxu kima bariboreya fanyi kui.", "metadata": { "languages": [ "som", @@ -9464,7 +9464,7 @@ { "type": "NarrativeText", "element_id": "962be1c35a09978ec0be3e93852b6925", - "text": "Swedish Alla m\u00e4nniskor \u00e4ro f\u00f6dda fria och lika i v\u00e4rde och r\u00e4ttigheter. De \u00e4ro utrustade med f\u00f6rnuft och samvete och b\u00f6ra handla gentemot varandra i en anda av broderskap.", + "text": "Swedish Alla människor äro födda fria och lika i värde och rättigheter. De äro utrustade med förnuft och samvete och böra handla gentemot varandra i en anda av broderskap.", "metadata": { "languages": [ "swe" @@ -9506,7 +9506,7 @@ { "type": "NarrativeText", "element_id": "1fbce46911c4817cf2f0bf0db19d2f32", - "text": "Tagalog (Tagalog) \u1700\u1705 \u170e\u1711\u1706\u1714 \u1705 \u1706\u1702\u170c\u1714 \u1701\u1710\u1712\u1708\u1712\u170e\u1705 \u1708 \u170b\u170e\u170c \u1700\u1706\u1714 \u1709\u1708\u1714\u1706\u170c\u1714 \u1709\u1708\u1714\u1706\u170c\u1714 \u1710 \u1703\u1707\u1705\u170e\u1708\u1714 \u1700\u1706\u1714 \u170b\u1714\u1704 \u1703\u1707\u1713\u1709\u1706\u1708\u1714\u1736 \u1710\u1712\u170e\u170c\u1714 \u1709\u1712\u1708\u1704\u1714\u1703\u170e\u1713\u170a\u1708\u1714 \u1705 \u1703\u1706\u1714\u170f\u1712\u1707\u1708\u1714 \u1700\u1706\u1714 \u170a\u1713\u1707\u1714\u1711\u1712 \u1700\u1706\u1714 \u1707\u1709\u1706\u1714 \u170b\u1704\u1714\u1709\u170e\u1704\u170c\u1708\u1714 \u1700\u1705 \u1701\u1710\u1706\u1714 \u1701\u1710 \u1710 \u1707\u1712\u170f \u1705 \u1709\u1704\u1714\u1703\u1703\u1709\u1706\u1712\u1707\u1708\u1714\u1736", + "text": "Tagalog (Tagalog) ᜀᜅ ᜎᜑᜆ᜔ ᜅ ᜆᜂᜌ᜔ ᜁᜐᜒᜈᜒᜎᜅ ᜈ ᜋᜎᜌ ᜀᜆ᜔ ᜉᜈ᜔ᜆᜌ᜔ ᜉᜈ᜔ᜆᜌ᜔ ᜐ ᜃᜇᜅᜎᜈ᜔ ᜀᜆ᜔ ᜋ᜔ᜄ ᜃᜇᜓᜉᜆᜈ᜔᜶ ᜐᜒᜎᜌ᜔ ᜉᜒᜈᜄ᜔ᜃᜎᜓᜊᜈ᜔ ᜅ ᜃᜆ᜔ᜏᜒᜇᜈ᜔ ᜀᜆ᜔ ᜊᜓᜇ᜔ᜑᜒ ᜀᜆ᜔ ᜇᜉᜆ᜔ ᜋᜄ᜔ᜉᜎᜄᜌᜈ᜔ ᜀᜅ ᜁᜐᜆ᜔ ᜁᜐ ᜐ ᜇᜒᜏ ᜅ 
ᜉᜄ᜔ᜃᜃᜉᜆᜒᜇᜈ᜔᜶", "metadata": { "filetype": "text/plain", "data_source": { @@ -9524,7 +9524,7 @@ { "type": "NarrativeText", "element_id": "f80202b3162be68cd2957c5c564ddc03", - "text": "Tahitian E fanauhia te t\u0101'\u0101to'ara'a o te ta'ata-tupu ma te ti'am\u0101 e te ti'amanara'a 'aifaito. Ua '\u012b te mana'o pa'ari e i te manava e ma te 'a'au taea'e 'oia ta ratou ha'a i rotop\u016b ia ratou iho, e ti'a ai;", + "text": "Tahitian E fanauhia te tā'āto'ara'a o te ta'ata-tupu ma te ti'amā e te ti'amanara'a 'aifaito. Ua 'ī te mana'o pa'ari e i te manava e ma te 'a'au taea'e 'oia ta ratou ha'a i rotopū ia ratou iho, e ti'a ai;", "metadata": { "languages": [ "ita" @@ -9545,7 +9545,7 @@ { "type": "UncategorizedText", "element_id": "b5b3558a1982151293ab4f2c745e943b", - "text": "Tai Dam \uaab9\uaa95\uaab8\uaa89 \uaa80\uaab1 \uaa8b\uaab4 \uaadb \uaa8e\uaab2\uaa89 \uaaae\uaaae\uaa80 \uaaa3\uaab1 \uaabb\uaaa0 \uaa81\uaab7 \uaabb\uaaac \uaabc\uaa92 \uaa95\uaab3 \uaa95\uaab1\uaa89 \uaa80\uaabe\uaa9a \uaab9\uaa8b\uaab7\uaa89 \uaa9d\uaab8\uaa89 \uaa95\uaaae\uaaa5 \uaaa9\uaabe \uaadb \uaab6\uaa94\uaa99 \uaaa0\uaab4 - \uaa8b\uaab4 \uaaac\uaaba \uaadb \uaabb\uaaa0 \uaa81\uaab7 \uaabb\uaaac \uaaa3\uaab2 \uaa81\uaaab\uaab8\uaa99 \uaa8e\uaab1\uaa89 \uaab6\uaa8e\uaaa3 \uaaa9\uaaba\uaa89 \uaab9\uaaa5\uaab8\uaa92 \uaadb \uaa80\uaabe\uaa9a \uaab9\uaaa5\uaab8\uaa92 \uaabb\uaa8a \uaa9a\uaab4\uaa99 \uaa80\uaabe\uaa9a \uaabc\uaa92 \uaab9\uaa9a\uaab7\uaa89 \uaa92\uaab2 \uaa80\uaabe\uaa9a \uaaab\uaab8\uaa80 \uaaad\uaab0\uaa80 \uaab5\uaa9d\uaa89 \uaab9\uaa8f\uaa89 \uaab9\uaaad\uaa99 \uaa92\uaab8\uaaab.", + "text": "Tai Dam ꪹꪕꪸꪉ ꪀꪱ ꪋꪴ ꫛ ꪎꪲꪉ ꪮꪮꪀ ꪣꪱ ꪻꪠ ꪁꪷ ꪻꪬ ꪼꪒ ꪕꪳ ꪕꪱꪉ ꪀꪾꪚ ꪹꪋꪷꪉ ꪝꪸꪉ ꪕꪮꪥ ꪩꪾ ꫛ ꪶꪔꪙ ꪠꪴ - ꪋꪴ ꪬꪺ ꫛ ꪻꪠ ꪁꪷ ꪻꪬ ꪣꪲ ꪁꪫꪸꪙ ꪎꪱꪉ ꪶꪎꪣ ꪩꪺꪉ ꪹꪥꪸꪒ ꫛ ꪀꪾꪚ ꪹꪥꪸꪒ ꪻꪊ ꪚꪴꪙ ꪀꪾꪚ ꪼꪒ ꪹꪚꪷꪉ ꪒꪲ ꪀꪾꪚ ꪫꪸꪀ ꪭꪰꪀ ꪵꪝꪉ ꪹꪏꪉ ꪹꪭꪙ ꪒꪸꪫ.", "metadata": { "filetype": "text/plain", "data_source": { @@ -9563,7 +9563,7 @@ { "type": "NarrativeText", "element_id": "424be8d53e2447fd43a7df9c88610eb3", - "text": "Tajiki 
\u0422\u0430\u043c\u043e\u043c\u0438 \u043e\u0434\u0430\u043c\u043e\u043d \u043e\u0437\u043e\u0434 \u0432\u0430 \u0430\u0437 \u043b\u0438\u04b3\u043e\u0437\u0438 \u0448\u0430\u0440\u0430\u0444\u0443 \u04b3\u0443\u049b\u0443\u049b \u0431\u0430 \u04b3\u0430\u043c \u0431\u0430\u0440\u043e\u0431\u0430\u0440 \u0431\u0430 \u0434\u0443\u043d\u0451 \u043c\u0435\u043e\u044f\u043d\u0434. \u041e\u043d\u04b3\u043e \u0441\u043e\u04b3\u0438\u0431\u0438 \u0430\u049b\u043b\u0443 \u0432\u0438\u04b7\u0434\u043e\u043d\u0430\u043d\u0434 \u0432\u0430 \u0431\u043e\u044f\u0434 \u0431\u043e \u044f\u043a\u0434\u0438\u0433\u0430\u0440 \u043c\u0443\u043d\u043e\u0441\u0438\u0431\u0430\u0442\u0438 \u0431\u0430\u0440\u043e\u0434\u0430\u0440\u043e\u043d\u0430 \u0434\u043e\u0448\u0442\u0430 \u0431\u043e\u0448\u0430\u043d\u0434.", + "text": "Tajiki Тамоми одамон озод ва аз лиҳози шарафу ҳуқуқ ба ҳам баробар ба дунё меоянд. Онҳо соҳиби ақлу виҷдонанд ва бояд бо якдигар муносибати бародарона дошта бошанд.", "metadata": { "languages": [ "mkd", @@ -9586,7 +9586,7 @@ { "type": "NarrativeText", "element_id": "30aa2c0edeca02853a028f15110a6827", - "text": "Talysh H\u0259mm\u0259 insonon b\u0259\u015ft\u0259 l\u0259yo\u011f\u0259ti iy\u0259n h\u0259xonro ozod iy\u0259n b\u0259rob\u0259r movard\u0259 bed\u0259n. \u00c7\u0259von \u015fuur iy\u0259n vicdon hese, \u0259ve ki, dey\u0259nd\u0131 m\u0131nasib\u0259t\u0259d\u0259 b\u0259n\u0259 b\u0131v\u0259 r\u0259ftor kard\u0259ninin.", + "text": "Talysh Həmmə insonon bəştə ləyoğəti iyən həxonro ozod iyən bərobər movardə bedən. 
Çəvon şuur iyən vicdon hese, əve ki, deyəndı mınasibətədə bənə bıvə rəftor kardəninin.", "metadata": { "languages": [ "tur" @@ -9607,7 +9607,7 @@ { "type": "UncategorizedText", "element_id": "615dde6386c8f1b795ccd07901216ce7", - "text": "Tamang, Eastern \u092e\u094d\u0939\u094b\u0915\u094d\u0915\u094b\u0928 (\u0917\u094b\u0926\u094b\u092a) \u0928\u094b\u0928 \u092e\u094d\u0939\u0940\u092e \u0915\u0947\u092a\u093e\u0928\u094d\u0939\u093e\u092a\u093e \u0939\u0947\u0928\u094d\u091b\u0947 \u0928\u0941\u0928 \u0939\u093e\u0919\u092a\u093e\u0919\u0935\u093e (\u0938\u094d\u0935\u0924\u0928\u094d\u0924\u094d\u0930) \u092f\u093e\u0919\u0935\u093e \u0939\u0940\u0928\u094d\u0928\u093e \u0964 \u0925\u0947 \u092e\u094d\u0939\u094b\u0915\u094d\u0915\u094b\u0928\u0932\u093e (\u0917\u094b\u0926\u094b\u092a\u0932\u093e) \u091a\u094d\u092f\u094b\u091a\u094d\u092f\u094b \u092f\u093e\u0919\u0924\u093e\u092e \u0925\u0947\u0928 \u092e\u0939\u0924\u094d\u0935 \u092e\u0941\u0932\u093e \u0964 \u0925\u0947\u0928\u0940\u0915\u093e\u0926\u0947\u0930\u0940 \u0938\u0947\u092e\u092c\u093e\u0919 (\u0935\u093f\u091a\u093e\u0930 \u0936\u0915\u094d\u0924\u093f) \u0926\u0947\u0928 \u0925\u0941-\u0938\u0947\u092e\u0938\u093e\u0919 \u092e\u0941\u092c\u093e\u0938\u0947 \u0925\u0947\u0928\u0940\u091c\u0941\u0917\u0941\u0938\u0947 \u0939\u094d\u0930\u093e\u0919\u0928\u094d\u0939\u093e\u0919\u0930\u0940 \u0928\u0941\u0928 \u0925\u0947\u0924\u094d\u092e\u093e\u0932\u093e \u0938\u0947\u092e\u0932\u0947\u0919\u092e\u094b\u0917\u094d\u092f\u093e\u092e\u094d\u0938\u0947 (\u092d\u0935\u0928\u093e\u092c\u093e\u091f) \u0917\u094d\u092f\u0947 \u0932\u093e\u0924\u094b\u092c\u093e\u0928 \u092e\u0941\u0932\u093e \u0964", + "text": "Tamang, Eastern म्होक्कोन (गोदोप) नोन म्हीम केपान्हापा हेन्छे नुन हाङपाङवा (स्वतन्त्र) याङवा हीन्ना । थे म्होक्कोनला (गोदोपला) च्योच्यो याङताम थेन महत्व मुला । थेनीकादेरी सेमबाङ (विचार शक्ति) देन थु-सेमसाङ मुबासे थेनीजुगुसे ह्राङन्हाङरी नुन थेत्माला सेमलेङमोग्याम्से (भवनाबाट) ग्ये लातोबान 
मुला ।", "metadata": { "languages": [ "nep" @@ -9628,7 +9628,7 @@ { "type": "NarrativeText", "element_id": "f484ee723443631e755f61ec59737260", - "text": "Tamazight, Central Atlas Imdanen, akken ma llan ttlalen d ilelliyen msawan di lh\u0323wer\u0323ma d yizerfan- ghur sen tamsakwit d l\u00e2quel u yessefk ad-tili tegmatt gar asen.", + "text": "Tamazight, Central Atlas Imdanen, akken ma llan ttlalen d ilelliyen msawan di lḥweṛma d yizerfan- ghur sen tamsakwit d lâquel u yessefk ad-tili tegmatt gar asen.", "metadata": { "languages": [ "tur", @@ -9650,7 +9650,7 @@ { "type": "UncategorizedText", "element_id": "4fa699fe9b09ce455b4b7a0eceac23a4", - "text": "Tamazight, Central Atlas (Tifinagh) \u2d49\u2d4e\u2d37\u2d30\u2d4f\u2d3b\u2d4f, \u2d30\u2d3d\u2d3d\u2d3b\u2d4f \u2d4e\u2d30 \u2d4d\u2d4d\u2d30\u2d4f \u2d5c\u2d5c\u2d4d\u2d30\u2d4d\u2d3b\u2d4f \u2d37 \u2d49\u2d4d\u2d3b\u2d4d\u2d4d\u2d49\u2d62\u2d3b\u2d4f \u2d4e\u2d59\u2d30\u2d61\u2d30\u2d4f \u2d37\u2d49 \u2d4d\u2d43\u2d61\u2d3b\u2d55\u2d4e\u2d30 \u2d37 \u2d62\u2d49\u2d63\u2d3b\u2d54\u2d3c\u2d30\u2d4f-\u2d56\u2d53\u2d54 \u2d59\u2d3b\u2d4f \u2d5c\u2d30\u2d4e\u2d59\u2d30\u2d3d\u2d61\u2d49\u2d5c \u2d37 \u2d4d\u2d30\u2d47\u2d53\u2d3b\u2d4d \u2d53 \u2d62\u2d3b\u2d59\u2d59\u2d3b\u2d3c\u2d3d \u2d30\u2d37-\u2d5c\u2d49\u2d4d\u2d49 \u2d5c\u2d3b\u2d33\u2d4e\u2d30\u2d5c\u2d5c \u2d33\u2d30\u2d54 \u2d30\u2d59\u2d3b\u2d4f.", + "text": "Tamazight, Central Atlas (Tifinagh) ⵉⵎⴷⴰⵏⴻⵏ, ⴰⴽⴽⴻⵏ ⵎⴰ ⵍⵍⴰⵏ ⵜⵜⵍⴰⵍⴻⵏ ⴷ ⵉⵍⴻⵍⵍⵉⵢⴻⵏ ⵎⵙⴰⵡⴰⵏ ⴷⵉ ⵍⵃⵡⴻⵕⵎⴰ ⴷ ⵢⵉⵣⴻⵔⴼⴰⵏ-ⵖⵓⵔ ⵙⴻⵏ ⵜⴰⵎⵙⴰⴽⵡⵉⵜ ⴷ ⵍⴰⵇⵓⴻⵍ ⵓ ⵢⴻⵙⵙⴻⴼⴽ ⴰⴷ-ⵜⵉⵍⵉ ⵜⴻⴳⵎⴰⵜⵜ ⴳⴰⵔ ⴰⵙⴻⵏ.", "metadata": { "filetype": "text/plain", "data_source": { @@ -9668,7 +9668,7 @@ { "type": "UncategorizedText", "element_id": "c36059cd99076234366c10f07f278260", - "text": "Tamazight, Standard Morocan \u2d30\u2d54 \u2d37 \u2d5c\u2d5c\u2d4d\u2d30\u2d4d\u2d30\u2d4f \u2d4e\u2d49\u2d37\u2d37\u2d4f \u2d33\u2d30\u2d4f \u2d49\u2d4d\u2d3b\u2d4d\u2d4d\u2d49\u2d5c\u2d4f \u2d4e\u2d33\u2d30\u2d37\u2d37\u2d30\u2d4f \u2d56 
\u2d61\u2d30\u2d37\u2d37\u2d53\u2d54 \u2d37 \u2d49\u2d63\u2d54\u2d3c\u2d30\u2d4f, \u2d62\u2d49\u2d4d\u2d49 \u2d30\u2d3d\u2d6f \u2d37\u2d30\u2d54\u2d59\u2d4f \u2d53\u2d4f\u2d4d\u2d4d\u2d49 \u2d37 \u2d53\u2d3c\u2d54\u2d30\u2d3d, \u2d49\u2d4d\u2d4d\u2d30 \u2d3c\u2d4d\u2d4d\u2d30 \u2d59\u2d4f \u2d30\u2d37 \u2d5c\u2d5c\u2d4e\u2d62\u2d30\u2d61\u2d30\u2d59\u2d4f \u2d4f\u2d33\u2d54\u2d30\u2d5c\u2d59\u2d4f \u2d59 \u2d5c\u2d30\u2d33\u2d4e\u2d30\u2d5c.", + "text": "Tamazight, Standard Morocan ⴰⵔ ⴷ ⵜⵜⵍⴰⵍⴰⵏ ⵎⵉⴷⴷⵏ ⴳⴰⵏ ⵉⵍⴻⵍⵍⵉⵜⵏ ⵎⴳⴰⴷⴷⴰⵏ ⵖ ⵡⴰⴷⴷⵓⵔ ⴷ ⵉⵣⵔⴼⴰⵏ, ⵢⵉⵍⵉ ⴰⴽⵯ ⴷⴰⵔⵙⵏ ⵓⵏⵍⵍⵉ ⴷ ⵓⴼⵔⴰⴽ, ⵉⵍⵍⴰ ⴼⵍⵍⴰ ⵙⵏ ⴰⴷ ⵜⵜⵎⵢⴰⵡⴰⵙⵏ ⵏⴳⵔⴰⵜⵙⵏ ⵙ ⵜⴰⴳⵎⴰⵜ.", "metadata": { "filetype": "text/plain", "data_source": { @@ -9686,7 +9686,7 @@ { "type": "NarrativeText", "element_id": "703b672337c499aededf6f6696d6522f", - "text": "Tamil \u0bae\u0ba9\u0bbf\u0ba4\u0baa\u0bcd \u0baa\u0bbf\u0bb1\u0bbf\u0bb5\u0bbf\u0baf\u0bbf\u0ba9\u0bb0\u0bcd \u0b9a\u0b95\u0bb2\u0bb0\u0bc1\u0bae\u0bcd \u0b9a\u0bc1\u0ba4\u0ba8\u0bcd\u0ba4\u0bbf\u0bb0\u0bae\u0bbe\u0b95\u0bb5\u0bc7 \u0baa\u0bbf\u0bb1\u0b95\u0bcd\u0b95\u0bbf\u0ba9\u0bcd\u0bb1\u0ba9\u0bb0\u0bcd; \u0b85\u0bb5\u0bb0\u0bcd\u0b95\u0bb3\u0bcd \u0bae\u0ba4\u0bbf\u0baa\u0bcd\u0baa\u0bbf\u0bb2\u0bc1\u0bae\u0bcd, \u0b89\u0bb0\u0bbf\u0bae\u0bc8\u0b95\u0bb3\u0bbf\u0bb2\u0bc1\u0bae\u0bcd \u0b9a\u0bae\u0bae\u0bbe\u0ba9\u0bb5\u0bb0\u0bcd\u0b95\u0bb3\u0bcd, \u0b85\u0bb5\u0bb0\u0bcd\u0b95\u0bb3\u0bcd \u0ba8\u0bbf\u0baf\u0bbe\u0baf\u0ba4\u0bcd\u0ba4\u0bc8\u0baf\u0bc1\u0bae\u0bcd \u0bae\u0ba9\u0b9a\u0bcd\u0b9a\u0bbe\u0b9f\u0bcd\u0b9a\u0bbf\u0baf\u0bc8\u0baf\u0bc1\u0bae\u0bcd \u0b87\u0baf\u0bb1\u0bcd\u0baa\u0ba3\u0bcd\u0baa\u0bbe\u0b95\u0baa\u0bcd \u0baa\u0bc6\u0bb1\u0bcd\u0bb1\u0bb5\u0bb0\u0bcd\u0b95\u0bb3\u0bcd. 
\u0b85\u0bb5\u0bb0\u0bcd\u0b95\u0bb3\u0bcd \u0b92\u0bb0\u0bc1\u0bb5\u0bb0\u0bc1\u0b9f\u0ba9\u0bca\u0bb0\u0bc1\u0bb5\u0bb0\u0bcd \u0b9a\u0b95\u0bcb\u0ba4\u0bb0 \u0b89\u0ba3\u0bb0\u0bcd\u0bb5\u0bc1\u0baa\u0bcd \u0baa\u0bbe\u0b99\u0bcd\u0b95\u0bbf\u0bb2\u0bcd \u0ba8\u0b9f\u0ba8\u0bcd\u0ba4\u0bc1\u0b95\u0bca\u0bb3\u0bcd\u0bb3\u0bb2\u0bcd \u0bb5\u0bc7\u0ba3\u0bcd\u0b9f\u0bc1\u0bae\u0bcd.", + "text": "Tamil மனிதப் பிறிவியினர் சகலரும் சுதந்திரமாகவே பிறக்கின்றனர்; அவர்கள் மதிப்பிலும், உரிமைகளிலும் சமமானவர்கள், அவர்கள் நியாயத்தையும் மனச்சாட்சியையும் இயற்பண்பாகப் பெற்றவர்கள். அவர்கள் ஒருவருடனொருவர் சகோதர உணர்வுப் பாங்கில் நடந்துகொள்ளல் வேண்டும்.", "metadata": { "languages": [ "tam" @@ -9707,7 +9707,7 @@ { "type": "NarrativeText", "element_id": "cd3e1810510aee192781e40eae1b0ddc", - "text": "Tamil (Sri Lanka) \u0bae\u0ba9\u0bbf\u0ba4\u0baa\u0bcd \u0baa\u0bbf\u0bb1\u0bbf\u0bb5\u0bbf\u0baf\u0bbf\u0ba9\u0bb0\u0bcd \u0b9a\u0b95\u0bb2\u0bb0\u0bc1\u0bae\u0bcd \u0b9a\u0bc1\u0ba4\u0ba8\u0bcd\u0ba4\u0bbf\u0bb0\u0bae\u0bbe\u0b95\u0bb5\u0bc7 \u0baa\u0bbf\u0bb1\u0b95\u0bcd\u0b95\u0bbf\u0ba9\u0bcd\u0bb1\u0ba9\u0bb0\u0bcd; \u0b85\u0bb5\u0bb0\u0bcd\u0b95\u0bb3\u0bcd \u0bae\u0ba4\u0bbf\u0baa\u0bcd\u0baa\u0bbf\u0bb2\u0bc1\u0bae\u0bcd, \u0b89\u0bb0\u0bbf\u0bae\u0bc8\u0b95\u0bb3\u0bbf\u0bb2\u0bc1\u0bae\u0bcd \u0b9a\u0bae\u0bae\u0bbe\u0ba9\u0bb5\u0bb0\u0bcd\u0b95\u0bb3\u0bcd, \u0b85\u0bb5\u0bb0\u0bcd\u0b95\u0bb3\u0bcd \u0ba8\u0bbf\u0baf\u0bbe\u0baf\u0ba4\u0bcd\u0ba4\u0bc8\u0baf\u0bc1\u0bae\u0bcd \u0bae\u0ba9\u0b9a\u0bcd\u0b9a\u0bbe\u0b9f\u0bcd\u0b9a\u0bbf\u0baf\u0bc8\u0baf\u0bc1\u0bae\u0bcd \u0b87\u0baf\u0bb1\u0bcd\u0baa\u0ba3\u0bcd\u0baa\u0bbe\u0b95\u0baa\u0bcd \u0baa\u0bc6\u0bb1\u0bcd\u0bb1\u0bb5\u0bb0\u0bcd\u0b95\u0bb3\u0bcd. 
\u0b85\u0bb5\u0bb0\u0bcd\u0b95\u0bb3\u0bcd \u0b92\u0bb0\u0bc1\u0bb5\u0bb0\u0bc1\u0b9f\u0ba9\u0bca\u0bb0\u0bc1\u0bb5\u0bb0\u0bcd \u0b9a\u0b95\u0bcb\u0ba4\u0bb0 \u0b89\u0ba3\u0bb0\u0bcd\u0bb5\u0bc1\u0baa\u0bcd \u0baa\u0bbe\u0b99\u0bcd\u0b95\u0bbf\u0bb2\u0bcd \u0ba8\u0b9f\u0ba8\u0bcd\u0ba4\u0bc1\u0b95\u0bca\u0bb3\u0bcd\u0bb3\u0bb2\u0bcd \u0bb5\u0bc7\u0ba3\u0bcd\u0b9f\u0bc1\u0bae\u0bcd.", + "text": "Tamil (Sri Lanka) மனிதப் பிறிவியினர் சகலரும் சுதந்திரமாகவே பிறக்கின்றனர்; அவர்கள் மதிப்பிலும், உரிமைகளிலும் சமமானவர்கள், அவர்கள் நியாயத்தையும் மனச்சாட்சியையும் இயற்பண்பாகப் பெற்றவர்கள். அவர்கள் ஒருவருடனொருவர் சகோதர உணர்வுப் பாங்கில் நடந்துகொள்ளல் வேண்டும்.", "metadata": { "languages": [ "tam" @@ -9728,7 +9728,7 @@ { "type": "NarrativeText", "element_id": "9e55ede50aefd9018f64126e5d20a259", - "text": "Tatar \u0411\u0430\u0440\u043b\u044b\u043a \u043a\u0435\u0448\u0435\u043b\u04d9\u0440 \u0434\u04d9 \u0430\u0437\u0430\u0442 \u04bb\u04d9\u043c \u04af\u0437 \u0430\u0431\u0440\u0443\u0439\u043b\u0430\u0440\u044b \u04bb\u04d9\u043c \u0445\u043e\u043a\u0443\u043a\u043b\u0430\u0440\u044b \u044f\u0433\u044b\u043d\u043d\u0430\u043d \u0442\u0438\u04a3 \u0431\u0443\u043b\u044b\u043f \u0442\u0443\u0430\u043b\u0430\u0440. \u0410\u043b\u0430\u0440\u0433\u0430 \u0430\u043a\u044b\u043b \u04bb\u04d9\u043c \u0432\u04e9\u0497\u0434\u0430\u043d \u0431\u0438\u0440\u0435\u043b\u0433\u04d9\u043d \u04bb\u04d9\u043c \u0431\u0435\u0440-\u0431\u0435\u0440\u0441\u0435\u043d\u04d9 \u043a\u0430\u0440\u0430\u0442\u0430 \u0442\u0443\u0433\u0430\u043d\u0430\u0440\u0447\u0430 [\u0442\u0443\u0433\u0430\u043d\u043d\u0430\u0440\u0447\u0430] \u043c\u04e9\u043d\u04d9\u0441\u04d9\u0431\u04d9\u0442\u0442\u04d9 \u0431\u0443\u043b\u044b\u0440\u0433\u0430 \u0442\u0438\u0435\u0448\u043b\u04d9\u0440.", + "text": "Tatar Барлык кешеләр дә азат һәм үз абруйлары һәм хокуклары ягыннан тиң булып туалар. 
Аларга акыл һәм вөҗдан бирелгән һәм бер-берсенә карата туганарча [туганнарча] мөнәсәбәттә булырга тиешләр.", "metadata": { "languages": [ "rus" @@ -9749,7 +9749,7 @@ { "type": "NarrativeText", "element_id": "ca7b2ef61ad3e52b7b7873feb9ba85c1", - "text": "Telugu \u0c2a\u0c4d\u0c30\u0c24\u0c3f\u0c2a\u0c24\u0c4d\u0c24\u0c3f\u0c38\u0c4d\u0c35\u0c24\u0c4d\u0c35\u0c2e\u0c41\u0c32 \u0c35\u0c3f\u0c37\u0c2f\u0c2e\u0c41\u0c28 \u0c2e\u0c3e\u0c28\u0c35\u0c41\u0c32\u0c46\u0c32\u0c4d\u0c32\u0c30\u0c41\u0c28\u0c41 \u0c1c\u0c28\u0c4d\u0c2e\u0c24\u0c03 \u0c38\u0c4d\u0c35\u0c24\u0c02\u0c24\u0c4d\u0c30\u0c41\u0c32\u0c41\u0c28\u0c41 \u0c38\u0c2e\u0c3e\u0c28\u0c41\u0c32\u0c41\u0c28\u0c41 \u0c28\u0c17\u0c41\u0c26\u0c41\u0c30\u0c41. \u0c35\u0c3e\u0c30\u0c41 \u0c35\u0c3f\u0c35\u0c47\u0c1a\u0c28-\u0c05\u0c02\u0c24\u0c03\u0c15\u0c30\u0c23 \u0c38\u0c02\u0c2a\u0c28\u0c4d\u0c28\u0c41\u0c32\u0c17\u0c41\u0c1f\u0c1a\u0c47 \u0c2a\u0c30\u0c38\u0c4d\u0c2a\u0c30\u0c2e\u0c41 \u0c2d\u0c4d\u0c30\u0c3e\u0c24\u0c43\u0c2d\u0c3e\u0c35\u0c2e\u0c41\u0c24\u0c4b \u0c35\u0c30\u0c4d\u0c24\u0c3f\u0c02\u0c2a\u0c35\u0c32\u0c2f\u0c41\u0c28\u0c41.", + "text": "Telugu ప్రతిపత్తిస్వత్వముల విషయమున మానవులెల్లరును జన్మతః స్వతంత్రులును సమానులును నగుదురు. వారు వివేచన-అంతఃకరణ సంపన్నులగుటచే పరస్పరము భ్రాతృభావముతో వర్తింపవలయును.", "metadata": { "languages": [ "tel" @@ -9770,7 +9770,7 @@ { "type": "NarrativeText", "element_id": "8947e9ec5ba76eabce3e2d1e59437be7", - "text": "Tem B\u00e1nl\u028ar\u028a\u0301\u028a \u0269r\u028a\u0301 b\u00e1a ween\u00ed na kez\u00e9\u0144b\u00ed\u00eddi g\u025b b\u0269ka b\u025bd\u025b\u0301\u025b \u0256\u0254\u0254z\u0269\u0301t\u0269 na y\u00edkow\u00e1 k\u025bg\u025b\u0301\u025b \u0256\u00e9y\u00ed-\u0256\u00e9y\u00ed g\u025b. 
B\u0254w\u025bn\u00e1 laak\u00e1r\u0269 na \u0269r\u028a\u0301t\u0269 b\u0269ka b\u0269\u0269b\u0254\u0301\u0254\u0301z\u0269 b\u0254c\u0254\u0254n\u00e1 \u0256am\u00e1 koob\u00edre c\u0254w\u028ar\u025b.", + "text": "Tem Bánlʊrʊ́ʊ ɩrʊ́ báa weení na kezéńbíídi gɛ bɩka bɛdɛ́ɛ ɖɔɔzɩ́tɩ na yíkowá kɛgɛ́ɛ ɖéyí-ɖéyí gɛ. Bɔwɛná laakárɩ na ɩrʊ́tɩ bɩka bɩɩbɔ́ɔ́zɩ bɔcɔɔná ɖamá koobíre cɔwʊrɛ.", "metadata": { "languages": [ "ces" @@ -9857,7 +9857,7 @@ { "type": "Title", "element_id": "70fb4fd148b0adc870bad4cf3a004e9e", - "text": "\u0e21\u0e19\u0e38\u0e29\u0e22\u0e4c\u0e17\u0e31\u0e49\u0e07\u0e2b\u0e25\u0e32\u0e22\u0e40\u0e01\u0e34\u0e14\u0e21\u0e32\u0e21\u0e35\u0e2d\u0e34\u0e2a\u0e23\u0e30\u0e41\u0e25\u0e30\u0e40\u0e2a\u0e21\u0e2d\u0e20\u0e32\u0e04\u0e01\u0e31\u0e19\u0e43\u0e19\u0e40\u0e01\u0e35\u0e22\u0e23\u0e15\u0e34\u0e28\u0e31\u0e01\u0e14[\u0e40\u0e01\u0e35\u0e22\u0e23\u0e15\u0e34\u0e28\u0e31\u0e01\u0e14\u0e34\u0e4c]\u0e41\u0e25\u0e30\u0e2a\u0e34\u0e17\u0e18\u0e34 \u0e15\u0e48\u0e32\u0e07\u0e21\u0e35\u0e40\u0e2b\u0e15\u0e38\u0e1c\u0e25\u0e41\u0e25\u0e30\u0e21\u0e42\u0e19\u0e18\u0e23\u0e23\u0e21 \u0e41\u0e25\u0e30\u0e04\u0e27\u0e23\u0e1b\u0e0f\u0e34\u0e1a\u0e31\u0e15\u0e34\u0e15\u0e48\u0e2d\u0e01\u0e31\u0e19\u0e14\u0e49\u0e27\u0e22\u0e40\u0e08\u0e15\u0e19\u0e32\u0e23\u0e21\u0e13\u0e4c\u0e41\u0e2b\u0e48\u0e07\u0e20\u0e23\u0e32\u0e14\u0e23\u0e20\u0e32\u0e1e", + "text": "มนุษย์ทั้งหลายเกิดมามีอิสระและเสมอภาคกันในเกียรติศักด[เกียรติศักดิ์]และสิทธิ ต่างมีเหตุผลและมโนธรรม และควรปฏิบัติต่อกันด้วยเจตนารมณ์แห่งภราดรภาพ", "metadata": { "languages": [ "tha" @@ -9899,7 +9899,7 @@ { "type": "Title", "element_id": "a4b136507e5ed6666129c7a44794fd18", - "text": 
"\u0e21\u0e19\u0e38\u0e29\u0e22\u0e4c\u0e17\u0e31\u0e49\u0e07\u0e1b\u0e27\u0e07\u0e40\u0e01\u0e34\u0e14\u0e21\u0e32\u0e21\u0e35\u0e2d\u0e34\u0e2a\u0e23\u0e30\u0e41\u0e25\u0e30\u0e40\u0e2a\u0e21\u0e2d\u0e20\u0e32\u0e04\u0e01\u0e31\u0e19\u0e43\u0e19\u0e28\u0e31\u0e01\u0e14\u0e34\u0e4c\u0e28\u0e23\u0e35\u0e41\u0e25\u0e30\u0e2a\u0e34\u0e17\u0e18\u0e34 \u0e15\u0e48\u0e32\u0e07\u0e43\u0e19\u0e15\u0e19\u0e21\u0e35\u0e40\u0e2b\u0e15\u0e38\u0e1c\u0e25\u0e41\u0e25\u0e30\u0e21\u0e42\u0e19\u0e18\u0e23\u0e23\u0e21 \u0e41\u0e25\u0e30\u0e04\u0e27\u0e23\u0e1b\u0e0f\u0e34\u0e1a\u0e31\u0e15\u0e34\u0e15\u0e48\u0e2d\u0e01\u0e31\u0e19\u0e14\u0e49\u0e27\u0e22\u0e08\u0e34\u0e15\u0e27\u0e34\u0e0d\u0e0d\u0e32\u0e13\u0e41\u0e2b\u0e48\u0e07\u0e20\u0e23\u0e32\u0e14\u0e23\u0e20\u0e32\u0e1e", + "text": "มนุษย์ทั้งปวงเกิดมามีอิสระและเสมอภาคกันในศักดิ์ศรีและสิทธิ ต่างในตนมีเหตุผลและมโนธรรม และควรปฏิบัติต่อกันด้วยจิตวิญญาณแห่งภราดรภาพ", "metadata": { "languages": [ "tha" @@ -9920,7 +9920,7 @@ { "type": "NarrativeText", "element_id": "8f52798dd21c8472bda701088f7e82ca", - "text": "Themne A kom a\u014bf\u0259m ak\u0259pet b\u025b \u014ba ath\u0259n\u028cn\u025b yi r\u028cwankom. \u0186wa a\u014b ba m\u0259mari m\u0259th\u0259n\u028cn\u025b. \u0186wa a\u014b ba m\u0259fith yi t\u0259chemp. Chiya\u014b, a\u014b yi t\u0259k\u0259 gbasi a\u014bkos \u014ba\u014b m\u0254 k\u0259pa \u014ba t\u0259kom.", + "text": "Themne A kom aŋfəm akəpet bɛ ŋa athənʌnɛ yi rʌwankom. Ɔwa aŋ ba məmari məthənʌnɛ. Ɔwa aŋ ba məfith yi təchemp. 
Chiyaŋ, aŋ yi təkə gbasi aŋkos ŋaŋ mɔ kəpa ŋa təkom.", "metadata": { "languages": [ "swa", @@ -9963,7 +9963,7 @@ { "type": "Title", "element_id": "9ff7c25da02c27eefccdaca502af53c1", - "text": "\u0f60\u0f42\u0fb2\u0f7c\u0f0b\u0f56\u0f0b\u0f58\u0f72\u0f60\u0f72\u0f0b\u0f62\u0f72\u0f42\u0f66\u0f0b\u0f62\u0f92\u0fb1\u0f74\u0f51\u0f0b\u0f61\u0f7c\u0f44\u0f66\u0f0b\u0f63\u0f0b\u0f66\u0f90\u0fb1\u0f7a\u0f66\u0f0b\u0f59\u0f58\u0f0b\u0f49\u0f72\u0f51\u0f0b\u0f53\u0f66\u0f0b\u0f46\u0f7a\u0f0b\u0f58\u0f50\u0f7c\u0f44\u0f66\u0f0b\u0f51\u0f44\u0f0c\u0f0d \u0f50\u0f7c\u0f56\u0f0b\u0f50\u0f44\u0f42\u0f72\u0f0b\u0f62\u0f44\u0f0b\u0f51\u0f56\u0f44\u0f0b\u0f60\u0f51\u0fb2\u0f0b\u0f58\u0f49\u0f58\u0f0b\u0f51\u0f74\u0f0b\u0f61\u0f7c\u0f51\u0f0b\u0f63\u0f0d \u0f41\u0f7c\u0f44\u0f0b\u0f5a\u0f7c\u0f62\u0f0b\u0f62\u0f44\u0f0b\u0f56\u0fb1\u0f74\u0f44\u0f0b\u0f42\u0f72\u0f0b\u0f56\u0fb3\u0f7c\u0f0b\u0f62\u0fa9\u0f63\u0f0b\u0f51\u0f44\u0f0b\u0f56\u0f66\u0f58\u0f0b\u0f5a\u0f74\u0f63\u0f0b\u0f56\u0f5f\u0f44\u0f0b\u0f54\u0f7c\u0f0b\u0f60\u0f51\u0f7c\u0f53\u0f0b\u0f54\u0f60\u0f72\u0f0b\u0f60\u0f7c\u0f66\u0f0b\u0f56\u0f56\u0f66\u0f0b\u0f40\u0fb1\u0f44\u0f0b\u0f61\u0f7c\u0f51\u0f0d \u0f51\u0f7a\u0f0b\u0f56\u0f5e\u0f72\u0f53\u0f0b\u0f55\u0f53\u0f0b\u0f5a\u0f74\u0f53\u0f0b\u0f42\u0f45\u0f72\u0f42\u0f0b\u0f42\u0f72\u0f66\u0f0b\u0f42\u0f45\u0f72\u0f42\u0f0b\u0f63\u0f0b\u0f56\u0f74\u0f0b\u0f66\u0fa4\u0f74\u0f53\u0f0b\u0f42\u0fb1\u0f72\u0f0b\u0f60\u0f51\u0f74\u0f0b\u0f64\u0f7a\u0f66\u0f0b\u0f60\u0f5b\u0f72\u0f53\u0f0b\u0f54\u0f60\u0f72\u0f0b\u0f56\u0fb1\u0f0b\u0f66\u0fa4\u0fb1\u0f7c\u0f51\u0f0b\u0f40\u0fb1\u0f44\u0f0b\u0f63\u0f42\u0f0b\u0f63\u0f7a\u0f53\u0f0b\u0f56\u0f66\u0f9f\u0f62\u0f0b\u0f51\u0f42\u0f7c\u0f66\u0f0b\u0f54\u0f0b\u0f61\u0f72\u0f53\u0f0e", + "text": "འགྲོ་བ་མིའི་རིགས་རྒྱུད་ཡོངས་ལ་སྐྱེས་ཙམ་ཉིད་ནས་ཆེ་མཐོངས་དང༌། ཐོབ་ཐངགི་རང་དབང་འདྲ་མཉམ་དུ་ཡོད་ལ། ཁོང་ཚོར་རང་བྱུང་གི་བློ་རྩལ་དང་བསམ་ཚུལ་བཟང་པོ་འདོན་པའི་འོས་བབས་ཀྱང་ཡོད། 
དེ་བཞིན་ཕན་ཚུན་གཅིག་གིས་གཅིག་ལ་བུ་སྤུན་གྱི་འདུ་ཤེས་འཛིན་པའི་བྱ་སྤྱོད་ཀྱང་ལག་ལེན་བསྟར་དགོས་པ་ཡིན༎", "metadata": { "filetype": "text/plain", "data_source": { @@ -9981,7 +9981,7 @@ { "type": "NarrativeText", "element_id": "8af88623529d7fac1f9e181cf1759b64", - "text": "Ticuna Ng\u1ebdxguma nabuxgu i du\u00fc\u0303x\u00fc\u0303g\u00fc r\u00fc gux\u00fc\u0303ma naw\u00fcxigu, r\u00fc tataxuma ya tex\u00e9 ya tog\u00fcar\u00fc yexera ix\u0129s\u1ebd. R\u00fc gux\u00fc\u0303ma nax\u00e3\u00e3\u1ebdg\u00fc r\u00fc ng\u1ebdmaca\u0331x r\u00fc name nix\u0129 na n\u00fcg\u00fcma\u00e3 namec\u00fcmax\u00fc\u0303 \u0129 gux\u00fc\u0303ma \u0129 du\u00fc\u0303x\u00fc\u0303g\u00fc.", + "text": "Ticuna Ngẽxguma nabuxgu i duü̃xü̃gü rü guxü̃ma nawüxigu, rü tataxuma ya texé ya togüarü yexera ixĩsẽ. Rü guxü̃ma naxããẽgü rü ngẽmaca̱x rü name nixĩ na nügümaã namecümaxü̃ ĩ guxü̃ma ĩ duü̃xü̃gü.", "metadata": { "languages": [ "tur", @@ -10005,7 +10005,7 @@ { "type": "UncategorizedText", "element_id": "3a1e54e52c1e8f2960b9f52ba81d5b61", - "text": "Tigrigna \u1265\u1218\u1295\u1345\u122d \u12ad\u1265\u122d\u1295 \u1218\u1230\u120d\u1295 \u12a9\u120e\u121d \u1230\u1263\u1275 \u12a5\u1295\u1275\u12cd\u1208\u12f1 \u1290\u1343\u1295 \u121b\u12d5\u122a\u1295 \u12a5\u12ee\u121d\u1361\u1361 \u121d\u1235\u1275\u12cd\u12d3\u120d\u1295 \u1215\u120d\u1293\u1295 \u12dd\u1270\u12d3\u12f0\u120e\u121d \u1265\u121d\u12c3\u1296\u121d \u1295\u1215\u12f5\u1215\u12f6\u121d \u1265\u1215\u12cd\u1290\u1273\u12ca \u1218\u1295\u1348\u1235 \u12ad\u1270\u1213\u120b\u1208\u12e9 \u12a6\u1208\u12ce\u121d\u1361\u1361", + "text": "Tigrigna ብመንፅር ክብርን መሰልን ኩሎም ሰባት እንትውለዱ ነፃን ማዕሪን እዮም፡፡ ምስትውዓልን ሕልናን ዝተዓደሎም ብምዃኖም ንሕድሕዶም ብሕውነታዊ መንፈስ ክተሓላለዩ ኦለዎም፡፡", "metadata": { "filetype": "text/plain", "data_source": { @@ -10045,7 +10045,7 @@ { "type": "NarrativeText", "element_id": "dce66eb1491ee0e05782cd7b4060bdf1", - "text": "Toba 'Enauac na naaxat shi\u1ef9axauapi na mayipi huesochiguii qataq 'eeta'a't da l'amaqchic qataq da 'enec qataq 
\u1ef9ataqta \u1ef9a\u1ef9ate'n naua lataxaco qataq nua no'o'n nvil\u1ef9axaco, qaq \u1ef9oqo'oyi iuen da i 'oonolec \u1ef9ataqta itauan ichoxoden ca l\u1ef9a", + "text": "Toba 'Enauac na naaxat shiỹaxauapi na mayipi huesochiguii qataq 'eeta'a't da l'amaqchic qataq da 'enec qataq ỹataqta ỹaỹate'n naua lataxaco qataq nua no'o'n nvilỹaxaco, qaq ỹoqo'oyi iuen da i 'oonolec ỹataqta itauan ichoxoden ca lỹa", "metadata": { "languages": [ "som", @@ -10067,7 +10067,7 @@ { "type": "NarrativeText", "element_id": "d4b675c94f0bd52682c828f5060488a5", - "text": "Tojolabal Spetsanal ja swinkil ja lu\u2019um k\u2019inali junxta wax jul schonjel, sok ja sijpanub\u2019ali, ja yuj ojni b\u2019ob\u2019 sk\u2019u\u2019luk ja jas sk\u2019ana-i ja b\u2019as lekilali, ja yuj ja ay sk\u2019ujoli sok ay spensari t\u2019ilan oj yilsb\u2019aje lek sok ja smoj jumasa.", + "text": "Tojolabal Spetsanal ja swinkil ja lu’um k’inali junxta wax jul schonjel, sok ja sijpanub’ali, ja yuj ojni b’ob’ sk’u’luk ja jas sk’ana-i ja b’as lekilali, ja yuj ja ay sk’ujoli sok ay spensari t’ilan oj yilsb’aje lek sok ja smoj jumasa.", "metadata": { "languages": [ "slv", @@ -10135,7 +10135,7 @@ { "type": "NarrativeText", "element_id": "11c1506a0e4eb0a3616787ebc32828da", - "text": "Tongan Ko e kotoa \u2018o ha\u2019a tangata \u2018oku fanau\u2019i mai \u2018oku tau\u2019ataina pea tatau \u2018i he ngeia mo e ngaahi totonu. Na\u2019e fakanaunau\u2019i kinautolu \u2018aki \u2018a e \u2018atamai mo e konisenisi pea \u2018oku totonu ke nau feohi \u2018i he laumalie \u2018o e nofo fakatautehina.", + "text": "Tongan Ko e kotoa ‘o ha’a tangata ‘oku fanau’i mai ‘oku tau’ataina pea tatau ‘i he ngeia mo e ngaahi totonu. 
Na’e fakanaunau’i kinautolu ‘aki ‘a e ‘atamai mo e konisenisi pea ‘oku totonu ke nau feohi ‘i he laumalie ‘o e nofo fakatautehina.", "metadata": { "languages": [ "swa", @@ -10180,7 +10180,7 @@ { "type": "NarrativeText", "element_id": "49ac7c418a1a33c64e2c3e228669acea", - "text": "Tsonga (Mozambique) Vanhu hin'kwavu va psaliwili na va khululek\u00ecle, funthsi va fana hi lisima ni tinfaneno. V\u00e0 psaliwili ni nyiko ya ku pimisa ni ku yehleketa; hi kolahu, va fanela ku hanya hi moya wa umb\u00eclu ni unghani.", + "text": "Tsonga (Mozambique) Vanhu hin'kwavu va psaliwili na va khululekìle, funthsi va fana hi lisima ni tinfaneno. Và psaliwili ni nyiko ya ku pimisa ni ku yehleketa; hi kolahu, va fanela ku hanya hi moya wa umbìlu ni unghani.", "metadata": { "languages": [ "swa" @@ -10245,7 +10245,7 @@ { "type": "NarrativeText", "element_id": "3ecfed863a5eed35ac7bcdc4f1ebcf6d", - "text": "Turkish B\u00fct\u00fcn insanlar h\u00fcr, haysiyet ve haklar bak\u0131m\u0131ndan e\u015fit do\u011farlar. Ak\u0131l ve vicdana sahiptirler ve birbirlerine kar\u015f\u0131 karde\u015flik zihniyeti ile hareket etmelidirler.", + "text": "Turkish Bütün insanlar hür, haysiyet ve haklar bakımından eşit doğarlar. Akıl ve vicdana sahiptirler ve birbirlerine karşı kardeşlik zihniyeti ile hareket etmelidirler.", "metadata": { "languages": [ "tur" @@ -10266,7 +10266,7 @@ { "type": "NarrativeText", "element_id": "ec6b4429d4b16c9725f0f1420314a928", - "text": "Turkmen (Cyrillic) \u0425\u0435\u043c\u043c\u0435 \u0430\u0434\u0430\u043c\u043b\u0430\u0440 \u04e9\u0437 \u043c\u0435\u0440\u0442\u0435\u0431\u0435\u0441\u0438 \u0432\u0435 \u0445\u0443\u043a\u0443\u043a\u043b\u0430\u0440\u044b \u0431\u043e\u044e\u043d\u0447\u0430 \u0434\u0435\u04a3 \u044f\u0433\u0434\u0430\u0439\u0434\u0430 \u0434\u04af\u043d\u0439\u04d9 \u0438\u043d\u0439\u04d9\u0440\u043b\u0435\u0440. 
\u041e\u043b\u0430\u0440\u0430 \u0430\u04a3 \u0445\u0435\u043c \u0432\u044b\u0497\u0434\u0430\u043d \u0431\u0435\u0440\u043b\u0435\u043d\u0434\u0438\u0440 \u0432\u0435 \u043e\u043b\u0430\u0440 \u0431\u0438\u0440\u2010\u0431\u0438\u0440\u043b\u0435\u0440\u0438 \u0431\u0438\u043b\u0435\u043d \u0434\u043e\u0433\u0430\u043d\u043b\u044b\u043a \u0440\u0443\u0445\u0443\u043d\u0434\u0430\u043a\u044b \u0433\u0430\u0440\u0430\u0439\u044b\u0448\u0434\u0430 \u0431\u043e\u043b\u043c\u0430\u043b\u044b\u0434\u044b\u0440\u043b\u0430\u0440.", + "text": "Turkmen (Cyrillic) Хемме адамлар өз мертебеси ве хукуклары боюнча дең ягдайда дүнйә инйәрлер. Олара аң хем выҗдан берлендир ве олар бир‐бирлери билен доганлык рухундакы гарайышда болмалыдырлар.", "metadata": { "languages": [ "rus" @@ -10287,7 +10287,7 @@ { "type": "NarrativeText", "element_id": "27683edb29bca811bea3008052c0fc9f", - "text": "Turkmen (Latin) Adamlary\u0148 hemmesi azat dogul\u00fdarlar we \u00f6z mertebesi hem\u2010de hukuklary bo\u00fdun\u00e7a ilkiba\u015fdan de\u0148dirler. Olara ozal\u2010ba\u015fdan a\u0148, ynsap berlendir we biri\u2010birine \u00f6zara doganlyk ruhunda \u00e7emele\u015fmek olary\u0148 \u00fdara\u015fygydyr.", + "text": "Turkmen (Latin) Adamlaryň hemmesi azat dogulýarlar we öz mertebesi hem‐de hukuklary boýunça ilkibaşdan deňdirler. Olara ozal‐başdan aň, ynsap berlendir we biri‐birine özara doganlyk ruhunda çemeleşmek olaryň ýaraşygydyr.", "metadata": { "languages": [ "tur" @@ -10308,7 +10308,7 @@ { "type": "NarrativeText", "element_id": "6b9f05c9e0fdf0e6de36b54f1c82f5d0", - "text": "Tuva \u0411\u04af\u0433\u04af \u043a\u0438\u0436\u0438\u043b\u0435\u0440 \u0445\u043e\u0441\u0442\u0443\u0433 \u0431\u0430\u0437\u0430 \u043c\u04e9\u0437\u04af\u0437\u04af \u0431\u043e\u043b\u0433\u0430\u0448 \u044d\u0440\u0433\u0435\u043b\u0435\u0440\u0438 \u0434\u0435\u04a3 \u043a\u044b\u043b\u0434\u044b\u0440 \u0442\u04e9\u0440\u04af\u0442\u0442\u04af\u043d\u0435\u0440. 
\u041e\u043b\u0430\u0440\u0433\u0430 \u0443\u0433\u0430\u0430\u043d\u0441\u0430\u0440\u044b\u044b\u043b \u0431\u043e\u043b\u0433\u0430\u0448 \u0430\u0440\u044b\u043d-\u043d\u04af\u04af\u0440 \u0431\u0435\u0440\u0434\u0438\u043d\u0433\u0435\u043d \u0431\u043e\u043b\u0443\u0440 \u0431\u043e\u043b\u0433\u0430\u0448 \u043e\u043b\u0430\u0440 \u0431\u043e\u0442-\u0431\u043e\u0442\u0442\u0430\u0440\u044b\u043d\u0433\u0430 \u0430\u043a\u044b-\u0434\u0443\u04a3\u043c\u0430\u043b\u044b\u0448\u043a\u044b \u0445\u0430\u043c\u0430\u0430\u0440\u044b\u043b\u0433\u0430\u043d\u044b \u043a\u04e9\u0440\u0433\u04af\u0437\u0435\u0440 \u0443\u0436\u0443\u0440\u043b\u0443\u0433.", + "text": "Tuva Бүгү кижилер хостуг база мөзүзү болгаш эргелери дең кылдыр төрүттүнер. Оларга угаансарыыл болгаш арын-нүүр бердинген болур болгаш олар бот-боттарынга акы-дуңмалышкы хамаарылганы көргүзер ужурлуг.", "metadata": { "languages": [ "rus" @@ -10329,7 +10329,7 @@ { "type": "NarrativeText", "element_id": "527f7d8b2d19b7c6c3f2fadc70ada262", - "text": "Twi (Akuapem) W\u0254awo adesamma nyinaa s\u025b nnipa a w\u0254w\u0254 ahofadi. W\u0254n nyinaa w\u0254 nidi ne ky\u025bfa koro. W\u0254w\u0254 adwene ne ahonim, na \u025bs\u025b s\u025b wobu w\u0254n ho w\u0254n ho s\u025b anuanom.", + "text": "Twi (Akuapem) Wɔawo adesamma nyinaa sɛ nnipa a wɔwɔ ahofadi. Wɔn nyinaa wɔ nidi ne kyɛfa koro. Wɔwɔ adwene ne ahonim, na ɛsɛ sɛ wobu wɔn ho wɔn ho sɛ anuanom.", "metadata": { "languages": [ "swa", @@ -10351,7 +10351,7 @@ { "type": "NarrativeText", "element_id": "aefbdde1da2ecc73208751b3c330bb3e", - "text": "Twi (Asante) Nnipa nyinaa y\u025b p\u025b. Na w\u0254de adwene ne nyansa na ab\u0254 obiara. \u0190no nti, \u025bs\u025b s\u025b obiara d\u0254 ne y\u0254nko, bu ne y\u0254nko, di ne y\u0254nko ni.", + "text": "Twi (Asante) Nnipa nyinaa yɛ pɛ. Na wɔde adwene ne nyansa na abɔ obiara. 
Ɛno nti, ɛsɛ sɛ obiara dɔ ne yɔnko, bu ne yɔnko, di ne yɔnko ni.", "metadata": { "languages": [ "swa", @@ -10373,7 +10373,7 @@ { "type": "NarrativeText", "element_id": "4b0bd8eaae3f12feed9188c010027eb7", - "text": "Tzeltal, Oxchuc Spisil winiketik te ya xbejk\u00b4ajik ta k\u00b4inalil ay jrerechotik, mayuk mach\u00b4a chukul ya xbejka, ya jnatik stojol te jpisiltik ay snopibal sok sbijil joltik, ja\u00b4 me k\u00b4ux ya kaibatik ta jujun tul.", + "text": "Tzeltal, Oxchuc Spisil winiketik te ya xbejk´ajik ta k´inalil ay jrerechotik, mayuk mach´a chukul ya xbejka, ya jnatik stojol te jpisiltik ay snopibal sok sbijil joltik, ja´ me k´ux ya kaibatik ta jujun tul.", "metadata": { "languages": [ "ind", @@ -10395,7 +10395,7 @@ { "type": "NarrativeText", "element_id": "3a1d8b7b6302ae4de3c1c05a5c4f8fc7", - "text": "Tzotzil (Chamula) Skotol vinik o ants ta spejel balumile k\u2019olem x-hayan i ko\u2019ol ta sch\u2019ulal i sderechoetik i, skotol k\u2019ux-elan oyike oy srasonik y slekilalik, sventa skuxijik lekn\u00f3o ta ju jun ju ju vo.", + "text": "Tzotzil (Chamula) Skotol vinik o ants ta spejel balumile k’olem x-hayan i ko’ol ta sch’ulal i sderechoetik i, skotol k’ux-elan oyike oy srasonik y slekilalik, sventa skuxijik leknóo ta ju jun ju ju vo.", "metadata": { "languages": [ "hrv", @@ -10418,7 +10418,7 @@ { "type": "NarrativeText", "element_id": "9c8ce1a1d4b031909f2b8d5c31bc3084", - "text": "Uduk Aris \u2019kwaniny\u2019ceshi \u2019baar mo dho\u2019thkunu \u2019ba\u1e35any mo dhali mmomiiya \u1e6fu\u2019c imon\u1e6fal \u2019de/ mo dhali mii ma \u1e35ar/e mo. Uni mini ta gi gwo mo dhali mii mo dhali uni mini mii ka karambuye/ \u2019kup\u0331 ki cin tiya mo e shi/in mo dhali mii kun tanu ikam mo.", + "text": "Uduk Aris ’kwaniny’ceshi ’baar mo dho’thkunu ’baḵany mo dhali mmomiiya ṯu’c imonṯal ’de/ mo dhali mii ma ḵar/e mo. 
Uni mini ta gi gwo mo dhali mii mo dhali uni mini mii ka karambuye/ ’kup̱ ki cin tiya mo e shi/in mo dhali mii kun tanu ikam mo.", "metadata": { "languages": [ "swa", @@ -10440,7 +10440,7 @@ { "type": "NarrativeText", "element_id": "35ad852b028b17863397cd23a741e776", - "text": "Ukrainian \u0412\u0441\u0456 \u043b\u044e\u0434\u0438 \u043d\u0430\u0440\u043e\u0434\u0436\u0443\u044e\u0442\u044c\u0441\u044f \u0432\u0456\u043b\u044c\u043d\u0438\u043c\u0438 \u0456 \u0440\u0456\u0432\u043d\u0438\u043c\u0438 \u0443 \u0441\u0432\u043e\u0457\u0439 \u0433\u0456\u0434\u043d\u043e\u0441\u0442\u0456 \u0442\u0430 \u043f\u0440\u0430\u0432\u0430\u0445. \u0412\u043e\u043d\u0438 \u043d\u0430\u0434\u0456\u043b\u0435\u043d\u0456 \u0440\u043e\u0437\u0443\u043c\u043e\u043c \u0456 \u0441\u043e\u0432\u0456\u0441\u0442\u044e \u0456 \u043f\u043e\u0432\u0438\u043d\u043d\u0456 \u0434\u0456\u044f\u0442\u0438 \u0443 \u0432\u0456\u0434\u043d\u043e\u0448\u0435\u043d\u043d\u0456 \u043e\u0434\u0438\u043d \u0434\u043e \u043e\u0434\u043d\u043e\u0433\u043e \u0432 \u0434\u0443\u0441\u0456 \u0431\u0440\u0430\u0442\u0435\u0440\u0441\u0442\u0432\u0430.", + "text": "Ukrainian Всі люди народжуються вільними і рівними у своїй гідності та правах. Вони наділені розумом і совістю і повинні діяти у відношенні один до одного в дусі братерства.", "metadata": { "languages": [ "ukr" @@ -10461,7 +10461,7 @@ { "type": "NarrativeText", "element_id": "2da70f2c0e7850d3cb64606cb0479fc9", - "text": "Umbundu Omanu vosi vacitiwa valipwa kwenda valisoka kovina vyosikwenda komoko. Ovo vakwete esunga kwenda, kwenda olondunge kwenje ovo vat\u00eala okuliteywila kuvamwe kwenda vakwavo vesokolwilo lyocisola.", + "text": "Umbundu Omanu vosi vacitiwa valipwa kwenda valisoka kovina vyosikwenda komoko. 
Ovo vakwete esunga kwenda, kwenda olondunge kwenje ovo vatêla okuliteywila kuvamwe kwenda vakwavo vesokolwilo lyocisola.", "metadata": { "languages": [ "swa", @@ -10529,7 +10529,7 @@ { "type": "UncategorizedText", "element_id": "17e2b5b5c80c984c98843bbed39884c4", - "text": "Urdu \u062a\u0645\u0627\u0645 \u0627\u0646\u0633\u0627\u0646 \u0622\u0632\u0627\u062f \u0627\u0648\u0631 \u062d\u0642\u0648\u0642 \u0648 \u0639\u0632\u062a \u06a9\u06d2 \u0627\u0639\u062a\u0628\u0627\u0631 \u0633\u06d2 \u0628\u0631\u0627\u0628\u0631 \u067e\u06cc\u062f\u0627 \u06c1\u0648\u0626\u06d2 \u06c1\u06cc\u06ba\u06d4 \u0627\u0646\u06c1\u06cc\u06ba \u0636\u0645\u06cc\u0631 \u0627\u0648\u0631 \u0639\u0642\u0644 \u0648\u062f\u06cc\u0639\u062a \u06c1\u0648\u0626\u06cc \u06c1\u06d2\u06d4 \u0627\u0633 \u0644\u0626\u06d2 \u0627\u0646\u06c1\u06cc\u06ba \u0627\u06cc\u06a9 \u062f\u0648\u0633\u0631\u06d2 \u06a9\u06d2 \u0633\u0627\u062a\u06be \u0628\u06be\u0627\u0626\u06cc \u0686\u0627\u0631\u06d2 \u06a9\u0627 \u0633\u0644\u0648\u06a9 \u06a9\u0631\u0646\u0627 \u0686\u0627\u06c1\u06cc\u0626\u06d2\u06d4", + "text": "Urdu تمام انسان آزاد اور حقوق و عزت کے اعتبار سے برابر پیدا ہوئے ہیں۔ انہیں ضمیر اور عقل ودیعت ہوئی ہے۔ اس لئے انہیں ایک دوسرے کے ساتھ بھائی چارے کا سلوک کرنا چاہیئے۔", "metadata": { "languages": [ "urd" @@ -10550,7 +10550,7 @@ { "type": "UncategorizedText", "element_id": "64062747e4a49e81a0ff7fe76c935f92", - "text": "Urdu (2) \u062a\u0645\u0627\u0645 \u0627\u0646\u0633\u0627\u0646 \u0622\u0632\u0627\u062f \u0627\u0648\u0631 \u062d\u0642\u0648\u0642 \u0648 \u0639\u0632\u062a \u06a9\u06d2 \u0627\u0639\u062a\u0628\u0627\u0631 \u0633\u06d2 \u0628\u0631\u0627\u0628\u0631 \u067e\u06cc\u062f\u0627 \u06c1\u0648\u0626\u06d2 \u06c1\u06cc\u06ba\u06d4 \u0627\u0646\u06c1\u06cc\u06ba \u0636\u0645\u06cc\u0631 \u0627\u0648\u0631 \u0639\u0642\u0644 \u0648\u062f\u06cc\u0639\u062a \u06c1\u0648\u0626\u06cc \u06c1\u06d2\u06d4 \u0627\u0633 \u0644\u06cc\u06d2 \u0627\u0646\u06c1\u06cc\u06ba \u0627\u06cc\u06a9 
\u062f\u0648\u0633\u0631\u06d2 \u06a9\u06d2 \u0633\u0627\u062a\u06be \u0628\u06be\u0627\u0626\u06cc \u0686\u0627\u0631\u06d2 \u06a9\u0627 \u0633\u0644\u0648\u06a9 \u06a9\u0631\u0646\u0627 \u0686\u0627\u06c1\u06cc\u06d2\u06d4", + "text": "Urdu (2) تمام انسان آزاد اور حقوق و عزت کے اعتبار سے برابر پیدا ہوئے ہیں۔ انہیں ضمیر اور عقل ودیعت ہوئی ہے۔ اس لیے انہیں ایک دوسرے کے ساتھ بھائی چارے کا سلوک کرنا چاہیے۔", "metadata": { "languages": [ "urd" @@ -10571,7 +10571,7 @@ { "type": "NarrativeText", "element_id": "c0f369076ccc7b4f6949b46f78e9c721", - "text": "Uyghur (Arabic) \u06be\u06d5\u0645\u0645\u06d5 \u0626\u0627\u062f\u06d5\u0645 \u0632\u0627\u0646\u0649\u062f\u0649\u0646\u0644\u0627 \u0626\u06d5\u0631\u0643\u0649\u0646\u060c \u0626\u0649\u0632\u0632\u06d5\u062a-\u06be\u06c6\u0631\u0645\u06d5\u062a \u06cb\u06d5 \u06be\u0648\u0642\u06c7\u0642\u062a\u0627 \u0628\u0627\u067e\u0628\u0627\u0631\u0627\u06cb\u06d5\u0631 \u0628\u0648\u0644\u06c7\u067e \u062a\u06c7\u063a\u06c7\u0644\u063a\u0627\u0646. \u0626\u06c7\u0644\u0627\u0631 \u0626\u06d5\u0642\u0649\u0644\u063a\u06d5 \u06cb\u06d5 \u06cb\u0649\u062c\u062f\u0627\u0646\u063a\u0627 \u0626\u0649\u06af\u06d5 \u06be\u06d5\u0645\u062f\u06d5 \u0628\u0649\u0631-\u0628\u0649\u0631\u0649\u06af\u06d5 \u0642\u06d0\u0631\u0649\u0646\u062f\u0627\u0634\u0644\u0649\u0642 \u0645\u06c7\u0646\u0627\u0633\u0649\u06cb\u0649\u062a\u0649\u06af\u06d5 \u062e\u0627\u0633 \u0631\u0648\u06be \u0628\u0649\u0644\u06d5\u0646 \u0645\u0648\u0626\u0627\u0645\u0649\u0644\u06d5 \u0642\u0649\u0644\u0649\u0634\u0649 \u0643\u06d0\u0631\u06d5\u0643.", + "text": "Uyghur (Arabic) ھەممە ئادەم زانىدىنلا ئەركىن، ئىززەت-ھۆرمەت ۋە ھوقۇقتا باپباراۋەر بولۇپ تۇغۇلغان. 
ئۇلار ئەقىلغە ۋە ۋىجدانغا ئىگە ھەمدە بىر-بىرىگە قېرىنداشلىق مۇناسىۋىتىگە خاس روھ بىلەن موئامىلە قىلىشى كېرەك.", "metadata": { "languages": [ "ara" @@ -10592,7 +10592,7 @@ { "type": "NarrativeText", "element_id": "c9695addaae400cf93180490aae4c5b8", - "text": "Uyghur (Latin) hemme adem zatidinla erkin, izzet-h\u00f6rmet we hoquqta babbarawer bolup tughulghan. ular eqilghe we wijdan'gha ige hemde bir-birige q\u00e9rindashliq munasiwitige xas roh bilen muamile qilishi k\u00e9rek.", + "text": "Uyghur (Latin) hemme adem zatidinla erkin, izzet-hörmet we hoquqta babbarawer bolup tughulghan. ular eqilghe we wijdan'gha ige hemde bir-birige qérindashliq munasiwitige xas roh bilen muamile qilishi kérek.", "metadata": { "languages": [ "nld", @@ -10616,7 +10616,7 @@ { "type": "NarrativeText", "element_id": "cf037543ae7e29089220134bd8d9fc80", - "text": "Uzbek, Northern (Cyrillic) \u0411\u0430\u0440\u0447\u0430 \u043e\u0434\u0430\u043c\u043b\u0430\u0440 \u044d\u0440\u043a\u0438\u043d, \u049b\u0430\u0434\u0440\u2010\u049b\u0438\u043c\u043c\u0430\u0442 \u0432\u0430 \u04b3\u0443\u049b\u0443\u049b\u043b\u0430\u0440\u0434\u0430 \u0442\u0435\u043d\u0433 \u0431\u045e\u043b\u0438\u0431 \u0442\u0443\u0493\u0438\u043b\u0430\u0434\u0438\u043b\u0430\u0440. \u0423\u043b\u0430\u0440 \u0430\u049b\u043b \u0432\u0430 \u0432\u0438\u0436\u0434\u043e\u043d \u0441\u043e\u04b3\u0438\u0431\u0438\u0434\u0438\u0440\u043b\u0430\u0440 \u0432\u0430 \u0431\u0438\u0440\u2010\u0431\u0438\u0440\u043b\u0430\u0440\u0438\u0433\u0430 \u0431\u0438\u0440\u043e\u0434\u0430\u0440\u043b\u0430\u0440\u0447\u0430 \u043c\u0443\u043e\u043c\u0430\u043b\u0430 \u049b\u0438\u043b\u0438\u0448\u043b\u0430\u0440\u0438 \u0437\u0430\u0440\u0443\u0440.", + "text": "Uzbek, Northern (Cyrillic) Барча одамлар эркин, қадр‐қиммат ва ҳуқуқларда тенг бўлиб туғиладилар. 
Улар ақл ва виждон соҳибидирлар ва бир‐бирларига биродарларча муомала қилишлари зарур.", "metadata": { "languages": [ "mkd" @@ -10637,7 +10637,7 @@ { "type": "NarrativeText", "element_id": "f96f007fae71f3dbb5cf107a67339f62", - "text": "Uzbek, Northern (Latin) Barcha odamlar erkin, qadr\u2010qimmat va huquqlarda teng bo\u02bblib tug\u02bbiladilar. Ular aql va vijdon sohibidirlar va bir\u2010birlariga birodarlarcha muomala qilishlari zarur.", + "text": "Uzbek, Northern (Latin) Barcha odamlar erkin, qadr‐qimmat va huquqlarda teng boʻlib tugʻiladilar. Ular aql va vijdon sohibidirlar va bir‐birlariga birodarlarcha muomala qilishlari zarur.", "metadata": { "languages": [ "tur", @@ -10659,7 +10659,7 @@ { "type": "NarrativeText", "element_id": "4309a801882998d4a87ec4393c62eb5b", - "text": "Vai \ua549\ua55c\ua56e \ua514\ua60b \ua5b8 \ua530 \ua5cb\ua60b \ua56e\ua568 \ua514\ua60b \ua5b8 \ua54e \ua549\ua5b8\ua54a \ua574\ua583 \ua543\ua524\ua602 \ua5f1, \ua549\ua5b7 \ua5ea\ua5e1 \ua53b\ua524 \ua5cf\ua5d2\ua5e1 \ua54e \ua5ea \ua549\ua5b8\ua54a \ua58f\ua54e. \ua549\ua561 \ua58f \ua5f3\ua56e\ua54a \ua5cf \ua56a \ua5d3 \ua549\ua5b7 \ua549\ua5b8 \ua558\ua55e \ua5ea. \ua58f\ua5b7 \ua549\ua5b8\ua527 \ua58f \ua5b8 \ua55a\ua54c\ua602 \ua5f7\ua524 \ua55e \ua603\ua5b7 \ua609\ua527 \ua5e0\ua5bb \ua55e \ua5b4\ua60b \ua533\ua569 \ua549\ua5b8 \ua5f3.", + "text": "Vai ꕉꕜꕮ ꔔꘋ ꖸ ꔰ ꗋꘋ ꕮꕨ ꔔꘋ ꖸ ꕎ ꕉꖸꕊ ꕴꖃ ꕃꔤꘂ ꗱ, ꕉꖷ ꗪꗡ ꔻꔤ ꗏꗒꗡ ꕎ ꗪ ꕉꖸꕊ ꖏꕎ. ꕉꕡ ꖏ ꗳꕮꕊ ꗏ ꕪ ꗓ ꕉꖷ ꕉꖸ ꕘꕞ ꗪ. ꖏꖷ ꕉꖸꔧ ꖏ ꖸ ꕚꕌꘂ ꗷꔤ ꕞ ꘃꖷ ꘉꔧ ꗠꖻ ꕞ ꖴꘋ ꔳꕩ ꕉꖸ ꗳ.", "metadata": { "filetype": "text/plain", "data_source": { @@ -10677,7 +10677,7 @@ { "type": "NarrativeText", "element_id": "8874ff5275f95f22ade2d05b19b84596", - "text": "Venda Vhathu vho\u1e71he vha bebwa vhe na mbofholowo nahone vha tshi lingana siani \u1e3da tshirunzi na pfanelo. Vhathu vho\u1e71he vho \u1e4bewa mihumbulo na mvalo ngauralo vha tea u konou farana sa vhathu vhathihi.", + "text": "Venda Vhathu vhoṱhe vha bebwa vhe na mbofholowo nahone vha tshi lingana siani ḽa tshirunzi na pfanelo. 
Vhathu vhoṱhe vho ṋewa mihumbulo na mvalo ngauralo vha tea u konou farana sa vhathu vhathihi.", "metadata": { "languages": [ "swa" @@ -10698,7 +10698,7 @@ { "type": "NarrativeText", "element_id": "1b4e3e7ad00ef96ec0938e98c22ac4d7", - "text": "Venda Vhathu vho\u1e71he vha bebwa vhe na mbofholowo nahone vha tshi lingana siani \u1e3da tshirunzi na pfanelo. Vhathu vho\u1e71he vho \u1e4bewa mihumbulo na mvalo ngauralo vha tea u konou farana sa vhathu vhathihi.", + "text": "Venda Vhathu vhoṱhe vha bebwa vhe na mbofholowo nahone vha tshi lingana siani ḽa tshirunzi na pfanelo. Vhathu vhoṱhe vho ṋewa mihumbulo na mvalo ngauralo vha tea u konou farana sa vhathu vhathihi.", "metadata": { "languages": [ "swa" @@ -10719,7 +10719,7 @@ { "type": "NarrativeText", "element_id": "57f8d88a5300439c2e78d95d9954dd1b", - "text": "Venetian Tuti i \u00e8sari umani i nase \u0142\u00ecbari e conpanji par dinjit\u00e0 e deriti. I ze dot\u00e0i de rajon e de cosiensa e i ga da conportarse intr\u00e0 de \u0142ori co sp\u00ecrito de frade\u0142i.", + "text": "Venetian Tuti i èsari umani i nase łìbari e conpanji par dinjità e deriti. I ze dotài de rajon e de cosiensa e i ga da conportarse intrà de łori co spìrito de fradełi.", "metadata": { "languages": [ "ita", @@ -10741,7 +10741,7 @@ { "type": "NarrativeText", "element_id": "bde94a10001841ef9fad0f19311e6fa9", - "text": "Veps Kaik mehed su\u0308nduba joudajin i kohtai\u017ein, u\u0308hteji\u010d\u010din i\u010deze arvokahudes i oiktusi\u0161. Heile om anttud mel\u2019 i huiktusentund i heile tari\u017e ko\u017euda toine toi\u017eenke kut vel\u2019l\u2019kundad.", + "text": "Veps Kaik mehed sünduba joudajin i kohtaižin, ühtejiččin ičeze arvokahudes i oiktusiš. 
Heile om anttud mel’ i huiktusentund i heile tariž kožuda toine toiženke kut vel’l’kundad.", "metadata": { "languages": [ "est", @@ -10763,7 +10763,7 @@ { "type": "NarrativeText", "element_id": "c6836fc94a9a2261da5605eae88ea21f", - "text": "Vietnamese T\u00e2\u0301t ca\u0309 mo\u0323i ng\u01b0\u01a1\u0300i sinh ra \u0111\u00ea\u0300u \u0111\u01b0\u01a1\u0323c t\u01b0\u0323 do va\u0300 bi\u0300nh \u0111\u0103\u0309ng v\u00ea\u0300 nh\u00e2n ph\u00e2\u0309m va\u0300 quy\u00ea\u0300n. Mo\u0323i con ng\u01b0\u01a1\u0300i \u0111\u00ea\u0300u \u0111\u01b0\u01a1\u0323c ta\u0323o hoa\u0301 ban cho ly\u0301 tri\u0301 va\u0300 l\u01b0\u01a1ng t\u00e2m va\u0300 c\u00e2\u0300n pha\u0309i \u0111\u00f4\u0301i x\u01b0\u0309 v\u01a1\u0301i nhau trong ti\u0300nh b\u0103\u0300ng h\u01b0\u0303u.", + "text": "Vietnamese Tất cả mọi người sinh ra đều được tự do và bình đẳng về nhân phẩm và quyền. Mọi con người đều được tạo hoá ban cho lý trí và lương tâm và cần phải đối xử với nhau trong tình bằng hữu.", "metadata": { "languages": [ "vie" @@ -10805,7 +10805,7 @@ { "type": "Title", "element_id": "294055dfb0c1131395070d727e81fde6", - "text": "\u7562\u54ff\u6bcf\ud840\ude9b\u751f\ud841\udea2\u8abf\u5f97\u81ea\u7531\u5427\u5e73\u7b49\ud85d\uddf1\u4eba\u54c1\u5427\u6b0a\u3002\u6bcf\ud846\udd75\ud840\ude9b\u8abf\u5f97\u9020\u5316\u9812\u6731\u7406\u667a\u5427\u826f\u5fc3\u5427\u52e4\u6c9b\u5c0d\u8655\ud84a\udf72\u81ae\ud856\ude9d\u60c5\u670b\u53cb\u3002", + "text": "畢哿每𠊛生𠚢調得自由吧平等𧗱人品吧權。每𡥵𠊛調得造化頒朱理智吧良心吧勤沛對處𢭲膮𥪝情朋友。", "metadata": { "languages": [ "kor", @@ -10827,7 +10827,7 @@ { "type": "NarrativeText", "element_id": "4ab64de143568003ad62ca2cf3c8cda3", - "text": "Waama Yiriba na b\u00e0 sikindo dare b\u00e0 m\u025b\u025bri, da seena yirimma mii b\u00e0 ta da i n\u025bki b\u00e0 t\u0254\u0254ba.", + "text": "Waama Yiriba na bà sikindo dare bà mɛɛri, da seena yirimma mii bà ta da i nɛki bà tɔɔba.", "metadata": { "languages": [ "som", @@ -10849,7 +10849,7 @@ { "type": "NarrativeText", "element_id": 
"88700f6c9f719c0f7ad537b0fe24d46d", - "text": "Walloon Tos l\u00e8s-omes vin\u00e8t-st-\u00e5 monde l\u00eebes, \u00e8t so-l'minme p\u00eed po \u00e7ou qu'\u00e8nn'\u00e8st d'leu dignit\u00e9 \u00e8t d'leus dre\u00fbts. I n'sont nin fo\u00fb r\u00eazon \u00e8t-z-ont-i le\u00fb consyince po z\u00e8ls, \u00e7ou qu'\u00e8lz\u00e8s de\u00fbt miner a s'kid\u00fbre onk' po l'\u00f4te tot come d\u00e8s fr\u00e9s.", + "text": "Walloon Tos lès-omes vinèt-st-å monde lîbes, èt so-l'minme pîd po çou qu'ènn'èst d'leu dignité èt d'leus dreûts. I n'sont nin foû rêzon èt-z-ont-i leû consyince po zèls, çou qu'èlzès deût miner a s'kidûre onk' po l'ôte tot come dès frés.", "metadata": { "languages": [ "fra" @@ -10913,7 +10913,7 @@ { "type": "NarrativeText", "element_id": "25c9bb862536e9e520792ea8724608de", - "text": "Wayuu Naa wayuukana jemeishi s\u00fcp\u00fcla taashi s\u00fcma wanawa s\u00fclu'u nakua'ipa, aka m\u00fcin yaa epijainjana s\u00fcnain anajiranawaa a'in nama nap\u00fcshi.", + "text": "Wayuu Naa wayuukana jemeishi süpüla taashi süma wanawa sülu'u nakua'ipa, aka müin yaa epijainjana sünain anajiranawaa a'in nama napüshi.", "metadata": { "languages": [ "swa" @@ -10934,7 +10934,7 @@ { "type": "NarrativeText", "element_id": "b4265fbb8924aeeb84569e7b2e4e3197", - "text": "Welsh Genir pawb yn rhydd ac yn gydradd \u00e2\u2019i gilydd mewn urddas a hawliau. Fe\u2019u cynysgaeddir \u00e2 rheswm a chydwybod, a dylai pawb ymddwyn y naill at y llall mewn ysbryd cymodlon.", + "text": "Welsh Genir pawb yn rhydd ac yn gydradd â’i gilydd mewn urddas a hawliau. Fe’u cynysgaeddir â rheswm a chydwybod, a dylai pawb ymddwyn y naill at y llall mewn ysbryd cymodlon.", "metadata": { "languages": [ "cym" @@ -10955,7 +10955,7 @@ { "type": "NarrativeText", "element_id": "8799ac3c8264dbd02b24e5484e28ea2d", - "text": "Wolof Doomi aadama y\u00e9pp danuy juddu, yam ci tawfeex ci sag ak sa\u00f1-sa\u00f1. 
Nekk na it ku xam d\u00ebgg te \u00e0nd na ak xelam, te war naa j\u00ebflante ak nawleen, te teg ko ci w\u00e0llu mbokk.", + "text": "Wolof Doomi aadama yépp danuy juddu, yam ci tawfeex ci sag ak sañ-sañ. Nekk na it ku xam dëgg te ànd na ak xelam, te war naa jëflante ak nawleen, te teg ko ci wàllu mbokk.", "metadata": { "languages": [ "ind", @@ -11022,7 +11022,7 @@ { "type": "NarrativeText", "element_id": "b1da3b28878be3ee9c9045f0c9223c84", - "text": "Yakut \u0414\u044c\u043e\u043d \u0431\u0430\u0440\u044b\u0442\u0430 \u0431\u044d\u0439\u044d \u0441\u0443\u043e\u043b\u0442\u0430\u0442\u044b\u0433\u0430\u0440 \u0443\u043e\u043d\u043d\u0430 \u0431\u044b\u0440\u0430\u0430\u0431\u044b\u0433\u0430\u0440 \u0442\u044d\u04a5 \u0431\u0443\u043e\u043b\u0430\u043d \u0442\u04e9\u0440\u04af\u04af\u043b\u043b\u044d\u0440. \u041a\u0438\u043d\u0438\u043b\u044d\u0440 \u0431\u0430\u0440\u044b \u04e9\u0440\u043a\u04e9\u043d \u04e9\u0439\u0434\u04e9\u04e9\u0445, \u0441\u0443\u043e\u0431\u0430\u0441\u0442\u0430\u0430\u0445 \u0431\u0443\u043e\u043b\u0430\u043d \u0442\u04e9\u0440\u04af\u04af\u043b\u043b\u044d\u0440, \u0443\u043e\u043d\u043d\u0430 \u0431\u044d\u0439\u044d \u0431\u044d\u0439\u044d\u043b\u044d\u0440\u0438\u0433\u044d\u0440 \u0442\u044b\u043b\u0433\u0430 \u043a\u0438\u0438\u0440\u0438\u043d\u0438\u0433\u044d\u0441 \u0431\u044b\u04bb\u044b\u044b\u043b\u0430\u0440\u0430 \u0434\u043e\u0495\u043e\u0440\u0434\u043e\u04bb\u0443\u0443 \u0442\u044b\u044b\u043d\u043d\u0430\u0430\u0445 \u0431\u0443\u043e\u043b\u0443\u043e\u0445\u0442\u0430\u0430\u0445.", + "text": "Yakut Дьон барыта бэйэ суолтатыгар уонна быраабыгар тэҥ буолан төрүүллэр. 
Кинилэр бары өркөн өйдөөх, суобастаах буолан төрүүллэр, уонна бэйэ бэйэлэригэр тылга кииринигэс быһыылара доҕордоһуу тыыннаах буолуохтаах.", "metadata": { "languages": [ "rus" @@ -11043,7 +11043,7 @@ { "type": "NarrativeText", "element_id": "53f4d4779755796c4b53e9945f211ced", - "text": "Yanesha\u02bc Allohueney \u00f1e\u00f1tey arrom\u00f1atey att\u0303o ye'\u00f1alletyesa arr patsro e'\u00f1e att\u0303ecma cohuen yesherb\u0303a'yen. \u00d1am\u0303a yechyen allpon derechos att\u0303och e'\u00f1ech cohueno'tsa'yeney arr patsro. \u00d1am\u0303a allohuen att\u0303ecma yechyen alloch yoct\u0303ape' chyen cohuen \u00f1am\u0303a ye\u00f1otyen yeyoc\u0308hro \u00f1e\u00f1t \u0303e'ne pocte' enten ache\u00f1enesha' \u00f1am\u0303a \u00f1e\u00f1t \u0303ama pocteye' enteneto. Ye\u00f1ote\u00f1 a\u00f1 poctetsa e'\u00f1e yemo'nashe\u00f1 yep\u0303annena ama't ora allohuen allpon ache\u00f1enesha' \u00f1e\u00f1t \u0303a\u00f1e patsro'tsa'yeney.", + "text": "Yaneshaʼ Allohueney ñeñtey arromñatey att̃o ye'ñalletyesa arr patsro e'ñe att̃ecma cohuen yesherb̃a'yen. Ñam̃a yechyen allpon derechos att̃och e'ñech cohueno'tsa'yeney arr patsro. Ñam̃a allohuen att̃ecma yechyen alloch yoct̃ape' chyen cohuen ñam̃a yeñotyen yeyoc̈hro ñeñt ̃e'ne pocte' enten acheñenesha' ñam̃a ñeñt ̃ama pocteye' enteneto. 
Yeñoteñ añ poctetsa e'ñe yemo'nasheñ yep̃annena ama't ora allohuen allpon acheñenesha' ñeñt ̃añe patsro'tsa'yeney.", "metadata": { "languages": [ "spa", @@ -11065,7 +11065,7 @@ { "type": "NarrativeText", "element_id": "1484d1c7c562268257922f9f0522d183", - "text": "Yanomam\u00f6 K\u00f5mi th\u00eb p\u00eb r\u00eb p\u00ebripraw\u00eb r\u00eb piy\u00ebk\u00ebi, he usukuw\u00eb th\u00eb p\u00eb keprou ai th\u00eb \u00e3 r\u00ebamaih\u00e3 no \u00e3 heparohow\u00eb, totihitaw\u00eb th\u00eb p\u00eb ri\u00e3 r\u1ebd thaiwehei hami, th\u00eb p\u00eb puhi tao k\u00e3i p\u00ebrihiw\u00ebha, th\u00eb p\u00eb puhi k\u00e3i katehew\u00ebha haw\u00eb kama th\u00eb p\u00eb mashi sh\u0129ro p\u00ebrihimop\u00eb.", + "text": "Yanomamö Kõmi thë pë rë përiprawë rë piyëkëi, he usukuwë thë pë keprou ai thë ã rëamaihã no ã heparohowë, totihitawë thë pë riã rẽ thaiwehei hami, thë pë puhi tao kãi përihiwëha, thë pë puhi kãi katehewëha hawë kama thë pë mashi shĩro përihimopë.", "metadata": { "languages": [ "sqi" @@ -11107,7 +11107,7 @@ { "type": "NarrativeText", "element_id": "6e2772e24613e482dbe3ec725643ea7a", - "text": "Yapese Gubine gidii mani gargeleg nga faileng nibapuf matt\u02bcawen nge rogon. Bay laniyan nipii e nam, ere ngauda ted matt\u02bcaawen e chaa niba chugur ngoded nimod walag dad.", + "text": "Yapese Gubine gidii mani gargeleg nga faileng nibapuf mattʼawen nge rogon. Bay laniyan nipii e nam, ere ngauda ted mattʼaawen e chaa niba chugur ngoded nimod walag dad.", "metadata": { "languages": [ "tgl", @@ -11129,7 +11129,7 @@ { "type": "NarrativeText", "element_id": "dd0ec8c9f26cfc60d56857c55e78705f", - "text": "Yiddish, Eastern \u05d9\u05e2\u05d3\u05e2\u05e8 \u05de\u05e2\u05e0\u05d8\u05e9 \u05f0\u05e2\u05e8\u05d8 \u05d2\u05e2\u05d1\u05f1\u05e8\u05df \u05e4\u05bf\u05e8\u05f2\u05b7 \u05d0\u05d5\u05df \u05d2\u05dc\u05f2\u05b7\u05da \u05d0\u05d9\u05df \u05db\u05bc\u05d1\u05bf\u05d5\u05d3 \u05d0\u05d5\u05df \u05e8\u05e2\u05db\u05d8. 
\u05d9\u05e2\u05d3\u05e2\u05e8 \u05f0\u05e2\u05e8\u05d8 \u05d1\u05d0\u05b7\u05e9\u05d0\u05b8\u05e0\u05e7\u05df \u05de\u05d9\u05d8 \u05e4\u05bf\u05d0\u05b7\u05e8\u05e9\u05d8\u05d0\u05b7\u05e0\u05d3 \u05d0\u05d5\u05df \u05d2\u05e2\u05f0\u05d9\u05e1\u05df; \u05d9\u05e2\u05d3\u05e2\u05e8 \u05d6\u05d0\u05b8\u05dc \u05d6\u05d9\u05da \u05e4\u05bf\u05d9\u05e8\u05df \u05de\u05d9\u05d8 \u05d0\u05b7 \u05e6\u05f0\u05f2\u05d8\u05df \u05d0\u05d9\u05df \u05d0\u05b7 \u05d2\u05e2\u05de\u05d9\u05d8 \u05e4\u05bf\u05d5\u05df \u05d1\u05e8\u05d5\u05d3\u05e2\u05e8\u05e9\u05d0\u05b7\u05e4\u05bf\u05d8.", + "text": "Yiddish, Eastern יעדער מענטש װערט געבױרן פֿרײַ און גלײַך אין כּבֿוד און רעכט. יעדער װערט באַשאָנקן מיט פֿאַרשטאַנד און געװיסן; יעדער זאָל זיך פֿירן מיט אַ צװײטן אין אַ געמיט פֿון ברודערשאַפֿט.", "metadata": { "languages": [ "heb" @@ -11150,7 +11150,7 @@ { "type": "NarrativeText", "element_id": "33533cecec6c5714680925cbc9d55bb1", - "text": "Yoruba Gbogbo \u00e8n\u00ecy\u00e0n ni a b\u00ed n\u00ed \u00f2m\u00ecnira; iy\u00ec \u00e0ti \u1eb9\u0300t\u1ecd\u0301 k\u1ecd\u0300\u1ecd\u0300kan s\u00ec d\u1ecd\u0301gba. W\u1ecd\u0301n n\u00ed \u1eb9\u0300b\u00f9n ti l\u00e0\u00e1k\u00e0y\u00e8 \u00e0ti ti \u1eb9\u0300r\u00ed\u2010\u1ecdk\u00e0n, \u00f3 s\u00ec y\u1eb9 k\u00ed w\u1ecdn \u00f3 m\u00e1a h\u00f9w\u00e0 s\u00ed ara w\u1ecdn g\u1eb9\u0301g\u1eb9\u0301 b\u00ed \u1ecdm\u1ecd \u00ecy\u00e1.", + "text": "Yoruba Gbogbo ènìyàn ni a bí ní òmìnira; iyì àti ẹ̀tọ́ kọ̀ọ̀kan sì dọ́gba. 
Wọ́n ní ẹ̀bùn ti làákàyè àti ti ẹ̀rí‐ọkàn, ó sì yẹ kí wọn ó máa hùwà sí ara wọn gẹ́gẹ́ bí ọmọ ìyá.", "metadata": { "languages": [ "vie" @@ -11171,7 +11171,7 @@ { "type": "NarrativeText", "element_id": "263ae4a61b51cca14085f92de5a8cfa5", - "text": "Yukaghir, Northern \u041a\u04e9\u0434\u044d\u04a5 \u0442\u044d\u043d - \u043d\u044c\u0438\u0434\u0438\u0442\u044d \u0431\u0430\u043d\u0434\u044c\u044d \u043f\u0430\u0440\u0430\u051d\u0430\u0430\u043d\u044c\u044d\u0440\u044d\u04a5 \u0442\u0443\u0434\u044d \u0447\u0443\u04a5\u0434\u044d\u043d \u043d\u044c\u0438\u043b\u0434\u044c\u0438\u043b\u044d\u043a \u044d\u043d\u043d\u0443\u043b\u04a5\u0438\u043d\u044c-\u043c\u044d\u0434\u044c\u0443\u043e\u043b\u043d\u0443\u043d\u0438. \u041a\u04e9\u0434\u044d\u04a5 \u044d\u043d\u043c\u0443\u043d \u0447\u0443\u043d\u0434\u044d \u043c\u044d \u043b\u044c\u044d\u0439, \u0442\u0430\u0430\u0442\u043b\u044c\u044d\u0440 \u043b\u0443\u043a\u0443\u043d\u0434\u044c\u0438\u0438 \u043d\u044c\u0438\u043d\u044d\u043c\u0434\u044c\u0438\u0439\u0438\u043b\u043f\u044d \u0434\u0438\u0442\u044d \u044d\u043d\u043d\u0443\u0439\u0443\u043e\u043b-\u043c\u043e\u0440\u0430\u051d\u043d\u044c\u044d\u04a5\u0438.", + "text": "Yukaghir, Northern Көдэҥ тэн - ньидитэ бандьэ параԝааньэрэҥ тудэ чуҥдэн ньилдьилэк эннулҥинь-мэдьуолнуни. 
Көдэҥ энмун чундэ мэ льэй, таатльэр лукундьии ньинэмдьийилпэ дитэ эннуйуол-мораԝньэҥи.", "metadata": { "languages": [ "rus" @@ -11192,7 +11192,7 @@ { "type": "UncategorizedText", "element_id": "5d93ef013b9a5b75709657ba49153ed9", - "text": "Z\u00e1paro Kawiriaja kayapuina ichaukui ta nuka pucha panicha kupanimajicha cha nuka nishima ikicha kiniana panicha tamanuka kanata ikimajicha.", + "text": "Záparo Kawiriaja kayapuina ichaukui ta nuka pucha panicha kupanimajicha cha nuka nishima ikicha kiniana panicha tamanuka kanata ikimajicha.", "metadata": { "languages": [ "swa" @@ -11213,7 +11213,7 @@ { "type": "NarrativeText", "element_id": "7d1772a7cde57cf4033fb6ecd38d611b", - "text": "Zapotec, G\u00fcil\u00e1 Ra'ta ra bu:unny ra:aaly liebr c\u00ebhnn te'bloh deree'ch c\u00ebhnn dignidaa. Ra:alyne:erih gahll ri:e:eny c\u00ebhnn saalyb, chiru' na:a pahr ga:annza'crih loh sa'rih.", + "text": "Zapotec, Güilá Ra'ta ra bu:unny ra:aaly liebr cëhnn te'bloh deree'ch cëhnn dignidaa. Ra:alyne:erih gahll ri:e:eny cëhnn saalyb, chiru' na:a pahr ga:annza'crih loh sa'rih.", "metadata": { "languages": [ "cym", @@ -11237,7 +11237,7 @@ { "type": "NarrativeText", "element_id": "efe41cb241efcd0774cf2f9bd328b778", - "text": "Zapotec, Miahuatl\u00e1n Diti mien ndied xa yent kuan nkie xa nak rieti xa diba xa rola.", + "text": "Zapotec, Miahuatlán Diti mien ndied xa yent kuan nkie xa nak rieti xa diba xa rola.", "metadata": { "languages": [ "afr", @@ -11262,7 +11262,7 @@ { "type": "NarrativeText", "element_id": "b1bf6eb1c62dbb55df63d0dcd8595d2a", - "text": "Zarma Fayanka kulu no si adamayzey nda care game ra i burcintara nda i alhakey cediraw kayandiya\u014b fondo ra da i na i hay. I gonda lakkal, nda laasaabu, ka\u014b ga na\u014b i ma baafunay \u0272ayzetaray haali ra.", + "text": "Zarma Fayanka kulu no si adamayzey nda care game ra i burcintara nda i alhakey cediraw kayandiyaŋ fondo ra da i na i hay. 
I gonda lakkal, nda laasaabu, kaŋ ga naŋ i ma baafunay ɲayzetaray haali ra.", "metadata": { "languages": [ "som" diff --git a/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.json b/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.json index 55c78ea5c0..b70eb15b34 100644 --- a/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.json @@ -222,7 +222,7 @@ { "type": "ListItem", "element_id": "1a174e104169cb41cf69393a9cdc0872", - "text": "4. Team science and scientific communication: \u201csoft\u201d skills, like the ability to work well on teams and communicate effectively in both verbal and written venues, may be as important as the more technical skills typically associated with data science.", + "text": "4. Team science and scientific communication: “soft” skills, like the ability to work well on teams and communicate effectively in both verbal and written venues, may be as important as the more technical skills typically associated with data science.", "metadata": { "languages": [ "eng" @@ -310,7 +310,7 @@ { "type": "NarrativeText", "element_id": "690b79e1d449426afb07ed40866a6bb6", - "text": "Training a biomedical data science (BDS) workforce is a central theme in NLM\u2019s Strategic Plan for the coming decade. That commitment is echoed in the NIH-wide Big Data to Knowledge (BD2K) initiative, which invested $61 million between FY2014 and FY2017 in training programs for the development and use of biomedical big data science methods and tools. In line with", + "text": "Training a biomedical data science (BDS) workforce is a central theme in NLM’s Strategic Plan for the coming decade. 
That commitment is echoed in the NIH-wide Big Data to Knowledge (BD2K) initiative, which invested $61 million between FY2014 and FY2017 in training programs for the development and use of biomedical big data science methods and tools. In line with", "metadata": { "languages": [ "eng" diff --git a/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/biomed-api/65/11/main.PMC6312790.pdf.json b/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/biomed-api/65/11/main.PMC6312790.pdf.json deleted file mode 100644 index fdb1b1ff86..0000000000 --- a/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/biomed-api/65/11/main.PMC6312790.pdf.json +++ /dev/null @@ -1,4195 +0,0 @@ -[ - { - "type": "Header", - "element_id": "782cf07be8b3ab8f05188e479edb7f61", - "text": "Data in Brief 22 (2019) 451\u2013457", - "metadata": { - "languages": [ - "eng" - ], - "links": [ - { - "text": "Data in Brief 22 ( 2019 ) 451 \u2013 457", - "url": "https://doi.org/10.1016/j.dib.2018.11.134", - "start_index": 0 - } - ], - "page_number": 1, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "c3e4ba0411db419c34f27ae55762b1c1", - "text": "Contents lists available at ScienceDirect", - "metadata": { - "languages": [ - "eng" - ], - "links": [ - { - "text": "ScienceDirect", - "url": "www.sciencedirect.com/science/journal/23523409", - "start_index": 28 - } - ], - "page_number": 1, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": 
"a983d2e46059a8605ebb1077994e6fa3", - "text": "Data in Brief", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 1, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "354cd2b49c1a201a5e91177a17f9b2a3", - "text": "journal homepage: www.elsevier.com/locate/dib", - "metadata": { - "languages": [ - "eng" - ], - "links": [ - { - "text": "www . elsevier . com / locate / dib", - "url": "www.elsevier.com/locate/dib", - "start_index": 18 - } - ], - "page_number": 1, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "c1c1eeb08eba1d16beccf2034fc87bc8", - "text": "Data Article", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 1, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "f1b37e8056f39eb82901f43f4fe0a239", - "text": "Data on environmental sustainable corrosion inhibitor for stainless steel in aggressive environment", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 1, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - 
"element_id": "1a4fcf35fcd5d2be9f843f0fb93f3d3e", - "text": "Omotayo Sanni n, Abimbola Patricia I. Popoola", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 1, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "418af174cd1457a5db9b88c3c4a33ce3", - "text": "Department of Chemical, Metallurgical and Materials Engineering, Tshwane University of Technology, Pretoria, South Africa", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 1, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "698747e1178c3e0ec15b2eb293e58565", - "text": "a r t i c l e i n f o", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 1, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "19e64efbeabe463d8d8a6f577d4c6be7", - "text": "a b s t r a c t", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 1, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "8e23ddc47eb2833b067fe61c9c413955", - 
"text": "Article history: Received 31 August 2018 Received in revised form 17 November 2018 Accepted 27 November 2018 Available online 30 November 2018", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 1, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "2b0eb4fb8b32b5944bcf711f448ef19a", - "text": "Keywords: Corrosion Stainless steel Inhibitor Sulphuric acid", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 1, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "8930d3f5d6929e72cbe35523538fc807", - "text": "This data article contains data related to the research article entitled \u201cenhanced corrosion resistance of stainless steel Type 316 in sulphuric acid solution using eco-friendly waste product\u201d (Sanni et al., 2018). In this data article, a comprehensive effect of waste product and optimized process parameter of the inhibitor in 0.5 M H2SO4 solution was presented using weight loss and potentiody- the inhibitor namic polarization techniques. The presence of (egg shell powder) in\ufb02uenced corrosion resistance of stainless steel. 
Inhibition ef\ufb01ciency value of 94.74% was recorded as a result of inhibition of the steel by the ionized molecules of the inhibiting compound of the egg shell powder in\ufb02uencing the redox mechan- ism reactions responsible for corrosion and surface deterioration.", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 1, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "aa8a123d8b7bf47bd15c389a6685d405", - "text": "& 2018 Published by Elsevier Inc. This is an open access article under the CC BY-NC-ND license (http://creativecommons.org/licenses/by-nc-nd/4.0/).", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 1, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "0757794849e2cca941b30b4e1e82cd4b", - "text": "Speci\ufb01cation table", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 1, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "bab7909d0362404432e0cc4f90049b3a", - "text": "Subject area More speci\ufb01c subject area Surface science and engineering Type of data", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 1, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": 
"/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "227863137634b2d549494fac759af715", - "text": "Materials engineering", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 1, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "3f88b0d8c42101ff25aeb213051cf81f", - "text": "Table and \ufb01gure", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 1, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "b6664d832b0c853cff911e63ce738371", - "text": "n Corresponding author. tayo.sanni@yahoo.com; SanniO@tut.ac.za", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 1, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "9b655d4b82dc2b1d75b9c21c7b0fc7f8", - "text": "E-mail address: tayo.sanni@yahoo.com (O. Sanni).", - "metadata": { - "languages": [ - "eng" - ], - "links": [ - { - "text": "tayo . sanni @ yahoo . 
com", - "url": "mailto:tayo.sanni@yahoo.com", - "start_index": 16 - } - ], - "page_number": 1, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "96e9fe2b2775d750918a6f92f0d3ad95", - "text": "https://doi.org/10.1016/j.dib.2018.11.134 2352-3409/& 2018 Published by Elsevier Inc. This is an open access article under the CC BY-NC-ND license (http://creativecommons.org/licenses/by-nc-nd/4.0/).", - "metadata": { - "languages": [ - "eng" - ], - "links": [ - { - "text": "https", - "url": "https://doi.org/10.1016/j.dib.2018.11.134", - "start_index": 0 - }, - { - "text": "https", - "url": "https://doi.org/10.1016/j.dib.2018.11.134", - "start_index": 0 - }, - { - "text": "https :// doi . org / 10 . 1016 / j . dib . 2018 . 11 . 134", - "url": "https://doi.org/10.1016/j.dib.2018.11.134", - "start_index": 0 - } - ], - "page_number": 1, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "757b62f5ce8ceee7150b7ce16ea16c93", - "text": "452", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Header", - "element_id": "fb14c87d94f1676010e46b776d688612", - "text": "O. Sanni, A.P.I. 
Popoola / Data in Brief 22 (2019) 451\u2013457", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "72155e648a45896b081904929fc91cc6", - "text": "How data were acquired", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "a577cc1dfaa481812a9cff86c06d9835", - "text": "Data format Experimental factors", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "9b9d298aef0e8b4a83bca09152a07128", - "text": "Experimental features Data source location", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "6f850529ced475435229c193a8ee7938", - "text": "Accessibility Related research article", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 2, - "filetype": "application/pdf", - 
"data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "c1c91f3ea75c102b6ed42b94530cbafe", - "text": "The cleaned and weighed specimen was suspended in beakers con- taining 0.5 M H2SO4 solution of different concentrations of egg shell powder. The pre-weighed stainless steel samples were retrieved from the test solutions after every 24 h, cleaned appropriately, dried and reweighed. Raw, analyzed The difference between the weight at a given time and the initial weight of the specimen was taken as the weight loss, which was used to calculate the corrosion rate and inhibition ef\ufb01ciency. Inhibitor concentration, exposure time Department of Chemical, Metallurgical and Materials Engineering, Tshwane University of Technology, Pretoria, South Africa Data are available within this article O. Sanni, A. P. I. Popoola, and O. S. I. 
Fayomi, Enhanced corrosion resistance of stainless steel type 316 in sulphuric acid solution using eco-friendly waste product, Results in Physics, 9 (2018) 225\u2013230.", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "a5dd74871d789945bd8a9c352d4817fb", - "text": "Value of the data", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "9bed69cd8287b2725bd845ca61ebb3cd", - "text": "(cid:1) Data presented here provide optimum conditions of waste material as inhibitor for stainless steel Type 316 in 0.5 M H2SO4 medium. 
The given data describe the inhibitive performance of eco-friendly egg shell powder on austenitic stainless steel Type 316 corrosion in sulphuric acid environment.", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "2ac3a042a8c89fd81718d1fda7ae576b", - "text": "(cid:1) The data obtained for the inhibition of waste product (egg shell powder) on stainless steel Type 316 can be used as basis in determining the inhibitive performance of the same inhibitor in other environments.", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "4962aa80bf0712155f4b781df06b4f1a", - "text": "(cid:1) The data can be used to examine the relationship between the process variable as it affect the", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "3b419c2d586d0eaf047f939c9e41b30f", - "text": "nature of inhibition of metals.", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": 
"/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "ListItem", - "element_id": "f742be9cbb2d0697a88a9f749bf3185c", - "text": "1. Data", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "28d5b195997810a34c2aa96c9f357de2", - "text": "The results of the experiment are presented in this session. The results obtained from weight loss method for stainless steel Type 316 immersed in 0.5 M H2SO4 solution in the absence and presence of different concentrations of egg shell powder (ES) are presented in Figs.1\u20133 respectively. 
It can be seen clearly from these Figures that the ef\ufb01ciency of egg shell powder increase with the inhibitor con- centration, The increase in its ef\ufb01ciency could be as a result of increase in the constituent molecule", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "f3a850e6bd8c0557408ad59167f5461e", - "text": ") g m", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "3cb4a395dab98ecdc71ad325411cf150", - "text": "(", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "2b2ff92863f302ae630dc410b945333a", - "text": "s s o", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "0da3f5fd0fd07fc182d371760d9da3c0", - "text": "l", - "metadata": { - "languages": [ - "eng" - 
], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "f929b69f05a08ec2b940c9b531740326", - "text": "t h g e W", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "f0fbafddf553bdea61ac009ad080f1bc", - "text": "i", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "2b3d55b9ce69bcd15d67071cf0d11814", - "text": "30", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "9673d82062115826d94732418d566ba2", - "text": "20", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { 
- "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "b0304d4851460afe7c95d41feb260093", - "text": "10g 8g 6g 4g 2g Control", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "7f646e71d7bc0398e9917eec2c29b9ef", - "text": "10", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "12a72cb263173964cf41736e5d3707b2", - "text": "48", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "673fe20c15c1210d134b56828c5a8216", - "text": "96", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "c552ee9963f985fd6b3498e2cf2c6230", - "text": "144", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 2, - "filetype": 
"application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "16e471ece5a33bfb80b79b89aed6c731", - "text": "192", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "829e97853a2843ff6a8f1cfd3a6c74db", - "text": "Exposure Time (Hours)", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "b6f97c1cdf0e9f1abebac577d4cf4b2a", - "text": "Fig. 1. 
Weight loss versus exposure time for stainless steel presence of ES.", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "09a5818257d4c970dc57191f38e1c1b0", - "text": "immersed in 0.5 M H2SO4 solution in the absence and", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Header", - "element_id": "828e27fb21b2ca5e25ebdc5f0693ed7d", - "text": "O. Sanni, A.P.I. Popoola / Data in Brief 22 (2019) 451\u2013457", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "81cbf4e59dfe4444a94794a547e9063c", - "text": "2.7", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "f1b0da24500b1f98c9debd55a2482b7f", - "text": ") r a e y / m m", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 
3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "9efd31c777cb3a30d24545982e71644e", - "text": "( e t a r n o s o r r o C", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "a535b571914bff036ee8d7b941a9e14c", - "text": "i", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "6445348d57f8715d980bbf266f6cc4b3", - "text": "1.8", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "dff5188d0e9db124ca45b71e4123404f", - "text": "0.9", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": 
[ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "2e8665917db0a5ca56fee4e99f113c05", - "text": "10g 8g 6g 4g 2g Control", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "9b38508e1e3ddd8056482945216e1a28", - "text": "24", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "4638ab00ad25c2044ed18ba57b766d7d", - "text": "48", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "252b95fc79d992358f5e7e4423febe14", - "text": "72", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "963002fc37d4568e01e1361b0f053b53", - "text": "96", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": 
"application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "292f8084988c4f4000fcd5bd2205c36a", - "text": "120", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "5c317addf6947e11fba4c4f584f095c1", - "text": "144", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "95649afacb76442d050ed4534b80c4cc", - "text": "168", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "dad2b03f8f9d732efa19ab6a421e971d", - "text": "192", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 
- } - ] - } - } - }, - { - "type": "Title", - "element_id": "8f500e748d82811ccbb3b715e1932be6", - "text": "Exposure time", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "03f95f2413bbe205cdc6975b1b98ecbe", - "text": "Fig. 2. Corrosion rate versus exposure time for stainless steel immersed in 0.5 M H2SO4 solution in the absence and presence of ES.", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "3c32d78e905ba61d1ae55e0b2ebd5946", - "text": "100", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "78e1f4ff627e16f8159327279bdfcce0", - "text": "90", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "748c1e92cccf809f3776382792e93895", - 
"text": ")", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "feccbab23ec407ef6cc22348a78244d3", - "text": "%", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "03ac492dccd89cf13a9d40ada0e543e1", - "text": "(", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "2a02254b1d03abddd3537dc16c56a6fb", - "text": "y c n e c i f f", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "67504491ab6c6c3603a75d246c50f54d", - "text": "i", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": 
"/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "6a2c597e6f8cfa0954a022873f9dcf6f", - "text": "E n o i t i b h n I", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "f84aae3bf521f4166f63e87b5ef4f035", - "text": "i", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "b76e96beb931beaef6e3660f5d415c3d", - "text": "80", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "0309a67bcfd5df32328af8c537c708e6", - "text": "70", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": 
"33add4c83afdffa0745406aea3c75b49", - "text": "60", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "e180205da17abbe716978d5c4aa4dd03", - "text": "50", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "18f47de0e9dbec383a50a39027960bc6", - "text": "40", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "89ac5d03f7c6d4fa92bda587be577ab8", - "text": "30", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "93a1080514211ba59a1850d5600c261c", - "text": "2g 4g 6g 8g 10g", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": 
"/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "a66d7b20adfb12a1efd70da1d5b65375", - "text": "20", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "82bf75b4e447974f22e48c9a450c45d5", - "text": "10", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "d460a5ac4c345529812f84dabf681d9f", - "text": "0", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "a6282e95f41f8cb5061e0618a02dc09a", - "text": "20", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": 
"44e027245f6667d8282ec4728ad9c2dd", - "text": "40", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "935862a8bb1abed65afc07fc8d1da166", - "text": "60", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "fada482b9f03a3eda9be2ad92169bc9a", - "text": "80", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "3179f53a093e5bb8064b777a8125c88e", - "text": "100", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "2053a3a5b1e12481504583f7f72979ff", - "text": "120", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": 
"/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "b81dbb6336d2b992478316f8514e94b6", - "text": "140", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "d4eb5e157598e6fa21a6b5b4254e9b5e", - "text": "160", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "f082a93dce4872ddd5ecc97c3a9341fb", - "text": "180", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "4c19db10f909537bf29da9829ab6f81b", - "text": "Exposure Time (Hours)", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - 
"element_id": "c566a56fa9e9ad6b97408310e357b079", - "text": "Fig. 3. Inhibition ef\ufb01ciency versus exposure time for stainless steel immersed in 0.5 M H2SO4 solution in the presence of ES.", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "21233d8e249dd8180c7f2c99a468f337", - "text": "number of inhibitor adsorbed on the surface of stainless steel at higher concentration, in order for the active sites of the stainless steel to be protected with the inhibitor molecules. Cathodic and anodic polarized potential are measured in the presence and absence of ES. Fig. 4 shows the cathodic and anodic polarization curves for stainless steel in 0.5 M H2SO4 solution at different ES concentrations. The electrochemical variables such as polarization resistance (PR), corrosion potential (Ecorr), cor- rosion current (icorr), anodic Tafel constant (ba), cathodic Tafel constant (bc) and corrosion rate (mm/ year) values are presented in Table 1. From the polarization curves and electrochemical parameter, icorr value decreased with the addition of inhibitor in 0.5 M H2SO4. Conversely, the icorr further decrease with an increase in inhibitor concentration indicating that the inhibition effects increase with an increase in the egg shell concentration. The process of egg shell inhibition could be attributed to the formation of egg shell powder adsorbed on stainless steel surface protecting corrosion of stainless steel in H2SO4 medium. The likely mechanism is the egg shell adsorption on stainless steel surface through the heteroatoms electron pair and the conjugated systems in egg shell molecular structure as shown in Fig. 1. 
When the concentration of inhibitor was increased from 2 to 10 g, the corrosion rate values drastically decreased this result show that waste egg shell powder is an effective corrosion inhibitor for stainless steel in H2SO4 solution. The shift in corrosion potential of stainless steel from Tafel curves and electrochemical data indicate that the inhibitor is a mixed-type corrosion inhibitor.", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "443e25a2b54b8b2a43f8029e07f784b3", - "text": "453", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "33b112b0d8640ab4f13b22a2ee714086", - "text": "454", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Header", - "element_id": "e87ca7b3cd075aaa0de8030768aca87c", - "text": "O. Sanni, A.P.I. 
Popoola / Data in Brief 22 (2019) 451\u2013457", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "fd8a0feb5e755ece5d9abceb844649ff", - "text": "Fig. 4. Anodic and cathodic polarization curve of stainless steel in 0.5 M H2SO4 solution in the presence and absence of ES.", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "598ed0a58406fc921332297f345b177a", - "text": "Table 1 Potentiodynamic polarization data for stainless steel in the absence and presence of ES in 0.5 M H2SO4 solution.", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "9620a738189422654c5456fa16e507e7", - "text": "Inhibitor concentration (g)", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": 
"3acf3c88a28cad76984ac041a8f5984c", - "text": "bc (V/dec)", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "da72962f658cee29281fa0e11a548813", - "text": "ba (V/dec)", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "63a8b6b360c7a61ef88ad6c0b3d6581d", - "text": "Ecorr (V)", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "616ac8133f9b985812240add98badf5a", - "text": "icorr (A/cm2)", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "5ef6c0b5c5c72f20a694c6bce97ed131", - "text": "Polarization resistance (\u03a9)", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": 
"/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "6eff2d13b846a74ce08e348c7151dd1c", - "text": "Corrosion rate (mm/year)", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "4a00cd3d6d5f9b71b105586a17125069", - "text": "0 2 4 6 8 10", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "812204070320132126dcfec00abb07f7", - "text": "0.0335 1.9460 0.0163 0.3233 0.1240 0.0382", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "08c96eb52fe4877d6a26d862f8919d35", - "text": "0.0409 0.0596 0.2369 0.0540 0.0556 0.0086", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - 
"permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "a0aa9bf2a48ed1dff882a16cb320c616", - "text": "(cid:3)0.9393 (cid:3)0.8276 (cid:3)0.8825 (cid:3)0.8027 (cid:3)0.5896 (cid:3)0.5356", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "a725c31d8b684d978174d4dc11d29106", - "text": "0.0003 0.0002 0.0001 5.39E-05 5.46E-05 1.24E-05", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "f66516a9a89cb0ab07ccf9e15086f394", - "text": "24.0910 121.440 42.121 373.180 305.650 246.080", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "a6663f53eba15d4c5596b1f8ec4208fd", - "text": "2.8163 1.5054 0.9476 0.4318 0.3772 0.0919", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - 
"mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "f5db77e611b74b7298f1b48a82ffc7be", - "text": "The plot of inhibitor concentration over degree of surface coverage versus inhibitor concentration gives a straight line as shown in Fig. 5. The strong correlation reveals that egg shell adsorption on stainless surface in 0.5 M H2SO4 follow Langmuir adsorption isotherm. Figs. 6\u20138 show the SEM/EDX surface morphology analysis of stainless steel. Figs. 7 and 8 are the SEM/EDX images of the stainless steel specimens without and with inhibitor after weight loss experiment in sulphuric acid medium. The stainless steel surface corrosion product layer in the absence of inhibitor was porous and as a result gives no corrosion protection. With the presence of ES, corrosion damage was minimized, with an evidence of ES present on the metal surface as shown in Fig. 8.", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "e4e5f97ab5b56767ed489d7cd3ee04f7", - "text": "12", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "afc0a737ef1e5ffa9d6b72bb32fef683", - "text": "C/0", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": 
"/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "d9a38658d857c1141618ad9115dc48b4", - "text": "10", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "2d046240fd1a0ff3420926f0a54e0aaa", - "text": "8", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "4c136188f1e2e974ec1003968916824a", - "text": "0 / C", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "594366da1ff6e7a343ec1666c5852389", - "text": "6", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": 
"d84c13ba166bd29d042db10acba6d243", - "text": "4", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "d4210b5ce6f99e242d8c1aa586691286", - "text": "2", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "7afb08e1cc308afebdc038fc7e4595ed", - "text": "2", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "696d24804069bc593dc624bf7ba904e2", - "text": "4", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "ef054383c29789c2743d93a6189f7f47", - "text": "6", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": 
"/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "ae2f6fc244a6aa053403e38912fdc56a", - "text": "8", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "33c153482d9c925a35781bd5c9697648", - "text": "10", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "8f325f6eb1678922e83e32746b981b80", - "text": "Concentration (g)", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "9d46c2166a49c9e3a75ed98cb20ce13f", - "text": "Fig. 5. 
Langmuir adsorption isotherm of ES.", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Header", - "element_id": "9d639b03d26ec1872a4e91ac99031fdf", - "text": "O. Sanni, A.P.I. Popoola / Data in Brief 22 (2019) 451\u2013457", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 5, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "cfea47dcbf32f3d8597e777afa74d20e", - "text": "Fig. 6. SEM/EDX image of as-received stainless steel.", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 5, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "a1e6c9bab7935444a7491a47091be10c", - "text": "Fig. 7. SEM/EDX image of stainless steel immersed in 0.5 M H2SO4 solution without inhibitor.", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 5, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "49e093091da774c567151e5147c70027", - "text": "Fig. 8. 
SEM/EDX image of stainless steel immersed in 0.5 M H2SO4 solution with the presence of inhibitor.", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 5, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "8ac2e9f97dc89f9d9bac5baec281f7f2", - "text": "455", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 5, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "e303e27893be099ef5fd03235efee7fe", - "text": "456", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 6, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Header", - "element_id": "91c8bf5283b45a71164a103f496f93c1", - "text": "O. Sanni, A.P.I. Popoola / Data in Brief 22 (2019) 451\u2013457", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 6, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "ListItem", - "element_id": "bffefa92b06bc6009f81965d3dadc0ce", - "text": "2. 
Experimental design, materials and methods", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 6, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "484707d26d81d85df99f322c1bbb8ca3", - "text": "2.1. Material", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 6, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "79d10fe9600d8d3428b5df86faa7c099", - "text": "Austenitic stainless steel Type 316 was used in this study with chemical composition reported in [1,2]. The chemicals used were of annular grade. The inhibitor concentrations are in the range of 2, 4, 6, 8 and 10 g [3\u20135]. The structural formula of egg shell powder is shown in Fig. 9.", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 6, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "b6bd160c80816ff7b2d8a36ccfc67568", - "text": "Fig. 9. 
Chemical structure of egg shell powder.", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 6, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "aeafe864b565b167f053a348390b3eff", - "text": "2.2. Weight loss method", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 6, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "0e51f945cacb5ec184a3613487b6fefb", - "text": "This physical measurement was carried out in order to provide direct result on how the corrosive environment affects the test sample. The cleaned and weighed specimen was suspended in beakers with the aid of glass hooks and rods with the test solution of ES at different concentration (2, 4, 6, 8 and 10 g). The pre-weighed specimen was retrieved from the test solution after every 24 h, cleaned, dried and reweighed. 
The difference between the weight at a given time and the initial weight of the specimen was taken as the weight loss which was used to calculate corrosion rate and inhibition ef\ufb01ciency.", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 6, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "fed48b9de93d4324223aa5fbdfe2f359", - "text": "The corrosion rate (CR) was calculated using Eq. (1) [1\u20135]", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 6, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "2c4a913c3a4b8bccd9c7003f25ae25af", - "text": "(cid:1) \u00de \u00bc 87:6W DAT", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 6, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "902d0aabf523c467c200f5203957e606", - "text": "(cid:3)", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 6, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": 
"44d54b6fb44ac7afc9f40a0e7a5fcde3", - "text": "Corrosion rate CR\u00f0", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 6, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "7459b20ea68d65b7a967500f22223507", - "text": "where: W is weight loss in mg, A is specimen surface area, T is immersion period in hours and D is the specimen density. From the corrosion rate, the surface coverage (\u03b8) and inhibition ef\ufb01ciencies (IE %) were determined using Eqs. (2) and (3) respectively", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 6, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "543caecd15c161082076a174ea946782", - "text": "\u03b8 \u00bc CRo(cid:3)CR", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 6, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "b2cc1eda5ffbccf6416235c44181538c", - "text": "CRo", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 6, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - 
"type": "Title", - "element_id": "59a609931ac8f9c55855113bfae6655e", - "text": "IE \u00f0%\u00de \u00bc CRo(cid:3)CR", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 6, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "3bf244c1b2eb32875b292a28c130aba4", - "text": "CRo", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 6, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "2c6d5581a35c83236153f78c5b53cb60", - "text": "x", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 6, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "ca4aeca8c2a7e6b9df923db4a5902289", - "text": "100 1", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 6, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "a47048cff18528a9a4838728a55e526a", - "text": "where: CRo and CR are the corrosion rate in absence and presence of inhibitor respectively.", - "metadata": { - "languages": [ - "eng" - ], - 
"page_number": 6, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "6aabbfd8e92223470a6c9184a84857c0", - "text": "2.3. Potentiodynamic polarization method", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 6, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "c653c9cca5ebdd3089b705f279316500", - "text": "The potentiodynamic polarization method was performed on the prepared test samples immersed in 0.5 M H2SO4 solution in the presence and absence of different ES concentrations. A three electrode system was used; stainless steel Type 316 plate as working electrode with an exposed area of 1.0 cm2, platinum rod as counter electrode and silver chloride electrode as reference electrode. The electrode was polished, degreased in acetone and thoroughly rinsed with distilled water before the experiment. Current density against applied potential was plotted. 
The slope of the linear part in anodic and cathodic plots gives anodic and cathodic constants according to the Stern\u2013Geary equation, and the", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 6, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "b1cdefa47658616bf79766f8fc353f7c", - "text": "\u00f01\u00de", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 6, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "a1a035eeaa7c25a2b543757f4cc7d0fb", - "text": "\u00f02\u00de", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 6, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "74d17735c911d69b6d10e05d0c9d79d6", - "text": "\u00f03\u00de", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 6, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Header", - "element_id": "e40c3ee561b10ca5b7a76900c8d5b263", - "text": "O. Sanni, A.P.I. 
Popoola / Data in Brief 22 (2019) 451\u2013457", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 7, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "ac11629522e563b6a0a8f261ab4b94e0", - "text": "steps of the linear polarization plot are substituted to get corrosion current. Nova software was used with linear polarization resistance (LPR) and the current was set to 10 mA (maximum) and 10 nA (minimum). LSV staircase parameter start potential (cid:3)1.5 v, step potential 0.001 m/s and stop potential of \u00fe1.5 v set was used in this study.", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 7, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "2461424bae61c8cfad1cd33a949843f0", - "text": "Acknowledgements", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 7, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "2d8a74bbba4ad3bb13afc8a98daec91d", - "text": "This work was supported by the National Research Foundation of South Africa and the Tshwane", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 7, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": 
"/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "154e2a7bdebd1347eccb08f349284130", - "text": "University of Technology Pretoria South Africa.", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 7, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "41a46b0a6852a31b1e51cf65a4ecf87d", - "text": "Transparency document. Supporting information", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 7, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "c5635281e7e879dd338b99ae84f94056", - "text": "Transparency document associated with this article can be found in the online version at https://doi.", - "metadata": { - "languages": [ - "eng" - ], - "links": [ - { - "text": "https :// doi .", - "url": "https://doi.org/10.1016/j.dib.2018.11.134", - "start_index": 89 - } - ], - "page_number": 7, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "ee62928948d5d7b5e13edf65d917dc63", - "text": "org/10.1016/j.dib.2018.11.134.", - "metadata": { - "languages": [ - "eng" - ], - "links": [ - { - 
"text": "org / 10 . 1016 / j . dib . 2018 . 11 . 134", - "url": "https://doi.org/10.1016/j.dib.2018.11.134", - "start_index": 0 - } - ], - "page_number": 7, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "dbe83d8d2b6784a17d8faae3633b97f9", - "text": "References", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 7, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "d08513d888e4133fda75841dd05273d9", - "text": "[1] O. Sanni, A.P.I. Popoola, O.S.I. Fayomi, Enhanced corrosion resistance of stainless steel type 316 in sulphuric acid solution", - "metadata": { - "languages": [ - "eng" - ], - "links": [ - { - "text": "O . Sanni , A . P . I . Popoola , O . S . I . Fayomi , Enhanced corrosion resistance of stainless steel type 316 in sulphuric acid solution", - "url": "http://refhub.elsevier.com/S2352-3409(18)31527-0/sbref1", - "start_index": 4 - } - ], - "page_number": 7, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "29736d79aeb1e5fc195876dbf12f1c57", - "text": "using eco-friendly waste product, Results Phys. 9 (2018) 225\u2013230.", - "metadata": { - "languages": [ - "eng" - ], - "links": [ - { - "text": "using eco - friendly waste product , Results Phys . 
9 ( 2018 ) 225 \u2013", - "url": "http://refhub.elsevier.com/S2352-3409(18)31527-0/sbref1", - "start_index": 0 - }, - { - "text": "using eco - friendly waste product , Results Phys . 9 ( 2018 ) 225 \u2013 230 .", - "url": "http://refhub.elsevier.com/S2352-3409(18)31527-0/sbref1", - "start_index": 0 - } - ], - "page_number": 7, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "ca40f2c0d5a95e8cddab1c3b76f95e9e", - "text": "[2] O. Sanni, A.P.I. Popoola, A. Kolesnikov, Constitutive modeling for prediction of optimal process parameters in corrosion", - "metadata": { - "languages": [ - "eng" - ], - "links": [ - { - "text": "O . Sanni , A . P . I . Popoola , A . Kolesnikov , Constitutive modeling for prediction of optimal process parameters in corrosion", - "url": "http://refhub.elsevier.com/S2352-3409(18)31527-0/sbref2", - "start_index": 4 - } - ], - "page_number": 7, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "e42cb45853ffd3e2c81095a126918c6c", - "text": "inhibition of austenitic stainless steel (Type 316)/acidic medium, Mater. Res. Express. 5 (10) (2018) 1\u201315.", - "metadata": { - "languages": [ - "eng" - ], - "links": [ - { - "text": "inhibition of austenitic stainless steel ( Type 316 )/ acidic medium , Mater . Res . Express . 
5 ( 10 ) ( 2018 ) 1 \u2013", - "url": "http://refhub.elsevier.com/S2352-3409(18)31527-0/sbref2", - "start_index": 0 - }, - { - "text": "inhibition of austenitic stainless steel ( Type 316 )/ acidic medium , Mater . Res . Express . 5 ( 10 ) ( 2018 ) 1 \u2013 15 .", - "url": "http://refhub.elsevier.com/S2352-3409(18)31527-0/sbref2", - "start_index": 0 - } - ], - "page_number": 7, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "610ae41b07604b353631457b9a4ad632", - "text": "[3] O. Sanni, A.P.I. Popoola, O.S.I. Fayomi, The inhibitive study of egg shell powder on UNS N08904 austenitic stainless steel", - "metadata": { - "languages": [ - "eng" - ], - "links": [ - { - "text": "O . Sanni , A . P . I . Popoola , O . S . I . Fayomi , The inhibitive study of egg shell powder on UNS N08904 austenitic stainless steel", - "url": "http://refhub.elsevier.com/S2352-3409(18)31527-0/sbref3", - "start_index": 4 - } - ], - "page_number": 7, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "ae14702f67ee1c5d2e5316e8344a6971", - "text": "corrosion in chloride solution, Def. Technol. 14 (2018) 463\u2013468.", - "metadata": { - "languages": [ - "eng" - ], - "links": [ - { - "text": "corrosion in chloride solution , Def . Technol . 14 ( 2018 ) 463 \u2013", - "url": "http://refhub.elsevier.com/S2352-3409(18)31527-0/sbref3", - "start_index": 0 - }, - { - "text": "corrosion in chloride solution , Def . Technol . 
14 ( 2018 ) 463 \u2013 468 .", - "url": "http://refhub.elsevier.com/S2352-3409(18)31527-0/sbref3", - "start_index": 0 - } - ], - "page_number": 7, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "d1c8e3e15192f1bdcda9cf8e38a5573f", - "text": "[4] O. Sanni, A.P.I. Popoola, O.S.I. Fayomi, C.A. Loto, A comparative study of inhibitive effect of waste product on stainless steel corrosion in sodium chloride/sulfuric acid environments, Metallogr. Microstruct. Anal. (2018) 1\u201317. https://doi.org/10.1007/ s13632-018-0495-5.", - "metadata": { - "languages": [ - "eng" - ], - "links": [ - { - "text": "https", - "url": "https://doi.org/10.1007/s13632-018-0495-5", - "start_index": 233 - }, - { - "text": "https", - "url": "https://doi.org/10.1007/s13632-018-0495-5", - "start_index": 233 - }, - { - "text": "https :// doi . org / 10 . 1007 /", - "url": "https://doi.org/10.1007/s13632-018-0495-5", - "start_index": 233 - }, - { - "text": "s13632 - 018 - 0495 - 5", - "url": "https://doi.org/10.1007/s13632-018-0495-5", - "start_index": 258 - } - ], - "page_number": 7, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "3827d49ec98a215986f78d1df2ae2d33", - "text": "[5] O. Sanni, A.P.I. Popoola, O.S.I. Fayomi, Inhibition of engineering material in sulphuric acid solution using waste product, Contributed Papers from Materials Science and Technology (MS&T18), 2018. 
\u2329https://doi.org/10.7449/2018/MST_2018_254_261\u232a.", - "metadata": { - "languages": [ - "eng" - ], - "links": [ - { - "text": "https :// doi . org / 10 . 7449 / 2018 / MST _ 2018 _ 254 _ 261", - "url": "https://doi.org/10.7449/2018/MST_2018_254_261", - "start_index": 202 - } - ], - "page_number": 7, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "7fbcd3b873966a649efd837300e0c576", - "text": "457", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 7, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/65/11/main.PMC6312790.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - } -] \ No newline at end of file diff --git a/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/biomed-api/75/29/main.PMC6312793.pdf.json b/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/biomed-api/75/29/main.PMC6312793.pdf.json deleted file mode 100644 index 908e9e125a..0000000000 --- a/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/biomed-api/75/29/main.PMC6312793.pdf.json +++ /dev/null @@ -1,2514 +0,0 @@ -[ - { - "type": "Header", - "element_id": "d25e5f46b5be5f4c8a6573d0688dae93", - "text": "Data in Brief 22 (2019) 484\u2013487", - "metadata": { - "languages": [ - "eng" - ], - "links": [ - { - "text": "Data in Brief 22 ( 2019 ) 484 \u2013 487", - "url": "https://doi.org/10.1016/j.dib.2018.12.055", - "start_index": 0 - } - ], - "page_number": 1, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": 
"/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "ffd4c08fe1f13ed4b1c1c523ead5510b", - "text": "Contents lists available at ScienceDirect", - "metadata": { - "languages": [ - "eng" - ], - "links": [ - { - "text": "ScienceDirect", - "url": "www.sciencedirect.com/science/journal/23523409", - "start_index": 28 - } - ], - "page_number": 1, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "ab45cdb29d177758321b79d0e5430958", - "text": "Data in Brief", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 1, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "b6ed6a9bb542e0891cebca3fa85e6bcd", - "text": "journal homepage: www.elsevier.com/locate/dib", - "metadata": { - "languages": [ - "eng" - ], - "links": [ - { - "text": "www . elsevier . 
com / locate / dib", - "url": "www.elsevier.com/locate/dib", - "start_index": 18 - } - ], - "page_number": 1, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "1acc2228e407a58c34b39c30aed641fe", - "text": "Data Article", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 1, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "798dd79fdd2f8266cf92f28200198e08", - "text": "A benchmark dataset for the multiple depot vehicle scheduling problem", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 1, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "8edd00e1188d7cb75051b1998ee494a9", - "text": "Sarang Kulkarni a,b,c,n, Mohan Krishnamoorthy d,e, Abhiram Ranade f, Andreas T. 
Ernst c, Rahul Patil b", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 1, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "7d3eb41c30b752ac6026851e8119f642", - "text": "a IITB-Monash Research Academy, IIT Bombay, Powai, Mumbai 400076, India b SJM School of Management, IIT Bombay, Powai, Mumbai 400076, India c School of Mathematical Sciences, Monash University, Clayton, VIC 3800, Australia d Department of Mechanical and Aerospace Engineering, Monash University, Clayton, VIC 3800, Australia e School of Information Technology and Electrical Engineering, The University of Queensland, QLD 4072, Australia f Department of Computer Science and Engineering, IIT Bombay, Powai, Mumbai 400076, India", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 1, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "3f086bae7b6270727b6fca8ba4563fd7", - "text": "a r t i c l e i n f o", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 1, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "a951e8fba28630797a561ae24142f1b9", - "text": "a b s t r a c t", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 1, - "filetype": "application/pdf", - 
"data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "90549df65b3824f67f0290bc96644155", - "text": "Article history: Received 21 November 2018 Received in revised form 13 December 2018 Accepted 15 December 2018 Available online 18 December 2018", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 1, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "3e158fd01d34697ac14890732b84a1fc", - "text": "This data article presents a description of a benchmark dataset for the multiple depot vehicle scheduling problem (MDVSP). The MDVSP is to assign vehicles from different depots to timetabled trips to minimize the total cost of empty travel and waiting. The dataset has been developed to evaluate the heuristics of the MDVSP that are presented in \u201cA new formulation and a column generation-based heuristic for the multiple depot vehicle sche- duling problem\u201d (Kulkarni et al., 2018). The dataset contains 60 problem instances of varying size. Researchers can use the dataset to evaluate the future algorithms for the MDVSP and compare the performance with the existing algorithms. 
The dataset includes a program that can be used to generate new problem instances of the MDVSP.", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 1, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "298de5d25d4db319d8cb1c4da4e14411", - "text": "& 2018 Published by Elsevier Inc. This is an open access article under the CC BY-NC-ND license (http://creativecommons.org/licenses/by-nc-nd/4.0/).", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 1, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "25ce21c9671271c1639f549d88644f16", - "text": "DOI of original article: https://doi.org/10.1016/j.trb.2018.11.007 n Corresponding author at: IITB-Monash Research Academy, IIT Bombay, Powai, Mumbai 400076, India.", - "metadata": { - "languages": [ - "eng" - ], - "links": [ - { - "text": "https :// doi . org / 10 . 1016 / j . trb . 2018 . 11 . 007 n Corresponding author at", - "url": "http://dx.doi.org/10.1016/j.trb.2018.11.007", - "start_index": 25 - }, - { - "text": "https", - "url": "http://dx.doi.org/10.1016/j.trb.2018.11.007", - "start_index": 25 - }, - { - "text": "https :// doi . org / 10 . 1016 / j . trb . 2018 . 11 . 
007", - "url": "http://dx.doi.org/10.1016/j.trb.2018.11.007", - "start_index": 25 - } - ], - "page_number": 1, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "b4b1b0bb1bf27aa4de6d404b9304fb02", - "text": "E-mail address: sarangkulkarni@iitb.ac.in (S. Kulkarni).", - "metadata": { - "languages": [ - "eng" - ], - "links": [ - { - "text": "sarangkulkarni @ iitb . ac . in", - "url": "mailto:sarangkulkarni@iitb.ac.in", - "start_index": 16 - } - ], - "page_number": 1, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "3bf8a8c86295c8d68682ff1c4594b485", - "text": "https://doi.org/10.1016/j.dib.2018.12.055 2352-3409/& 2018 Published by Elsevier Inc. This is an open access article under the CC BY-NC-ND license (http://creativecommons.org/licenses/by-nc-nd/4.0/).", - "metadata": { - "languages": [ - "eng" - ], - "links": [ - { - "text": "https", - "url": "https://doi.org/10.1016/j.dib.2018.12.055", - "start_index": 0 - }, - { - "text": "https", - "url": "https://doi.org/10.1016/j.dib.2018.12.055", - "start_index": 0 - }, - { - "text": "https :// doi . org / 10 . 1016 / j . dib . 2018 . 12 . 
055", - "url": "https://doi.org/10.1016/j.dib.2018.12.055", - "start_index": 0 - } - ], - "page_number": 1, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Header", - "element_id": "690f7bab68c635029827f497e6c2b218", - "text": "S. Kulkarni et al. / Data in Brief 22 (2019) 484\u2013487", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "e93f43b23b30a616389e12f193fdf212", - "text": "485", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "8b5f19753e010793be1dd03a4efe1876", - "text": "Speci\ufb01cations table", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "b592fc872f2d852ad0242b2353e61673", - "text": "Subject area Operations research More speci\ufb01c subject area Vehicle scheduling Type of data How data were acquired", - "metadata": { - "languages": 
[ - "eng" - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "d2073c6354217f9b2d4d5c654d77f232", - "text": "Tables, text \ufb01les Arti\ufb01cially generated by a C\u00fe \u00fe program on Intels Xeons CPU E5\u2013 2670 v2 with Linux operating system. Raw Sixty randomly generated instances of the MDVSP with the number of depots in (8,12,16) and the number of trips in (1500, 2000, 2500, 3000) Randomly generated instances IITB-Monash Research Academy, IIT Bombay, Powai, Mumbai, India. Data can be downloaded from https://orlib.uqcloud.net/ Kulkarni, S., Krishnamoorthy, M., Ranade, A., Ernst, A.T. and Patil, R., 2018. A new formulation and a column generation-based heuristic for the multiple depot vehicle scheduling problem. Transportation Research Part B: Methodological, 118, pp. 457\u2013487 [3].", - "metadata": { - "languages": [ - "eng" - ], - "links": [ - { - "text": "https :// orlib . uqcloud . 
net /", - "url": "https://orlib.uqcloud.net/", - "start_index": 383 - } - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "156810b54dfdfa06606b2ab9c20e5936", - "text": "Data format Experimental factors", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "f10143ddfaeadcb83593edbd06f6dae5", - "text": "Experimental features Data source location Data accessibility Related research article", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "61e613d4cdb2f24fcb40060db45431c0", - "text": "Value of the data", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "d0dfba5954b055b335476e9249b9a73c", - "text": "(cid:2) The dataset contains 60 different problem instances of the MDVSP that can be used to evaluate the", - "metadata": { 
- "languages": [ - "eng" - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "2956461e611848aeaccd16b99fc03400", - "text": "performance of the algorithms for the MDVSP.", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "2f732a3a72336ba52b0b0de6d0008640", - "text": "(cid:2) The data provide all the information that is required to model the MDVSP by using the existing", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "5bd31208ba63e7a44aeea1fd4d721d54", - "text": "mathematical formulations.", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "038f53e4bdc8c6ea7b1c63f1b9a73e2f", - "text": "(cid:2) All the problem instances are available for use without any restrictions. 
(cid:2) The benchmark solutions and solution time for the problem instances are presented in [3] and can", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "15906f62459fa76ddadb7a7ef1ce556b", - "text": "be used for the comparison.", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "4a39c62bb4f7476ec42fd81325ea6f19", - "text": "(cid:2) The dataset includes a program that can generate similar problem instances of different sizes.", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "ListItem", - "element_id": "414bd3131cd65d5c68e1c7a140297506", - "text": "1. 
Data", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "52c2b4b09c228b90a487fa4fd42a1590", - "text": "The dataset contains 60 different problem instances of the multiple depot vehicle scheduling pro- blem (MDVSP). Each problem instance is provided in a separate \ufb01le. Each \ufb01le is named as \u2018RN-m-n-k.dat\u2019, where \u2018m\u2019, \u2018n\u2019, and \u2018k\u2019 denote the number of depots, the number of trips, and the instance number \u2018RN-8\u20131500-01.dat\u2019, for is the \ufb01rst problem instance with 8 depots and 1500 trips. For the number of depots, m, we used three values, 8,12, and 16. The four values for the number of trips, n, are 1500, 2000, 2500, and 3000. For each size, \u00f0m;n\u00de, \ufb01ve instances are provided. The dataset can be downloaded from https://orlib.uqcloud.net.", - "metadata": { - "languages": [ - "eng" - ], - "links": [ - { - "text": "https :// orlib . uqcloud . 
net", - "url": "https://orlib.uqcloud.net", - "start_index": 609 - } - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "a442f6b8548f2b2be7eb0b0c488eaf3f", - "text": "\u2018\u00f0m;n\u00de\u2019,", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "a1d0fff4ecc99ed0b3792f63af7ac732", - "text": "the size,", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "18ddc61212b977693c3ab4a9e2a98213", - "text": "respectively. 
For example,", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "f5af2f4ccedef8e9c9222943207ddce1", - "text": "the problem instance,", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "20a5ace34ab61e08b1ab35c222c6554f", - "text": "For each problem instance, the following information is provided: The number of depots m\u00f0 The number of trips \u00f0n\u00de, The number of locations \u00f0l\u00de, The number of vehicles at each depot, For each trip iA1;2;\u2026;n, a start time, ts", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "f1d7de16fe466b5c9f0396600da6d3ef", - "text": "\u00de,", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": 
"d07db900a92fbc399e2eac5e0fc704ee", - "text": "i , a start location, ls", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "812eeb4f274baf14170f2447204a4a55", - "text": "i, an end time, te", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "4b917219b5939da4a52a907db733f551", - "text": "i, and an end location, le i ,", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "84e91ae08f7e4ae8996bb4cdbbfb9e32", - "text": "and", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "b1bb94d45fba27ddeefd146fbde1dcc4", - "text": "(cid:2) The travel time, \u03b4ij, between any two locations i;jA1;\u2026;l.", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 2, - 
"filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "5e73cd663ab2449350114f86e23f6bbb", - "text": "All times are in minutes and integers. The planning duration is from 5 a.m. to around midnight. Each instance has two classes of trips, short trips and long trips, with 40% short trips and 60% long trips. The duration of a short trip is less than a total of 45 min and the travel time between the start", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 2, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "87149858e00c98f10a2b08be1b8d584a", - "text": "486", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Header", - "element_id": "5fc26c03275c46c5eb2ae66c0c288d2b", - "text": "S. Kulkarni et al. 
/ Data in Brief 22 (2019) 484\u2013487", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "eeba8dd874b520a36aa718db99dbfd38", - "text": "and end location of the trip. A long trip is about 3\u20135 h in duration and has the same start and end location. For all instances, mrl and the locations 1;\u2026;m correspond to depots, while the remaining locations only appear as trip start and end locations.", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "36bb62577b390f929d88ed7d004c1e3e", - "text": "i \u00fe\u03b4", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "c4a028a7e5a91a69b88a778ed1d4c4c1", - "text": ". 
If le i ls le i j, otherwise, the vehicle may require waiting at le i for the duration of \u00f0ts", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "3351f34f87afe9cffe4fd31320b9ccc8", - "text": "Zte", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "7a378649c353830c59db2e86df7f7368", - "text": "als", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "5066fe5d8ca5d5f91f7312ec35a9a7e8", - "text": "A trip j can be covered after trip i by the same vehicle, if ts j", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "f7296ef349382c5db6f8a271d8f3fe03", - "text": "j, the vehicle must travel empty from le j (cid:3)te i \u00de. 
A schedule is given by the sequence in which a vehicle can cover the trips. The MDVSP is to determine the minimum number of schedules to cover all trips that minimizes total time in waiting and empty travel. The following requirements must be satis\ufb01ed:", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "871530d7bbaa529bbc177fc2a041720e", - "text": "j", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "bfd40d52e047822b7bc341a4741f1f73", - "text": "i to ls", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "ListItem", - "element_id": "a8f50afa154ed8c4545362eeb8ca5799", - "text": "1. Each schedule should start and end at the same depot. 2. Each trip should be covered by only one vehicle. 3. 
The number of schedules that start from a depot should not exceed the number of vehicles at the depot.", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "3dbb489d8594d6744d2fce9cdcde691c", - "text": "A suf\ufb01cient number of vehicles are provided to maintain the feasibility of an instance. For each instance size \u00f0m;n\u00de, Table 1 provides the average of the number of locations, the number of times, the number of vehicles, and the number of possible empty travels, over \ufb01ve instances. The number of locations includes m distinct locations for depots and the number of locations at which various trips start or end. The number of times includes the start and the end time of the planning horizon and the start/end times for the trips. The number of vehicles is the total number of vehicles from all the depots. The number of possible empty travels is the number of possible connections between trips that require a vehicle travelling empty between two consecutive trips in a schedule.", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "7490a379155c95007ad9649ec7689e35", - "text": "The description of the \ufb01le for each problem instance is presented in Table 2. 
The \ufb01rst line in the \ufb01le provides the number of depots \u00f0m\u00de, the number of trips, \u00f0n\u00de, and the number of locations \u00f0l\u00de, in the problem instance. The next n lines present the information for n trips. Each line corresponds to a trip, iA 1;\u2026;n g, and provides the start location, the start time, the end location, and the end time of trip i. The next l lines present the travel times between any two locations, i;jA 1;\u2026;l", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "924fc12bebb375f9c74313489cf16217", - "text": "f", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "028c5c64e9591944e620e8308f516b5a", - "text": "(cid:1)", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "ce73daceb6d992f6af62cceb4a3d424f", - "text": "(cid:3)", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": 
"/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "4c3e98e95e0007df7a9e116f5df403c8", - "text": ".", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "0b37e732b73efa9dbd994f164dac8d5c", - "text": "The dataset also includes a program \u2018GenerateInstance.cpp\u2019 that can be used to generate new instances. The program takes three inputs, the number of depots \u00f0m\u00de, the number of trips \u00f0n\u00de, and the number of instances for each size \u00f0m;n\u00de.", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "155c4752aa12e6b82164f5ac49103a19", - "text": "Table 1 Average number of locations, times, vehicles and empty travels for each instance size.", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "6d92abd137f1e1a6f7d9ecfa1edb0cf4", - "text": "Instance size (m, n)", - "metadata": { - 
"languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "bcd163c5719297fd86b9eebacf8a9c24", - "text": "Average number of", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "204a9747099a8efd4aa0b05c9e5c38d2", - "text": "Locations", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "327cb3d0fb60857fee3d8f0c2c78d613", - "text": "Times", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "6592bb72dcd3912aa6fabc3df525aeda", - "text": "Vehicles", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - 
"permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "80ce4476651a7ac735c554343aeb749f", - "text": "Possible empty travels", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "71a7492ba9c12eef52065aabaebc3a7c", - "text": "(8, 1500) (8, 2000) (8, 2500) (8, 3000) (12, 1500) (12, 2000) (12, 2500) (12, 3000) (16, 1500) (16, 2000) (16, 2500) (16, 3000)", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "7701857f59bdba5844b24edc32749d05", - "text": "568.40 672.80 923.40 977.00 566.00 732.60 875.00 1119.60 581.80 778.00 879.00 1087.20", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "2bf95679e315fbbd9f0ceb0ce36d9197", - "text": "975.20 1048.00 1078.00 1113.20 994.00 1040.60 1081.00 1107.40 985.40 1040.60 1083.20 1101.60", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": 
"/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "da4ae500af3e46e7446a28cddd32679c", - "text": "652.20 857.20 1082.40 1272.80 642.00 861.20 1096.00 1286.20 667.80 872.40 1076.40 1284.60", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "e21d6005188c8a7bfcb95e42868b986c", - "text": "668,279.40 1,195,844.80 1,866,175.20 2,705,617.00 674,191.00 1,199,659.80 1,878,745.20 2,711,180.40 673,585.80 1,200,560.80 1,879,387.00 2,684,983.60", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 3, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Header", - "element_id": "fa23407a7c3c99ae3b6fb79034698807", - "text": "S. Kulkarni et al. 
/ Data in Brief 22 (2019) 484\u2013487", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "0a4152d3ee312a3d28cc2b63d6f59a6e", - "text": "Table 2 Description of \ufb01le format for each problem instance.", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "d66486bdc6e5b4d6e2018f7da6d0b0d0", - "text": "Number of lines", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "6c56043a98b068693db3cd6ded0bc020", - "text": "Number of columns in each line", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "2fc6800b1896d3d2779ee6e98794bdb1", - "text": "Description", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - 
"record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "a5efd069cfcb8d3c983dfab2b9336b0e", - "text": "1 1 n", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "1d96bbba9ffa9a12e81da0426f80a9fc", - "text": "l", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "25f80b4c6652f9af1a6883a6e4b8c0bb", - "text": "3 m 4", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "516ec572955aa07f031d27cc89008615", - "text": "l", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - 
"element_id": "c981c256386d57e68a2c947147f30229", - "text": "The number of depots, the number of trips, and the number of locations. The number of vehicles rd at each depot d. One line for each trip, i \u00bc 1;2;\u2026;n. Each line provides the start location ls time ts i and the end time te i for the corresponding trip. Each element, \u03b4ij; where i;jA1;2;\u2026;l, refers to the travel time between location i and location j.", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "e6e8997790263be5ca103754ee56e234", - "text": "i, the start", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "49f536ed0f91f7e6d8ad1d70d71991b0", - "text": "i, the end location le", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "ListItem", - "element_id": "0f605e650a81abc6b5a30423d60d0975", - "text": "2. 
Experimental design, materials, and methods", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "37200c447b8f7e1443b707c1e76e66b0", - "text": "The procedure presented by Carpaneto et al. in [1] is used to generate the problem instances. The same procedure has been used by Pepin et al. in [4] to generate the benchmark dataset of the MDVSP. A detailed description of the procedure is presented in [3].", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "92e466c917445c0d473eea592acc3b72", - "text": "Our dataset provides start/end location and time of trips as well as the travel time between any two locations. The location and time information is required to model the MDVSP on a time-space network. The feasible connections and the cost of connections between the trips can be obtained as discussed in [3]. Thus, the dataset has all the information that is required to model the MDVSP on the time-space network (see [2]) as well as the connection-network (see [5]). 
The benchmark solutions for all the problem instances are presented in [3].", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "d89dfb5247b731abfe90aedc46c09806", - "text": "Transparency document. Supporting information", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "9a157bb2a3ee3ac55ecf743df0020ce9", - "text": "Transparency document associated with this article can be found in the online version at https://doi.", - "metadata": { - "languages": [ - "eng" - ], - "links": [ - { - "text": "https :// doi .", - "url": "https://doi.org/10.1016/j.dib.2018.12.055", - "start_index": 89 - } - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "fb1ccb68103598fae7cc8128c97711d9", - "text": "org/10.1016/j.dib.2018.12.055.", - "metadata": { - "languages": [ - "eng" - ], - "links": [ - { - "text": "org / 10 . 1016 / j . dib . 2018 . 12 . 
055", - "url": "https://doi.org/10.1016/j.dib.2018.12.055", - "start_index": 0 - } - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "a63064fd9987765c33c9d20047dc2f15", - "text": "References", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "909007a841d32eb20886f7fc2d923911", - "text": "[1] G. Carpaneto, M. Dell'Amico, M. Fischetti, P. Toth, A branch and bound algorithm for the multiple depot vehicle scheduling", - "metadata": { - "languages": [ - "eng" - ], - "links": [ - { - "text": "G . Carpaneto , M . Dell ' Amico , M . Fischetti , P . 
Toth , A branch and bound algorithm for the multiple depot vehicle scheduling", - "url": "http://refhub.elsevier.com/S2352-3409(18)31594-4/sbref1", - "start_index": 4 - } - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "b1902a32b19337484e93efd9509a07c1", - "text": "problem, Networks 19 (5) (1989) 531\u2013548.", - "metadata": { - "languages": [ - "eng" - ], - "links": [ - { - "text": "problem , Networks 19 ( 5 ) ( 1989 ) 531 \u2013", - "url": "http://refhub.elsevier.com/S2352-3409(18)31594-4/sbref1", - "start_index": 0 - }, - { - "text": "problem , Networks 19 ( 5 ) ( 1989 ) 531 \u2013 548 .", - "url": "http://refhub.elsevier.com/S2352-3409(18)31594-4/sbref1", - "start_index": 0 - } - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "5a7cc4a5afb4c97c546a3b64cb4f593f", - "text": "[2] N. Kliewer, T. Mellouli, L. Suhl, A time\u2013space network based exact optimization model for multi-depot bus scheduling, Eur.", - "metadata": { - "languages": [ - "eng" - ], - "links": [ - { - "text": "N . Kliewer , T . Mellouli , L . 
Suhl , A time \u2013 space network based exact optimization model for multi - depot bus scheduling , Eur .", - "url": "http://refhub.elsevier.com/S2352-3409(18)31594-4/sbref2", - "start_index": 4 - } - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "6a1cb7145ede91c5d2e6bb53b4d59f65", - "text": "J. Oper. Res. 175 (3) (2006) 1616\u20131627.", - "metadata": { - "languages": [ - "eng" - ], - "links": [ - { - "text": "J . Oper . Res . 175 ( 3 ) ( 2006 ) 1616 \u2013", - "url": "http://refhub.elsevier.com/S2352-3409(18)31594-4/sbref2", - "start_index": 0 - }, - { - "text": "J . Oper . Res . 175 ( 3 ) ( 2006 ) 1616 \u2013 1627 .", - "url": "http://refhub.elsevier.com/S2352-3409(18)31594-4/sbref2", - "start_index": 0 - } - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "439a02aad982d445100cc246cd066b53", - "text": "[3] S. Kulkarni, M. Krishnamoorthy, A. Ranade, A.T. Ernst, R. Patil, A new formulation and a column generation-based heuristic", - "metadata": { - "languages": [ - "eng" - ], - "links": [ - { - "text": "S . Kulkarni , M . Krishnamoorthy , A . Ranade , A . T . Ernst , R . 
Patil , A new formulation and a column generation - based heuristic", - "url": "http://refhub.elsevier.com/S2352-3409(18)31594-4/sbref3", - "start_index": 4 - } - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "46a8bd54aa6c1bd32118f4a681faaec9", - "text": "for the multiple depot vehicle scheduling problem, Transp. Res. Part B Methodol. 118 (2018) 457\u2013487.", - "metadata": { - "languages": [ - "eng" - ], - "links": [ - { - "text": "for the multiple depot vehicle scheduling problem , Transp . Res . Part B Methodol . 118 ( 2018 ) 457 \u2013", - "url": "http://refhub.elsevier.com/S2352-3409(18)31594-4/sbref3", - "start_index": 0 - }, - { - "text": "for the multiple depot vehicle scheduling problem , Transp . Res . Part B Methodol . 118 ( 2018 ) 457 \u2013 487 .", - "url": "http://refhub.elsevier.com/S2352-3409(18)31594-4/sbref3", - "start_index": 0 - } - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "f60e59177f5f0e53e3f285fa68a8e3ef", - "text": "[4] A.S. Pepin, G. Desaulniers, A. Hertz, D. Huisman, A comparison of \ufb01ve heuristics for the multiple depot vehicle scheduling", - "metadata": { - "languages": [ - "eng" - ], - "links": [ - { - "text": "A . S . Pepin , G . Desaulniers , A . Hertz , D . 
Huisman , A comparison of \ufb01ve heuristics for the multiple depot vehicle scheduling", - "url": "http://refhub.elsevier.com/S2352-3409(18)31594-4/sbref4", - "start_index": 4 - } - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "0f8229a10050ec65ae5b6f9f66c6ca47", - "text": "problem, J. Sched. 12 (1) (2009) 17.", - "metadata": { - "languages": [ - "eng" - ], - "links": [ - { - "text": "problem , J . Sched . 12 ( 1 ) ( 2009 ) 17 .", - "url": "http://refhub.elsevier.com/S2352-3409(18)31594-4/sbref4", - "start_index": 0 - } - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "9f411677c0a8ddb06047e600b348e282", - "text": "[5] C.C. Ribeiro, F. Soumis, A column generation approach to the multiple-depot vehicle scheduling problem, Oper. Res. 42 (1)", - "metadata": { - "languages": [ - "eng" - ], - "links": [ - { - "text": "C . C . Ribeiro , F . Soumis , A column generation approach to the multiple - depot vehicle scheduling problem , Oper . Res . 
42 ( 1 )", - "url": "http://refhub.elsevier.com/S2352-3409(18)31594-4/sbref5", - "start_index": 4 - } - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "e37f78c7271830eb805f560368fec7cc", - "text": "(1994) 41\u201352.", - "metadata": { - "languages": [ - "eng" - ], - "links": [ - { - "text": "( 1994 ) 41 \u2013", - "url": "http://refhub.elsevier.com/S2352-3409(18)31594-4/sbref5", - "start_index": 0 - }, - { - "text": "( 1994 ) 41 \u2013 52 .", - "url": "http://refhub.elsevier.com/S2352-3409(18)31594-4/sbref5", - "start_index": 0 - } - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "94e316e08a4a19eed59d29d5d58703ce", - "text": "487", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 4, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-api/75/29/main.PMC6312793.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - } -] \ No newline at end of file diff --git a/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/biomed-path/07/07/sbaa031.073.PMC7234218.pdf.json b/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/biomed-path/07/07/sbaa031.073.PMC7234218.pdf.json deleted file mode 100644 index ed4d55b17c..0000000000 --- 
a/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/biomed-path/07/07/sbaa031.073.PMC7234218.pdf.json +++ /dev/null @@ -1,310 +0,0 @@ -[ - { - "type": "Header", - "element_id": "13c2cd4a987063cb9fe6802f8d9d8bba", - "text": "S32", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 1, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-path/07/07/sbaa031.073.PMC7234218.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "6e95de55fbc805ac11d5e168881e41eb", - "text": "ns; 40 mg/day=3.6%, p<0.05; 80 mg/day=4.9%, p<0.01; 120 mg/day=9.3%, p<0.001, PM dosing group: 20 mg/day=-0.4%, ns; 40 mg/day=2.8%, p<0.05: 80 mg/day=0.2%, ns; 160 mg/day=5.8%, p<0.05). There was no clear dose-dependent trend associated with nausea and RD was similar between AM and PM dosing group (AM dosing group: 20 mg/ day=0.2% ns; 40 mg/day=3.8%, p<0.05; 80 mg/day=3.8%, ns; 120 mg/ day=6.6%, ns, PM dosing group: 20 mg/day=-1.6%, ns; 40 mg/day=-1.7%, ns; 80 mg/day=5.5%, p<0.01; 160 mg/day=2.8%, ns). Discussion: The risk of adverse events in the treatment of schizophrenia with lurasidone can vary depending on the timing of administration. In particular, for akathisia and somnolence, the incidence risks were reduced when lurasidone was administered in PM. Unlike with AM administration, the dose-dependence in the risks of these adverse events were not observed in lurasidone PM administration. 
The timing of lurasidone administration could be considered in effort to minimize potential adverse events.", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 1, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-path/07/07/sbaa031.073.PMC7234218.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "c0ad446ac0e663713724aa5f42d20448", - "text": "S6. SLEEP ENDOPHENOTYPES OF SCHIZOPHRENIA: A HIGH-DENSITY EEG STUDY IN DRUG-NA\u00cfVE, FIRST EPISODE PSYCHOSIS PATIENTS", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 1, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-path/07/07/sbaa031.073.PMC7234218.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "21facf77763c3e990a3db1b8626c133a", - "text": "Anna Castelnovo1, Cecilia Casetta2, Francesco Donati3, Renata del Giudice3, Caroline Zangani3, Simone Sarasso3, Armando D\u2019Agostino*3 1Faculty of Biomedical Sciences, Universit\u00e0 della Svizzera Italiana, Switzerland; 2Institute of Psychiatry, Psychology and Neuroscience, King\u2019s College London, England; 3Universit\u00e0 degli Studi di Milano, Italy", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 1, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-path/07/07/sbaa031.073.PMC7234218.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "26b6989522e94c2c7ef5c2633e41cf72", - "text": "Background: Slow waves, the hallmark of the deep nonrapid eye move- ment 
sleep electroencephalogram (EEG), are critical for restorative sleep and brain plasticity. They arise from the synchronous depolarization and hyperpolarization of millions of cortical neurons and their proper gen- eration and propagation relies upon the integrity of widespread cortico- thalamic networks. Slow wave abnormalities have been reported in patient with Schizophrenia, although with partially contradictory results, probably related to antipsychotic and sedative medications. Recently, their presence and delineation, have been convincingly shown in first-episode psychosis patients (FEP). However, clear evidence of this biomarker at the onset of the disease, prior to any psychopharmacological intervention, remains limited. Moreover, no attempt has been made to elucidate the prognostic meaning of this finding. Methods: We collected whole night sleep high\u2013density electroencephalog- raphy recordings (64-channel BrainAmp, Brain Products GmbH, Gilching, Germany) in 20 drug-naive FEP patients and 20 healthy control subjects (HC). Several clinical psychometric scales as well as neurocognitive tests were administered to all subjects in order to better define psychopatholog- ical status and vulnerability. EEG slow wave activity (SWA, spectral power between 1 and 4 Hz) and several slow wave parameters were computed at each electrode location, including density and amplitude, at each electrode location. Along with a group analysis between FEP and HC, a subgroup analysis was also computed between patients who showed a progression of symptoms to full-blown Schizophrenia (SCZ, n = 10) over the next 12-month follow-up and those who did not (OTH, n = 10). Results: Sleep macro-architecture was globally preserved in FEP patients. SWA (1\u20134 Hz) was lower in FEP compared to HC but this difference didn\u2019t reach statistical significance. 
Slow wave density was decreased in FEP compared to HC, with a significance that survived multiple comparison correction over a large fronto-central cluster. Mean amplitude was pre- served. At the subgroup analysis, these results were largely driven by the subgroup of patients with a confirmed diagnosis of SCZ at a 12-month fol- low-up. Indeed, no difference could be found between OTH and HC, while a strong significance was still evident between SCZ and HC.", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 1, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-path/07/07/sbaa031.073.PMC7234218.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Footer", - "element_id": "b38798d4ed1cda1c49ed2db924d40039", - "text": "SIRS 2020 Abstracts", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 1, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-path/07/07/sbaa031.073.PMC7234218.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Header", - "element_id": "6681a3fc2e2bbc7efabbf221baaeec6b", - "text": "Poster Session I", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 1, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-path/07/07/sbaa031.073.PMC7234218.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "418368d1fe238e68fc6c8663f7485649", - "text": "Discussion: Our data confirm previous findings on reduced slow wave density in FEP, and expand them to acute subjects, before any treatment is prescribed. 
This is in line with available data on diffuse abnormalities of cortico-cortical and cortico-thalamic networks in these patients. Interestingly, our data also offer preliminary evidence that this deficit is specific for SCZ, as it appears to differentiate patients who developed SCZ from those with other diagnoses at follow-up. Given the traveling properties of slow waves, future research should establish their potential as markers of connectivity in SCZ.", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 1, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-path/07/07/sbaa031.073.PMC7234218.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "2693595cd6fc5be02dc752b089f85eea", - "text": "S7. INVESTIGATING THE LINK BETWEEN THE PERIPHERAL ENDOCANNABINOID SYSTEM AND CENTRAL GLUTAMATERGIC NEUROTRANSMISSION IN EARLY PSYCHOSIS: A 7T-MRS STUDY", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 1, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-path/07/07/sbaa031.073.PMC7234218.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "3f2d8de4445801a7562416267c06a877", - "text": "Amedeo Minichino*1, Beata Godlewska1, Philip Cowen1, Philip Burnet1, Belinda Lennox1 1University of Oxford", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 1, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-path/07/07/sbaa031.073.PMC7234218.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - 
"element_id": "741c946db28df5068fb60063dad37d27", - "text": "Background: Meta-analytic evidence showed increased levels of periph- eral endocannabinoid metabolites in psychotic illness. Alterations in the endocannabinoid system are believed to compromise glutamate and do- pamine transmission, which play a central role in pathophysiological models of psychosis. I will present preliminary data from an ongoing high-field proton magnetic resonance spectroscopy (MRS) study aimed at investigating the association between peripheral levels of endocannabinoid system metabolites and central glutamate metabolism in individuals at their first non-affective psychotic episode (NA-FEP) and healthy controls. Methods: We expect to recruit 17 NA-FEP and 20 healthy controls by January 2020. Currently, we recruited 12 NA-FEP and 18 healthy controls from two different research facilities (Imperial College London and University of Oxford) as part of a cross-sectional study. Participants un- derwent MRS scanning at 7-T with voxels placed in right dorsolateral prefrontal cortex (right-DLPFC), anterior cingulate cortex (ACC), and oc- cipital cortex. Neuro-metabolites will be calculated using the unsuppressed water signal as reference. Endocannabinoid metabolites were quantified from serum samples, collected during the same imaging session. Results: Analyses are ongoing. Based on previous evidence, expected findings are: (i) reduced glutamate levels in the ACC and right-DLPFC of NA-FEP compared to controls; (ii) increased peripheral endocannabinoid metabolites in NA-FEP compared to controls; and (iii) inverse association between peripheral endocannabinoid metabolites and glutamate levels in ACC and right-DLPFC in NA-FEP Discussion: This study will help clarifying the contribution of peripheral endocannabinoid system to central brain mechanisms of key relevance for psychotic illness. 
It will also add further evidence on the limited literature on high-resolution characterisation of brain metabolites in early psychosis. Strengths of the study include: (i) use of high-field MRS, which allows the estimation of glutamate-related compounds at higher precision than at lower field strength; (ii) reduced heterogeneity of the clinical sample (only male and NA-FEP). Limitations: small sample size and cross-sectional design.", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 1, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-path/07/07/sbaa031.073.PMC7234218.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "c1543aee0d7efb59052757f7b83a70a9", - "text": "S8. GRIN1 PROMOTER METHYLATION CHANGES IN BLOOD OF EARLY-ONSET PSYCHOTIC PATIENTS AND UNAFFECTED SIBLINGS WITH CHILDHOOD TRAUMA", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 1, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-path/07/07/sbaa031.073.PMC7234218.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "5afb27a02de3e7a95c0f2fa442e32526", - "text": "Camila Loureiro*1, Corsi-Zuelli Fabiana1, Fachim Helene Aparecida1, Shuhama Rosana1, Menezes Paulo Rossi1, Dalton Caroline F2,", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 1, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-path/07/07/sbaa031.073.PMC7234218.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": 
"0d80b62dd72121dd5263df8605849cf4", - "text": "AQ3", - "metadata": { - "languages": [ - "eng" - ], - "page_number": 1, - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/biomed-path/07/07/sbaa031.073.PMC7234218.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - } -] \ No newline at end of file diff --git a/test_unstructured_ingest/expected-structured-output/s3-minio/wiki_movie_plots_small.csv.json b/test_unstructured_ingest/expected-structured-output/s3-minio/wiki_movie_plots_small.csv.json index b2b5163a55..4ab84f68e4 100644 --- a/test_unstructured_ingest/expected-structured-output/s3-minio/wiki_movie_plots_small.csv.json +++ b/test_unstructured_ingest/expected-structured-output/s3-minio/wiki_movie_plots_small.csv.json @@ -2,9 +2,9 @@ { "type": "Table", "element_id": "dd7ef5654ad25579067c5f95d3515acf", - "text": "Release Year Title Origin/Ethnicity Director Cast Genre Wiki Page Plot 1901 Kansas Saloon Smashers American Unknown unknown https://en.wikipedia.org/wiki/Kansas_Saloon_Smashers A bartender is working at a saloon, serving drinks to customers. After he fills a stereotypically Irish man's bucket with beer, Carrie Nation and her followers burst inside. They assault the Irish man, pulling his hat over his eyes and then dumping the beer over his head. The group then begin wrecking the bar, smashing the fixtures, mirrors, and breaking the cash register. The bartender then sprays seltzer water in Nation's face before a group of policemen appear and order everybody to leave.[1] 1901 Love by the Light of the Moon American Unknown unknown https://en.wikipedia.org/wiki/Love_by_the_Light_of_the_Moon The moon, painted with a smiling face hangs over a park at night. A young couple walking past a fence learn on a railing and look up. The moon smiles. They embrace, and the moon's smile gets bigger. They then sit down on a bench by a tree. 
The moon's view is blocked, causing him to frown. In the last scene, the man fans the woman with his hat because the moon has left the sky and is perched over her shoulder to see everything better. 1901 The Martyred Presidents American Unknown unknown https://en.wikipedia.org/wiki/The_Martyred_Presidents The film, just over a minute long, is composed of two shots. In the first, a girl sits at the base of an altar or tomb, her face hidden from the camera. At the center of the altar, a viewing portal displays the portraits of three U.S. Presidents\u2014Abraham Lincoln, James A. Garfield, and William McKinley\u2014each victims of assassination.\\r\\nIn the second shot, which runs just over eight seconds long, an assassin kneels feet of Lady Justice. 1901 Terrible Teddy, the Grizzly King American Unknown unknown https://en.wikipedia.org/wiki/Terrible_Teddy,_the_Grizzly_King Lasting just 61 seconds and consisting of two shots, the first shot is set in a wood during winter. The actor representing then vice-president Theodore Roosevelt enthusiastically hurries down a hillside towards a tree in the foreground. He falls once, but rights himself and cocks his rifle. Two other men, bearing signs reading \"His Photographer\" and \"His Press Agent\" respectively, follow him into the shot; the photographer sets up his camera. \"Teddy\" aims his rifle upward at the tree and fells what appears to be a common house cat, which he then proceeds to stab. \"Teddy\" holds his prize aloft, and the press agent takes notes. The second shot is taken in a slightly different part of the wood, on a path. \"Teddy\" rides the path on his horse towards the camera and out to the left of the shot, followed closely by the press agent and photographer, still dutifully holding their signs. 1902 Jack and the Beanstalk American George S. Fleming, Edwin S. 
Porter unknown https://en.wikipedia.org/wiki/Jack_and_the_Beanstalk_(1902_film) The earliest known adaptation of the classic fairytale, this films shows Jack trading his cow for the beans, his mother forcing him to drop them in the front yard, and beig forced upstairs. As he sleeps, Jack is visited by a fairy who shows him glimpses of what will await him when he ascends the bean stalk. In this version, Jack is the son of a deposed king. When Jack wakes up, he finds the beanstalk has grown and he climbs to the top where he enters the giant's home. The giant finds Jack, who narrowly escapes. The giant chases Jack down the bean stalk, but Jack is able to cut it down before the giant can get to safety. He falls and is killed as Jack celebrates. The fairy then reveals that Jack may return home as a prince. 1903 Alice in Wonderland American Cecil Hepworth May Clark unknown https://en.wikipedia.org/wiki/Alice_in_Wonderland_(1903_film) Alice follows a large white rabbit down a \"Rabbit-hole\". She finds a tiny door. When she finds a bottle labeled \"Drink me\", she does, and shrinks, but not enough to pass through the door. She then eats something labeled \"Eat me\" and grows larger. She finds a fan when enables her to shrink enough to get into the \"Garden\" and try to get a \"Dog\" to play with her. She enters the \"White Rabbit's tiny House,\" but suddenly resumes her normal size. In order to get out, she has to use the \"magic fan.\"\\r\\nShe enters a kitchen, in which there is a cook and a woman holding a baby. She persuades the woman to give her the child and takes the infant outside after the cook starts throwing things around. The baby then turns into a pig and squirms out of her grip. 
\"The Duchess's Cheshire Cat\" appears and disappears a couple of times to Alice and directs her to the Mad Hatter's \"Mad Tea-Party.\" After a while, she leaves.\\r\\nThe Queen invites Alice to join the \"ROYAL PROCESSION\": a parade of marching playing cards and others headed by the White Rabbit. When Alice \"unintentionally offends the Queen\", the latter summons the \"Executioner\". Alice \"boxes the ears\", then flees when all the playing cards come for her. Then she wakes up and realizes it was all a dream. 1903 The Great Train Robbery American Edwin S. Porter western https://en.wikipedia.org/wiki/The_Great_Train_Robbery_(1903_film) The film opens with two bandits breaking into a railroad telegraph office, where they force the operator at gunpoint to have a train stopped and to transmit orders for the engineer to fill the locomotive's tender at the station's water tank. They then knock the operator out and tie him up. As the train stops it is boarded by the bandits\u200d\u2014\u200cnow four. Two bandits enter an express car, kill a messenger and open a box of valuables with dynamite; the others kill the fireman and force the engineer to halt the train and disconnect the locomotive. The bandits then force the passengers off the train and rifle them for their belongings. One passenger tries to escape but is instantly shot down. Carrying their loot, the bandits escape in the locomotive, later stopping in a valley where their horses had been left.\\r\\nMeanwhile, back in the telegraph office, the bound operator awakens, but he collapses again. His daughter arrives bringing him his meal and cuts him free, and restores him to consciousness by dousing him with water.\\r\\nThere is some comic relief at a dance hall, where an Eastern stranger is forced to dance while the locals fire at his feet. The door suddenly opens and the telegraph operator rushes in to tell them of the robbery. 
The men quickly form a posse, which overtakes the bandits, and in a final shootout kills them all and recovers the stolen mail. 1904 The Suburbanite American Wallace McCutcheon comedy https://en.wikipedia.org/wiki/The_Suburbanite The film is about a family who move to the suburbs, hoping for a quiet life. Things start to go wrong, and the wife gets violent and starts throwing crockery, leading to her arrest. 1905 The Little Train Robbery American Edwin Stanton Porter unknown https://en.wikipedia.org/wiki/The_Little_Train_Robbery The opening scene shows the interior of the robbers' den. The walls are decorated with the portraits of notorious criminals and pictures illustrating the exploits of famous bandits. Some of the gang are lounging about, while others are reading novels and illustrated papers. Although of youthful appearance, each is dressed like a typical Western desperado. The \"Bandit Queen,\" leading a blindfolded new recruit, now enters the room. He is led to the center of the room, raises his right hand and is solemnly sworn in. When the bandage is removed from his eyes he finds himself looking into the muzzles of a dozen or more 45's. The gang then congratulates the new member and heartily shake his hand. The \"Bandit Queen\" who is evidently the leader of the gang, now calls for volunteers to hold up a train. All respond, but she picks out seven for the job who immediately leave the cabin.\\r\\nThe next scene shows the gang breaking into a barn. They steal ponies and ride away. Upon reaching the place agreed upon they picket their ponies and leaving them in charge of a trusted member proceed to a wild mountain spot in a bend of the railroad, where the road runs over a steep embankment. The spot is an ideal one for holding up a train. Cross ties are now placed on the railroad track and the gang hide in some bushes close by and wait for the train. The train soon approaches and is brought to a stop. 
The engineer leaves his engine and proceeds to remove the obstruction on the track. While he is bending over one of the gang sneaks up behind them and hits him on the head with an axe, and knocks him senseless down the embankment, while the gang surround the train and hold up the passengers. After securing all the \"valuables,\" consisting principally of candy and dolls, the robbers uncouple the engine and one car and make their escape just in time to avoid a posse of police who appear on the scene. Further up the road they abandon the engine and car, take to the woods and soon reach their ponies.\\r\\nIn the meantime the police have learned the particulars of the hold-up from the frightened passengers and have started up the railroad tracks after the fleeing robbers. The robbers are next seen riding up the bed of a shallow stream and finally reach their den, where the remainder of the gang have been waiting for them. Believing they have successfully eluded their pursuers, they proceed to divide the \"plunder.\" The police, however, have struck the right trail and are in close pursuit. While the \"plunder\" is being divided a sentry gives the alarm and the entire gang, abandoning everything, rush from the cabin barely in time to escape capture. The police make a hurried search and again start in pursuit. The robbers are so hard pressed that they are unable to reach their ponies, and are obliged to take chances on foot. The police now get in sight of the fleeing robbers and a lively chase follows through tall weeds, over a bridge and up a steep hill. Reaching a pond the police are close on their heels. The foremost robbers jump in clothes and all and strike out for the opposite bank. Two hesitate and are captured. Boats are secured and after an exciting tussle the entire gang is rounded up. In the mix up one of the police is dragged overboard. 
The final scene shows the entire gang of bedraggled and crestfallen robbers tied together with a rope and being led away by the police. Two of the police are loaded down with revolvers, knives and cartridge belts, and resemble walking aresenals. As a fitting climax a confederate steals out of the woods, cuts the rope and gallantly rescues the \"Bandit Queen.\" 1905 The Night Before Christmas American Edwin Stanton Porter unknown https://en.wikipedia.org/wiki/The_Night_Before_Christmas_(1905_film) Scenes are introduced using lines of the poem.[2] Santa Claus, played by Harry Eytinge, is shown feeding real reindeer[4] and finishes his work in the workshop. Meanwhile, the children of a city household hang their stockings and go to bed, but unable to sleep they engage in a pillow fight. Santa Claus leaves his home on a sleigh with his reindeer. He enters the children's house through the chimney, and leaves the presents. The children come down the stairs and enjoy their presents. 1906 Dream of a Rarebit Fiend American Wallace McCutcheon and Edwin S. Porter short https://en.wikipedia.org/wiki/Dream_of_a_Rarebit_Fiend_(1906_film) The Rarebit Fiend gorges on Welsh rarebit at a restaurant. When he leaves, he begins to get dizzy as he starts to hallucinate. He desperately tries to hang onto a lamppost as the world spins all around him. A man helps him get home. He falls into bed and begins having more hallucinatory dreams. During a dream sequence, the furniture begins moving around the room. Imps emerge from a floating Welsh rarebit container and begin poking his head as he sleeps. His bed then begins dancing and spinning wildly around the room before flying out the window with the Fiend in it. The bed floats across the city as the Fiend floats up and off the bed. He hangs off the back and eventually gets caught on a weathervane atop a steeple. His bedclothes tear and he falls from the sky, crashing through his bedroom ceiling. 
The Fiend awakens from the dream after falling out of his bed. 1906 From Leadville to Aspen: A Hold-Up in the Rockies American Francis J. Marion and Wallace McCutcheon short action/crime western https://en.wikipedia.org/wiki/From_Leadville_to_Aspen:_A_Hold-Up_in_the_Rockies The film features a train traveling through the Rockies and a hold up created by two thugs placing logs on the line. They systematically rob the wealthy occupants at gunpoint and then make their getaway along the tracks and later by a hi-jacked horse and cart. 1906 Kathleen Mavourneen American Edwin S. Porter short film https://en.wikipedia.org/wiki/Kathleen_Mavourneen_(1906_film) Irish villager Kathleen is a tenant of Captain Clearfield, who controls local judges and criminals. Her father owes Clearfield a large debt. Terence O'More saves the village from Clearfield, causing a large celebration.\\r\\nFilm historian Charles Musser writes of Porter's adaptation, \"O'More not only rescues Kathleen from the villain but, through marriage, renews the family for another generation.\"[1] 1907 Daniel Boone American Wallace McCutcheon and Ediwin S. Porter William Craven, Florence Lawrence biographical https://en.wikipedia.org/wiki/Daniel_Boone_(1907_film) Boone's daughter befriends an Indian maiden as Boone and his companion start out on a hunting expedition. While he is away, Boone's cabin is attacked by the Indians, who set it on fire and abduct Boone's daughter. Boone returns, swears vengeance, then heads out on the trail to the Indian camp. His daughter escapes but is chased. The Indians encounter Boone, which sets off a huge fight on the edge of a cliff. A burning arrow gets shot into the Indian camp. Boone gets tied to the stake and tortured. The burning arrow sets the Indian camp on fire, causing panic. 
Boone is rescued by his horse, and Boone has a knife fight in which he kills the Indian chief.[2] 1907 How Brown Saw the Baseball Game American Unknown Unknown comedy https://en.wikipedia.org/wiki/How_Brown_Saw_the_Baseball_Game Before heading out to a baseball game at a nearby ballpark, sports fan Mr. Brown drinks several highball cocktails. He arrives at the ballpark to watch the game, but has become so inebriated that the game appears to him in reverse, with the players running the bases backwards and the baseball flying back into the pitcher's hand. After the game is over, Mr. Brown is escorted home by one of his friends. When they arrive at Brown's house, they encounter his wife who becomes furious with the friend and proceeds to physically assault him, believing he is responsible for her husband's severe intoxication.[1] 1907 Laughing Gas American Edwin Stanton Porter Bertha Regustus, Edward Boulden comedy https://en.wikipedia.org/wiki/Laughing_Gas_(film)#1907_Film The plot is that of a black woman going to the dentist for a toothache and being given laughing gas. On her way walking home, and in other situations, she can't stop laughing, and everyone she meets \"catches\" the laughter from her, including a vendor and police officers. 1908 The Adventures of Dollie American D. W. Griffith Arthur V. Johnson, Linda Arvidson drama https://en.wikipedia.org/wiki/The_Adventures_of_Dollie On a beautiful summer day a father and mother take their daughter Dollie on an outing to the river. The mother refuses to buy a gypsy's wares. The gypsy tries to rob the mother, but the father drives him off. The gypsy returns to the camp and devises a plan. They return and kidnap Dollie while her parents are distracted. A rescue crew is organized, but the gypsy takes Dollie to his camp. They gag Dollie and hide her in a barrel before the rescue party gets to the camp. Once they leave the gypsies and escapes in their wagon. 
As the wagon crosses the river, the barrel falls into the water. Still sealed in the barrel, Dollie is swept downstream in dangerous currents. A boy who is fishing in the river finds the barrel, and Dollie is reunited safely with her parents. 1908 The Black Viper American D. W. Griffith D. W. Griffith drama https://en.wikipedia.org/wiki/The_Black_Viper A thug accosts a girl as she leaves her workplace but a man rescues her. The thug vows revenge and, with the help of two friends, attacks the girl and her rescuer again as they're going for a walk. This time they succeed in kidnapping the rescuer. He is bound and gagged and taken away in a cart. The girl runs home and gets help from several neighbors. They track the ruffians down to a cabin in the mountains where the gang has trapped their victim and set the cabin on fire. A thug and Rescuer fight on the roof of the house. 1908 A Calamitous Elopement American D.W. Griffith Harry Solter, Linda Arvidson comedy https://en.wikipedia.org/wiki/A_Calamitous_Elopement A young couple decides to elope after being caught in the midst of a romantic moment by the woman's angry father. They make plans to leave, but a thief discovers their plans and hides in their trunk and waits for the right moment to steal their belongings. 1908 The Call of the Wild American D. W. Griffith Charles Inslee adventure https://en.wikipedia.org/wiki/The_Call_of_the_Wild_(1908_film) A white girl (Florence Lawrence) rejects a proposal from an Indian brave (Charles Inslee) in this early one-reel Western melodrama. Despite the rejection, the Indian still comes to the girl's defense when she is abducted by his warring tribe. In her first year in films, Florence Lawrence was already the most popular among the Biograph Company's anonymous stock company players. 
By 1909, she was known the world over as \"The Biograph Girl.\" 1908 A Christmas Carol American Unknown Tom Ricketts drama https://en.wikipedia.org/wiki/A_Christmas_Carol_(1908_film) No prints of the first American film adaptation of A Christmas Carol are known to exist,[1] but The Moving Picture World magazine provided a scene-by-scene description before the film's release.[2] Scrooge goes into his office and begins working. His nephew, along with three women who wish for Scrooge to donate enter. However, Scrooge dismisses them. On the night of Christmas Eve, his long-dead partner Jacob Marley comes as a ghost, warning him of a horrible fate if he does not change his ways. Scrooge meets three spirits that show Scrooge the real meaning of Christmas, along with his grave, the result of his parsimonious ways. The next morning, he wakes and realizes the error of his ways. Scrooge was then euphoric and generous for the rest of his life. 1908 The Fight for Freedom American D. W. Griffith Florence Auer, John G. Adolfi western https://en.wikipedia.org/wiki/The_Fight_for_Freedom The film opens in a town on the Mexican border. A poker game is going on in the local saloon. One of the players cheats and is shot dead by another of the players, a Mexican named Pedro. In the uproar that follows Pedro is wounded as he escapes from the saloon. The sheriff is called, who tracks Pedro to his home but Pedro kills the sherriff too. While Pedro hides, his wife Juanita, is arrested on suspicion of murdering the sheriff. Pedro rescues her from the town jail and the two head for the Mexican border. Caught by the posse before they reach the border, Juanita is killed and the film ends with Pedro being arrested and taken back to town.", + "text": "Release Year Title Origin/Ethnicity Director Cast Genre Wiki Page Plot 1901 Kansas Saloon Smashers American Unknown unknown https://en.wikipedia.org/wiki/Kansas_Saloon_Smashers A bartender is working at a saloon, serving drinks to customers. 
After he fills a stereotypically Irish man's bucket with beer, Carrie Nation and her followers burst inside. They assault the Irish man, pulling his hat over his eyes and then dumping the beer over his head. The group then begin wrecking the bar, smashing the fixtures, mirrors, and breaking the cash register. The bartender then sprays seltzer water in Nation's face before a group of policemen appear and order everybody to leave.[1] 1901 Love by the Light of the Moon American Unknown unknown https://en.wikipedia.org/wiki/Love_by_the_Light_of_the_Moon The moon, painted with a smiling face hangs over a park at night. A young couple walking past a fence learn on a railing and look up. The moon smiles. They embrace, and the moon's smile gets bigger. They then sit down on a bench by a tree. The moon's view is blocked, causing him to frown. In the last scene, the man fans the woman with his hat because the moon has left the sky and is perched over her shoulder to see everything better. 1901 The Martyred Presidents American Unknown unknown https://en.wikipedia.org/wiki/The_Martyred_Presidents The film, just over a minute long, is composed of two shots. In the first, a girl sits at the base of an altar or tomb, her face hidden from the camera. At the center of the altar, a viewing portal displays the portraits of three U.S. Presidents—Abraham Lincoln, James A. Garfield, and William McKinley—each victims of assassination.\\r\\nIn the second shot, which runs just over eight seconds long, an assassin kneels feet of Lady Justice. 1901 Terrible Teddy, the Grizzly King American Unknown unknown https://en.wikipedia.org/wiki/Terrible_Teddy,_the_Grizzly_King Lasting just 61 seconds and consisting of two shots, the first shot is set in a wood during winter. The actor representing then vice-president Theodore Roosevelt enthusiastically hurries down a hillside towards a tree in the foreground. He falls once, but rights himself and cocks his rifle. 
Two other men, bearing signs reading \"His Photographer\" and \"His Press Agent\" respectively, follow him into the shot; the photographer sets up his camera. \"Teddy\" aims his rifle upward at the tree and fells what appears to be a common house cat, which he then proceeds to stab. \"Teddy\" holds his prize aloft, and the press agent takes notes. The second shot is taken in a slightly different part of the wood, on a path. \"Teddy\" rides the path on his horse towards the camera and out to the left of the shot, followed closely by the press agent and photographer, still dutifully holding their signs. 1902 Jack and the Beanstalk American George S. Fleming, Edwin S. Porter unknown https://en.wikipedia.org/wiki/Jack_and_the_Beanstalk_(1902_film) The earliest known adaptation of the classic fairytale, this films shows Jack trading his cow for the beans, his mother forcing him to drop them in the front yard, and beig forced upstairs. As he sleeps, Jack is visited by a fairy who shows him glimpses of what will await him when he ascends the bean stalk. In this version, Jack is the son of a deposed king. When Jack wakes up, he finds the beanstalk has grown and he climbs to the top where he enters the giant's home. The giant finds Jack, who narrowly escapes. The giant chases Jack down the bean stalk, but Jack is able to cut it down before the giant can get to safety. He falls and is killed as Jack celebrates. The fairy then reveals that Jack may return home as a prince. 1903 Alice in Wonderland American Cecil Hepworth May Clark unknown https://en.wikipedia.org/wiki/Alice_in_Wonderland_(1903_film) Alice follows a large white rabbit down a \"Rabbit-hole\". She finds a tiny door. When she finds a bottle labeled \"Drink me\", she does, and shrinks, but not enough to pass through the door. She then eats something labeled \"Eat me\" and grows larger. She finds a fan when enables her to shrink enough to get into the \"Garden\" and try to get a \"Dog\" to play with her. 
She enters the \"White Rabbit's tiny House,\" but suddenly resumes her normal size. In order to get out, she has to use the \"magic fan.\"\\r\\nShe enters a kitchen, in which there is a cook and a woman holding a baby. She persuades the woman to give her the child and takes the infant outside after the cook starts throwing things around. The baby then turns into a pig and squirms out of her grip. \"The Duchess's Cheshire Cat\" appears and disappears a couple of times to Alice and directs her to the Mad Hatter's \"Mad Tea-Party.\" After a while, she leaves.\\r\\nThe Queen invites Alice to join the \"ROYAL PROCESSION\": a parade of marching playing cards and others headed by the White Rabbit. When Alice \"unintentionally offends the Queen\", the latter summons the \"Executioner\". Alice \"boxes the ears\", then flees when all the playing cards come for her. Then she wakes up and realizes it was all a dream. 1903 The Great Train Robbery American Edwin S. Porter western https://en.wikipedia.org/wiki/The_Great_Train_Robbery_(1903_film) The film opens with two bandits breaking into a railroad telegraph office, where they force the operator at gunpoint to have a train stopped and to transmit orders for the engineer to fill the locomotive's tender at the station's water tank. They then knock the operator out and tie him up. As the train stops it is boarded by the bandits‍—‌now four. Two bandits enter an express car, kill a messenger and open a box of valuables with dynamite; the others kill the fireman and force the engineer to halt the train and disconnect the locomotive. The bandits then force the passengers off the train and rifle them for their belongings. One passenger tries to escape but is instantly shot down. Carrying their loot, the bandits escape in the locomotive, later stopping in a valley where their horses had been left.\\r\\nMeanwhile, back in the telegraph office, the bound operator awakens, but he collapses again. 
His daughter arrives bringing him his meal and cuts him free, and restores him to consciousness by dousing him with water.\\r\\nThere is some comic relief at a dance hall, where an Eastern stranger is forced to dance while the locals fire at his feet. The door suddenly opens and the telegraph operator rushes in to tell them of the robbery. The men quickly form a posse, which overtakes the bandits, and in a final shootout kills them all and recovers the stolen mail. 1904 The Suburbanite American Wallace McCutcheon comedy https://en.wikipedia.org/wiki/The_Suburbanite The film is about a family who move to the suburbs, hoping for a quiet life. Things start to go wrong, and the wife gets violent and starts throwing crockery, leading to her arrest. 1905 The Little Train Robbery American Edwin Stanton Porter unknown https://en.wikipedia.org/wiki/The_Little_Train_Robbery The opening scene shows the interior of the robbers' den. The walls are decorated with the portraits of notorious criminals and pictures illustrating the exploits of famous bandits. Some of the gang are lounging about, while others are reading novels and illustrated papers. Although of youthful appearance, each is dressed like a typical Western desperado. The \"Bandit Queen,\" leading a blindfolded new recruit, now enters the room. He is led to the center of the room, raises his right hand and is solemnly sworn in. When the bandage is removed from his eyes he finds himself looking into the muzzles of a dozen or more 45's. The gang then congratulates the new member and heartily shake his hand. The \"Bandit Queen\" who is evidently the leader of the gang, now calls for volunteers to hold up a train. All respond, but she picks out seven for the job who immediately leave the cabin.\\r\\nThe next scene shows the gang breaking into a barn. They steal ponies and ride away. 
Upon reaching the place agreed upon they picket their ponies and leaving them in charge of a trusted member proceed to a wild mountain spot in a bend of the railroad, where the road runs over a steep embankment. The spot is an ideal one for holding up a train. Cross ties are now placed on the railroad track and the gang hide in some bushes close by and wait for the train. The train soon approaches and is brought to a stop. The engineer leaves his engine and proceeds to remove the obstruction on the track. While he is bending over one of the gang sneaks up behind them and hits him on the head with an axe, and knocks him senseless down the embankment, while the gang surround the train and hold up the passengers. After securing all the \"valuables,\" consisting principally of candy and dolls, the robbers uncouple the engine and one car and make their escape just in time to avoid a posse of police who appear on the scene. Further up the road they abandon the engine and car, take to the woods and soon reach their ponies.\\r\\nIn the meantime the police have learned the particulars of the hold-up from the frightened passengers and have started up the railroad tracks after the fleeing robbers. The robbers are next seen riding up the bed of a shallow stream and finally reach their den, where the remainder of the gang have been waiting for them. Believing they have successfully eluded their pursuers, they proceed to divide the \"plunder.\" The police, however, have struck the right trail and are in close pursuit. While the \"plunder\" is being divided a sentry gives the alarm and the entire gang, abandoning everything, rush from the cabin barely in time to escape capture. The police make a hurried search and again start in pursuit. The robbers are so hard pressed that they are unable to reach their ponies, and are obliged to take chances on foot. 
The police now get in sight of the fleeing robbers and a lively chase follows through tall weeds, over a bridge and up a steep hill. Reaching a pond the police are close on their heels. The foremost robbers jump in clothes and all and strike out for the opposite bank. Two hesitate and are captured. Boats are secured and after an exciting tussle the entire gang is rounded up. In the mix up one of the police is dragged overboard. The final scene shows the entire gang of bedraggled and crestfallen robbers tied together with a rope and being led away by the police. Two of the police are loaded down with revolvers, knives and cartridge belts, and resemble walking aresenals. As a fitting climax a confederate steals out of the woods, cuts the rope and gallantly rescues the \"Bandit Queen.\" 1905 The Night Before Christmas American Edwin Stanton Porter unknown https://en.wikipedia.org/wiki/The_Night_Before_Christmas_(1905_film) Scenes are introduced using lines of the poem.[2] Santa Claus, played by Harry Eytinge, is shown feeding real reindeer[4] and finishes his work in the workshop. Meanwhile, the children of a city household hang their stockings and go to bed, but unable to sleep they engage in a pillow fight. Santa Claus leaves his home on a sleigh with his reindeer. He enters the children's house through the chimney, and leaves the presents. The children come down the stairs and enjoy their presents. 1906 Dream of a Rarebit Fiend American Wallace McCutcheon and Edwin S. Porter short https://en.wikipedia.org/wiki/Dream_of_a_Rarebit_Fiend_(1906_film) The Rarebit Fiend gorges on Welsh rarebit at a restaurant. When he leaves, he begins to get dizzy as he starts to hallucinate. He desperately tries to hang onto a lamppost as the world spins all around him. A man helps him get home. He falls into bed and begins having more hallucinatory dreams. During a dream sequence, the furniture begins moving around the room. 
Imps emerge from a floating Welsh rarebit container and begin poking his head as he sleeps. His bed then begins dancing and spinning wildly around the room before flying out the window with the Fiend in it. The bed floats across the city as the Fiend floats up and off the bed. He hangs off the back and eventually gets caught on a weathervane atop a steeple. His bedclothes tear and he falls from the sky, crashing through his bedroom ceiling. The Fiend awakens from the dream after falling out of his bed. 1906 From Leadville to Aspen: A Hold-Up in the Rockies American Francis J. Marion and Wallace McCutcheon short action/crime western https://en.wikipedia.org/wiki/From_Leadville_to_Aspen:_A_Hold-Up_in_the_Rockies The film features a train traveling through the Rockies and a hold up created by two thugs placing logs on the line. They systematically rob the wealthy occupants at gunpoint and then make their getaway along the tracks and later by a hi-jacked horse and cart. 1906 Kathleen Mavourneen American Edwin S. Porter short film https://en.wikipedia.org/wiki/Kathleen_Mavourneen_(1906_film) Irish villager Kathleen is a tenant of Captain Clearfield, who controls local judges and criminals. Her father owes Clearfield a large debt. Terence O'More saves the village from Clearfield, causing a large celebration.\\r\\nFilm historian Charles Musser writes of Porter's adaptation, \"O'More not only rescues Kathleen from the villain but, through marriage, renews the family for another generation.\"[1] 1907 Daniel Boone American Wallace McCutcheon and Ediwin S. Porter William Craven, Florence Lawrence biographical https://en.wikipedia.org/wiki/Daniel_Boone_(1907_film) Boone's daughter befriends an Indian maiden as Boone and his companion start out on a hunting expedition. While he is away, Boone's cabin is attacked by the Indians, who set it on fire and abduct Boone's daughter. Boone returns, swears vengeance, then heads out on the trail to the Indian camp. 
His daughter escapes but is chased. The Indians encounter Boone, which sets off a huge fight on the edge of a cliff. A burning arrow gets shot into the Indian camp. Boone gets tied to the stake and tortured. The burning arrow sets the Indian camp on fire, causing panic. Boone is rescued by his horse, and Boone has a knife fight in which he kills the Indian chief.[2] 1907 How Brown Saw the Baseball Game American Unknown Unknown comedy https://en.wikipedia.org/wiki/How_Brown_Saw_the_Baseball_Game Before heading out to a baseball game at a nearby ballpark, sports fan Mr. Brown drinks several highball cocktails. He arrives at the ballpark to watch the game, but has become so inebriated that the game appears to him in reverse, with the players running the bases backwards and the baseball flying back into the pitcher's hand. After the game is over, Mr. Brown is escorted home by one of his friends. When they arrive at Brown's house, they encounter his wife who becomes furious with the friend and proceeds to physically assault him, believing he is responsible for her husband's severe intoxication.[1] 1907 Laughing Gas American Edwin Stanton Porter Bertha Regustus, Edward Boulden comedy https://en.wikipedia.org/wiki/Laughing_Gas_(film)#1907_Film The plot is that of a black woman going to the dentist for a toothache and being given laughing gas. On her way walking home, and in other situations, she can't stop laughing, and everyone she meets \"catches\" the laughter from her, including a vendor and police officers. 1908 The Adventures of Dollie American D. W. Griffith Arthur V. Johnson, Linda Arvidson drama https://en.wikipedia.org/wiki/The_Adventures_of_Dollie On a beautiful summer day a father and mother take their daughter Dollie on an outing to the river. The mother refuses to buy a gypsy's wares. The gypsy tries to rob the mother, but the father drives him off. The gypsy returns to the camp and devises a plan. 
They return and kidnap Dollie while her parents are distracted. A rescue crew is organized, but the gypsy takes Dollie to his camp. They gag Dollie and hide her in a barrel before the rescue party gets to the camp. Once they leave the gypsies and escapes in their wagon. As the wagon crosses the river, the barrel falls into the water. Still sealed in the barrel, Dollie is swept downstream in dangerous currents. A boy who is fishing in the river finds the barrel, and Dollie is reunited safely with her parents. 1908 The Black Viper American D. W. Griffith D. W. Griffith drama https://en.wikipedia.org/wiki/The_Black_Viper A thug accosts a girl as she leaves her workplace but a man rescues her. The thug vows revenge and, with the help of two friends, attacks the girl and her rescuer again as they're going for a walk. This time they succeed in kidnapping the rescuer. He is bound and gagged and taken away in a cart. The girl runs home and gets help from several neighbors. They track the ruffians down to a cabin in the mountains where the gang has trapped their victim and set the cabin on fire. A thug and Rescuer fight on the roof of the house. 1908 A Calamitous Elopement American D.W. Griffith Harry Solter, Linda Arvidson comedy https://en.wikipedia.org/wiki/A_Calamitous_Elopement A young couple decides to elope after being caught in the midst of a romantic moment by the woman's angry father. They make plans to leave, but a thief discovers their plans and hides in their trunk and waits for the right moment to steal their belongings. 1908 The Call of the Wild American D. W. Griffith Charles Inslee adventure https://en.wikipedia.org/wiki/The_Call_of_the_Wild_(1908_film) A white girl (Florence Lawrence) rejects a proposal from an Indian brave (Charles Inslee) in this early one-reel Western melodrama. Despite the rejection, the Indian still comes to the girl's defense when she is abducted by his warring tribe. 
In her first year in films, Florence Lawrence was already the most popular among the Biograph Company's anonymous stock company players. By 1909, she was known the world over as \"The Biograph Girl.\" 1908 A Christmas Carol American Unknown Tom Ricketts drama https://en.wikipedia.org/wiki/A_Christmas_Carol_(1908_film) No prints of the first American film adaptation of A Christmas Carol are known to exist,[1] but The Moving Picture World magazine provided a scene-by-scene description before the film's release.[2] Scrooge goes into his office and begins working. His nephew, along with three women who wish for Scrooge to donate enter. However, Scrooge dismisses them. On the night of Christmas Eve, his long-dead partner Jacob Marley comes as a ghost, warning him of a horrible fate if he does not change his ways. Scrooge meets three spirits that show Scrooge the real meaning of Christmas, along with his grave, the result of his parsimonious ways. The next morning, he wakes and realizes the error of his ways. Scrooge was then euphoric and generous for the rest of his life. 1908 The Fight for Freedom American D. W. Griffith Florence Auer, John G. Adolfi western https://en.wikipedia.org/wiki/The_Fight_for_Freedom The film opens in a town on the Mexican border. A poker game is going on in the local saloon. One of the players cheats and is shot dead by another of the players, a Mexican named Pedro. In the uproar that follows Pedro is wounded as he escapes from the saloon. The sheriff is called, who tracks Pedro to his home but Pedro kills the sherriff too. While Pedro hides, his wife Juanita, is arrested on suspicion of murdering the sheriff. Pedro rescues her from the town jail and the two head for the Mexican border. Caught by the posse before they reach the border, Juanita is killed and the film ends with Pedro being arrested and taken back to town.", "metadata": { - "text_as_html": "
    Release YearTitleOrigin/EthnicityDirectorCastGenreWiki PagePlot
    1901Kansas Saloon SmashersAmericanUnknownunknownhttps://en.wikipedia.org/wiki/Kansas_Saloon_SmashersA bartender is working at a saloon, serving drinks to customers. After he fills a stereotypically Irish man's bucket with beer, Carrie Nation and her followers burst inside. They assault the Irish man, pulling his hat over his eyes and then dumping the beer over his head. The group then begin wrecking the bar, smashing the fixtures, mirrors, and breaking the cash register. The bartender then sprays seltzer water in Nation's face before a group of policemen appear and order everybody to leave.[1]
    1901Love by the Light of the MoonAmericanUnknownunknownhttps://en.wikipedia.org/wiki/Love_by_the_Light_of_the_MoonThe moon, painted with a smiling face hangs over a park at night. A young couple walking past a fence learn on a railing and look up. The moon smiles. They embrace, and the moon's smile gets bigger. They then sit down on a bench by a tree. The moon's view is blocked, causing him to frown. In the last scene, the man fans the woman with his hat because the moon has left the sky and is perched over her shoulder to see everything better.
    1901The Martyred PresidentsAmericanUnknownunknownhttps://en.wikipedia.org/wiki/The_Martyred_PresidentsThe film, just over a minute long, is composed of two shots. In the first, a girl sits at the base of an altar or tomb, her face hidden from the camera. At the center of the altar, a viewing portal displays the portraits of three U.S. Presidents\u2014Abraham Lincoln, James A. Garfield, and William McKinley\u2014each victims of assassination.\\r\\nIn the second shot, which runs just over eight seconds long, an assassin kneels feet of Lady Justice.
    1901Terrible Teddy, the Grizzly KingAmericanUnknownunknownhttps://en.wikipedia.org/wiki/Terrible_Teddy,_the_Grizzly_KingLasting just 61 seconds and consisting of two shots, the first shot is set in a wood during winter. The actor representing then vice-president Theodore Roosevelt enthusiastically hurries down a hillside towards a tree in the foreground. He falls once, but rights himself and cocks his rifle. Two other men, bearing signs reading \"His Photographer\" and \"His Press Agent\" respectively, follow him into the shot; the photographer sets up his camera. \"Teddy\" aims his rifle upward at the tree and fells what appears to be a common house cat, which he then proceeds to stab. \"Teddy\" holds his prize aloft, and the press agent takes notes. The second shot is taken in a slightly different part of the wood, on a path. \"Teddy\" rides the path on his horse towards the camera and out to the left of the shot, followed closely by the press agent and photographer, still dutifully holding their signs.
    1902Jack and the BeanstalkAmericanGeorge S. Fleming, Edwin S. Porterunknownhttps://en.wikipedia.org/wiki/Jack_and_the_Beanstalk_(1902_film)The earliest known adaptation of the classic fairytale, this films shows Jack trading his cow for the beans, his mother forcing him to drop them in the front yard, and beig forced upstairs. As he sleeps, Jack is visited by a fairy who shows him glimpses of what will await him when he ascends the bean stalk. In this version, Jack is the son of a deposed king. When Jack wakes up, he finds the beanstalk has grown and he climbs to the top where he enters the giant's home. The giant finds Jack, who narrowly escapes. The giant chases Jack down the bean stalk, but Jack is able to cut it down before the giant can get to safety. He falls and is killed as Jack celebrates. The fairy then reveals that Jack may return home as a prince.
    1903Alice in WonderlandAmericanCecil HepworthMay Clarkunknownhttps://en.wikipedia.org/wiki/Alice_in_Wonderland_(1903_film)Alice follows a large white rabbit down a \"Rabbit-hole\". She finds a tiny door. When she finds a bottle labeled \"Drink me\", she does, and shrinks, but not enough to pass through the door. She then eats something labeled \"Eat me\" and grows larger. She finds a fan when enables her to shrink enough to get into the \"Garden\" and try to get a \"Dog\" to play with her. She enters the \"White Rabbit's tiny House,\" but suddenly resumes her normal size. In order to get out, she has to use the \"magic fan.\"\\r\\nShe enters a kitchen, in which there is a cook and a woman holding a baby. She persuades the woman to give her the child and takes the infant outside after the cook starts throwing things around. The baby then turns into a pig and squirms out of her grip. \"The Duchess's Cheshire Cat\" appears and disappears a couple of times to Alice and directs her to the Mad Hatter's \"Mad Tea-Party.\" After a while, she leaves.\\r\\nThe Queen invites Alice to join the \"ROYAL PROCESSION\": a parade of marching playing cards and others headed by the White Rabbit. When Alice \"unintentionally offends the Queen\", the latter summons the \"Executioner\". Alice \"boxes the ears\", then flees when all the playing cards come for her. Then she wakes up and realizes it was all a dream.
    1903The Great Train RobberyAmericanEdwin S. Porterwesternhttps://en.wikipedia.org/wiki/The_Great_Train_Robbery_(1903_film)The film opens with two bandits breaking into a railroad telegraph office, where they force the operator at gunpoint to have a train stopped and to transmit orders for the engineer to fill the locomotive's tender at the station's water tank. They then knock the operator out and tie him up. As the train stops it is boarded by the bandits\u200d\u2014\u200cnow four. Two bandits enter an express car, kill a messenger and open a box of valuables with dynamite; the others kill the fireman and force the engineer to halt the train and disconnect the locomotive. The bandits then force the passengers off the train and rifle them for their belongings. One passenger tries to escape but is instantly shot down. Carrying their loot, the bandits escape in the locomotive, later stopping in a valley where their horses had been left.\\r\\nMeanwhile, back in the telegraph office, the bound operator awakens, but he collapses again. His daughter arrives bringing him his meal and cuts him free, and restores him to consciousness by dousing him with water.\\r\\nThere is some comic relief at a dance hall, where an Eastern stranger is forced to dance while the locals fire at his feet. The door suddenly opens and the telegraph operator rushes in to tell them of the robbery. The men quickly form a posse, which overtakes the bandits, and in a final shootout kills them all and recovers the stolen mail.
    1904The SuburbaniteAmericanWallace McCutcheoncomedyhttps://en.wikipedia.org/wiki/The_SuburbaniteThe film is about a family who move to the suburbs, hoping for a quiet life. Things start to go wrong, and the wife gets violent and starts throwing crockery, leading to her arrest.
    1905The Little Train RobberyAmericanEdwin Stanton Porterunknownhttps://en.wikipedia.org/wiki/The_Little_Train_RobberyThe opening scene shows the interior of the robbers' den. The walls are decorated with the portraits of notorious criminals and pictures illustrating the exploits of famous bandits. Some of the gang are lounging about, while others are reading novels and illustrated papers. Although of youthful appearance, each is dressed like a typical Western desperado. The \"Bandit Queen,\" leading a blindfolded new recruit, now enters the room. He is led to the center of the room, raises his right hand and is solemnly sworn in. When the bandage is removed from his eyes he finds himself looking into the muzzles of a dozen or more 45's. The gang then congratulates the new member and heartily shake his hand. The \"Bandit Queen\" who is evidently the leader of the gang, now calls for volunteers to hold up a train. All respond, but she picks out seven for the job who immediately leave the cabin.\\r\\nThe next scene shows the gang breaking into a barn. They steal ponies and ride away. Upon reaching the place agreed upon they picket their ponies and leaving them in charge of a trusted member proceed to a wild mountain spot in a bend of the railroad, where the road runs over a steep embankment. The spot is an ideal one for holding up a train. Cross ties are now placed on the railroad track and the gang hide in some bushes close by and wait for the train. The train soon approaches and is brought to a stop. The engineer leaves his engine and proceeds to remove the obstruction on the track. While he is bending over one of the gang sneaks up behind them and hits him on the head with an axe, and knocks him senseless down the embankment, while the gang surround the train and hold up the passengers. 
After securing all the \"valuables,\" consisting principally of candy and dolls, the robbers uncouple the engine and one car and make their escape just in time to avoid a posse of police who appear on the scene. Further up the road they abandon the engine and car, take to the woods and soon reach their ponies.\\r\\nIn the meantime the police have learned the particulars of the hold-up from the frightened passengers and have started up the railroad tracks after the fleeing robbers. The robbers are next seen riding up the bed of a shallow stream and finally reach their den, where the remainder of the gang have been waiting for them. Believing they have successfully eluded their pursuers, they proceed to divide the \"plunder.\" The police, however, have struck the right trail and are in close pursuit. While the \"plunder\" is being divided a sentry gives the alarm and the entire gang, abandoning everything, rush from the cabin barely in time to escape capture. The police make a hurried search and again start in pursuit. The robbers are so hard pressed that they are unable to reach their ponies, and are obliged to take chances on foot. The police now get in sight of the fleeing robbers and a lively chase follows through tall weeds, over a bridge and up a steep hill. Reaching a pond the police are close on their heels. The foremost robbers jump in clothes and all and strike out for the opposite bank. Two hesitate and are captured. Boats are secured and after an exciting tussle the entire gang is rounded up. In the mix up one of the police is dragged overboard. The final scene shows the entire gang of bedraggled and crestfallen robbers tied together with a rope and being led away by the police. Two of the police are loaded down with revolvers, knives and cartridge belts, and resemble walking aresenals. As a fitting climax a confederate steals out of the woods, cuts the rope and gallantly rescues the \"Bandit Queen.\"
    1905The Night Before ChristmasAmericanEdwin Stanton Porterunknownhttps://en.wikipedia.org/wiki/The_Night_Before_Christmas_(1905_film)Scenes are introduced using lines of the poem.[2] Santa Claus, played by Harry Eytinge, is shown feeding real reindeer[4] and finishes his work in the workshop. Meanwhile, the children of a city household hang their stockings and go to bed, but unable to sleep they engage in a pillow fight. Santa Claus leaves his home on a sleigh with his reindeer. He enters the children's house through the chimney, and leaves the presents. The children come down the stairs and enjoy their presents.
    1906Dream of a Rarebit FiendAmericanWallace McCutcheon and Edwin S. Portershorthttps://en.wikipedia.org/wiki/Dream_of_a_Rarebit_Fiend_(1906_film)The Rarebit Fiend gorges on Welsh rarebit at a restaurant. When he leaves, he begins to get dizzy as he starts to hallucinate. He desperately tries to hang onto a lamppost as the world spins all around him. A man helps him get home. He falls into bed and begins having more hallucinatory dreams. During a dream sequence, the furniture begins moving around the room. Imps emerge from a floating Welsh rarebit container and begin poking his head as he sleeps. His bed then begins dancing and spinning wildly around the room before flying out the window with the Fiend in it. The bed floats across the city as the Fiend floats up and off the bed. He hangs off the back and eventually gets caught on a weathervane atop a steeple. His bedclothes tear and he falls from the sky, crashing through his bedroom ceiling. The Fiend awakens from the dream after falling out of his bed.
    1906From Leadville to Aspen: A Hold-Up in the RockiesAmericanFrancis J. Marion and Wallace McCutcheonshort action/crime westernhttps://en.wikipedia.org/wiki/From_Leadville_to_Aspen:_A_Hold-Up_in_the_RockiesThe film features a train traveling through the Rockies and a hold up created by two thugs placing logs on the line. They systematically rob the wealthy occupants at gunpoint and then make their getaway along the tracks and later by a hi-jacked horse and cart.
    1906Kathleen MavourneenAmericanEdwin S. Portershort filmhttps://en.wikipedia.org/wiki/Kathleen_Mavourneen_(1906_film)Irish villager Kathleen is a tenant of Captain Clearfield, who controls local judges and criminals. Her father owes Clearfield a large debt. Terence O'More saves the village from Clearfield, causing a large celebration.\\r\\nFilm historian Charles Musser writes of Porter's adaptation, \"O'More not only rescues Kathleen from the villain but, through marriage, renews the family for another generation.\"[1]
    1907Daniel BooneAmericanWallace McCutcheon and Ediwin S. PorterWilliam Craven, Florence Lawrencebiographicalhttps://en.wikipedia.org/wiki/Daniel_Boone_(1907_film)Boone's daughter befriends an Indian maiden as Boone and his companion start out on a hunting expedition. While he is away, Boone's cabin is attacked by the Indians, who set it on fire and abduct Boone's daughter. Boone returns, swears vengeance, then heads out on the trail to the Indian camp. His daughter escapes but is chased. The Indians encounter Boone, which sets off a huge fight on the edge of a cliff. A burning arrow gets shot into the Indian camp. Boone gets tied to the stake and tortured. The burning arrow sets the Indian camp on fire, causing panic. Boone is rescued by his horse, and Boone has a knife fight in which he kills the Indian chief.[2]
    1907How Brown Saw the Baseball GameAmericanUnknownUnknowncomedyhttps://en.wikipedia.org/wiki/How_Brown_Saw_the_Baseball_GameBefore heading out to a baseball game at a nearby ballpark, sports fan Mr. Brown drinks several highball cocktails. He arrives at the ballpark to watch the game, but has become so inebriated that the game appears to him in reverse, with the players running the bases backwards and the baseball flying back into the pitcher's hand. After the game is over, Mr. Brown is escorted home by one of his friends. When they arrive at Brown's house, they encounter his wife who becomes furious with the friend and proceeds to physically assault him, believing he is responsible for her husband's severe intoxication.[1]
    1907Laughing GasAmericanEdwin Stanton PorterBertha Regustus, Edward Bouldencomedyhttps://en.wikipedia.org/wiki/Laughing_Gas_(film)#1907_FilmThe plot is that of a black woman going to the dentist for a toothache and being given laughing gas. On her way walking home, and in other situations, she can't stop laughing, and everyone she meets \"catches\" the laughter from her, including a vendor and police officers.
    1908The Adventures of DollieAmericanD. W. GriffithArthur V. Johnson, Linda Arvidsondramahttps://en.wikipedia.org/wiki/The_Adventures_of_DollieOn a beautiful summer day a father and mother take their daughter Dollie on an outing to the river. The mother refuses to buy a gypsy's wares. The gypsy tries to rob the mother, but the father drives him off. The gypsy returns to the camp and devises a plan. They return and kidnap Dollie while her parents are distracted. A rescue crew is organized, but the gypsy takes Dollie to his camp. They gag Dollie and hide her in a barrel before the rescue party gets to the camp. Once they leave the gypsies and escapes in their wagon. As the wagon crosses the river, the barrel falls into the water. Still sealed in the barrel, Dollie is swept downstream in dangerous currents. A boy who is fishing in the river finds the barrel, and Dollie is reunited safely with her parents.
    1908The Black ViperAmericanD. W. GriffithD. W. Griffithdramahttps://en.wikipedia.org/wiki/The_Black_ViperA thug accosts a girl as she leaves her workplace but a man rescues her. The thug vows revenge and, with the help of two friends, attacks the girl and her rescuer again as they're going for a walk. This time they succeed in kidnapping the rescuer. He is bound and gagged and taken away in a cart. The girl runs home and gets help from several neighbors. They track the ruffians down to a cabin in the mountains where the gang has trapped their victim and set the cabin on fire. A thug and Rescuer fight on the roof of the house.
    1908A Calamitous ElopementAmericanD.W. GriffithHarry Solter, Linda Arvidsoncomedyhttps://en.wikipedia.org/wiki/A_Calamitous_ElopementA young couple decides to elope after being caught in the midst of a romantic moment by the woman's angry father. They make plans to leave, but a thief discovers their plans and hides in their trunk and waits for the right moment to steal their belongings.
    1908The Call of the WildAmericanD. W. GriffithCharles Insleeadventurehttps://en.wikipedia.org/wiki/The_Call_of_the_Wild_(1908_film)A white girl (Florence Lawrence) rejects a proposal from an Indian brave (Charles Inslee) in this early one-reel Western melodrama. Despite the rejection, the Indian still comes to the girl's defense when she is abducted by his warring tribe. In her first year in films, Florence Lawrence was already the most popular among the Biograph Company's anonymous stock company players. By 1909, she was known the world over as \"The Biograph Girl.\"
    1908A Christmas CarolAmericanUnknownTom Rickettsdramahttps://en.wikipedia.org/wiki/A_Christmas_Carol_(1908_film)No prints of the first American film adaptation of A Christmas Carol are known to exist,[1] but The Moving Picture World magazine provided a scene-by-scene description before the film's release.[2] Scrooge goes into his office and begins working. His nephew, along with three women who wish for Scrooge to donate enter. However, Scrooge dismisses them. On the night of Christmas Eve, his long-dead partner Jacob Marley comes as a ghost, warning him of a horrible fate if he does not change his ways. Scrooge meets three spirits that show Scrooge the real meaning of Christmas, along with his grave, the result of his parsimonious ways. The next morning, he wakes and realizes the error of his ways. Scrooge was then euphoric and generous for the rest of his life.
    1908The Fight for FreedomAmericanD. W. GriffithFlorence Auer, John G. Adolfiwesternhttps://en.wikipedia.org/wiki/The_Fight_for_FreedomThe film opens in a town on the Mexican border. A poker game is going on in the local saloon. One of the players cheats and is shot dead by another of the players, a Mexican named Pedro. In the uproar that follows Pedro is wounded as he escapes from the saloon. The sheriff is called, who tracks Pedro to his home but Pedro kills the sherriff too. While Pedro hides, his wife Juanita, is arrested on suspicion of murdering the sheriff. Pedro rescues her from the town jail and the two head for the Mexican border. Caught by the posse before they reach the border, Juanita is killed and the film ends with Pedro being arrested and taken back to town.
    ", + "text_as_html": "
    Release YearTitleOrigin/EthnicityDirectorCastGenreWiki PagePlot
    1901Kansas Saloon SmashersAmericanUnknownunknownhttps://en.wikipedia.org/wiki/Kansas_Saloon_SmashersA bartender is working at a saloon, serving drinks to customers. After he fills a stereotypically Irish man's bucket with beer, Carrie Nation and her followers burst inside. They assault the Irish man, pulling his hat over his eyes and then dumping the beer over his head. The group then begin wrecking the bar, smashing the fixtures, mirrors, and breaking the cash register. The bartender then sprays seltzer water in Nation's face before a group of policemen appear and order everybody to leave.[1]
    1901Love by the Light of the MoonAmericanUnknownunknownhttps://en.wikipedia.org/wiki/Love_by_the_Light_of_the_MoonThe moon, painted with a smiling face hangs over a park at night. A young couple walking past a fence learn on a railing and look up. The moon smiles. They embrace, and the moon's smile gets bigger. They then sit down on a bench by a tree. The moon's view is blocked, causing him to frown. In the last scene, the man fans the woman with his hat because the moon has left the sky and is perched over her shoulder to see everything better.
    1901The Martyred PresidentsAmericanUnknownunknownhttps://en.wikipedia.org/wiki/The_Martyred_PresidentsThe film, just over a minute long, is composed of two shots. In the first, a girl sits at the base of an altar or tomb, her face hidden from the camera. At the center of the altar, a viewing portal displays the portraits of three U.S. Presidents—Abraham Lincoln, James A. Garfield, and William McKinley—each victims of assassination.\\r\\nIn the second shot, which runs just over eight seconds long, an assassin kneels feet of Lady Justice.
    1901Terrible Teddy, the Grizzly KingAmericanUnknownunknownhttps://en.wikipedia.org/wiki/Terrible_Teddy,_the_Grizzly_KingLasting just 61 seconds and consisting of two shots, the first shot is set in a wood during winter. The actor representing then vice-president Theodore Roosevelt enthusiastically hurries down a hillside towards a tree in the foreground. He falls once, but rights himself and cocks his rifle. Two other men, bearing signs reading \"His Photographer\" and \"His Press Agent\" respectively, follow him into the shot; the photographer sets up his camera. \"Teddy\" aims his rifle upward at the tree and fells what appears to be a common house cat, which he then proceeds to stab. \"Teddy\" holds his prize aloft, and the press agent takes notes. The second shot is taken in a slightly different part of the wood, on a path. \"Teddy\" rides the path on his horse towards the camera and out to the left of the shot, followed closely by the press agent and photographer, still dutifully holding their signs.
    1902Jack and the BeanstalkAmericanGeorge S. Fleming, Edwin S. Porterunknownhttps://en.wikipedia.org/wiki/Jack_and_the_Beanstalk_(1902_film)The earliest known adaptation of the classic fairytale, this films shows Jack trading his cow for the beans, his mother forcing him to drop them in the front yard, and beig forced upstairs. As he sleeps, Jack is visited by a fairy who shows him glimpses of what will await him when he ascends the bean stalk. In this version, Jack is the son of a deposed king. When Jack wakes up, he finds the beanstalk has grown and he climbs to the top where he enters the giant's home. The giant finds Jack, who narrowly escapes. The giant chases Jack down the bean stalk, but Jack is able to cut it down before the giant can get to safety. He falls and is killed as Jack celebrates. The fairy then reveals that Jack may return home as a prince.
    1903Alice in WonderlandAmericanCecil HepworthMay Clarkunknownhttps://en.wikipedia.org/wiki/Alice_in_Wonderland_(1903_film)Alice follows a large white rabbit down a \"Rabbit-hole\". She finds a tiny door. When she finds a bottle labeled \"Drink me\", she does, and shrinks, but not enough to pass through the door. She then eats something labeled \"Eat me\" and grows larger. She finds a fan when enables her to shrink enough to get into the \"Garden\" and try to get a \"Dog\" to play with her. She enters the \"White Rabbit's tiny House,\" but suddenly resumes her normal size. In order to get out, she has to use the \"magic fan.\"\\r\\nShe enters a kitchen, in which there is a cook and a woman holding a baby. She persuades the woman to give her the child and takes the infant outside after the cook starts throwing things around. The baby then turns into a pig and squirms out of her grip. \"The Duchess's Cheshire Cat\" appears and disappears a couple of times to Alice and directs her to the Mad Hatter's \"Mad Tea-Party.\" After a while, she leaves.\\r\\nThe Queen invites Alice to join the \"ROYAL PROCESSION\": a parade of marching playing cards and others headed by the White Rabbit. When Alice \"unintentionally offends the Queen\", the latter summons the \"Executioner\". Alice \"boxes the ears\", then flees when all the playing cards come for her. Then she wakes up and realizes it was all a dream.
    1903The Great Train RobberyAmericanEdwin S. Porterwesternhttps://en.wikipedia.org/wiki/The_Great_Train_Robbery_(1903_film)The film opens with two bandits breaking into a railroad telegraph office, where they force the operator at gunpoint to have a train stopped and to transmit orders for the engineer to fill the locomotive's tender at the station's water tank. They then knock the operator out and tie him up. As the train stops it is boarded by the bandits‍—‌now four. Two bandits enter an express car, kill a messenger and open a box of valuables with dynamite; the others kill the fireman and force the engineer to halt the train and disconnect the locomotive. The bandits then force the passengers off the train and rifle them for their belongings. One passenger tries to escape but is instantly shot down. Carrying their loot, the bandits escape in the locomotive, later stopping in a valley where their horses had been left.\\r\\nMeanwhile, back in the telegraph office, the bound operator awakens, but he collapses again. His daughter arrives bringing him his meal and cuts him free, and restores him to consciousness by dousing him with water.\\r\\nThere is some comic relief at a dance hall, where an Eastern stranger is forced to dance while the locals fire at his feet. The door suddenly opens and the telegraph operator rushes in to tell them of the robbery. The men quickly form a posse, which overtakes the bandits, and in a final shootout kills them all and recovers the stolen mail.
    1904The SuburbaniteAmericanWallace McCutcheoncomedyhttps://en.wikipedia.org/wiki/The_SuburbaniteThe film is about a family who move to the suburbs, hoping for a quiet life. Things start to go wrong, and the wife gets violent and starts throwing crockery, leading to her arrest.
    1905The Little Train RobberyAmericanEdwin Stanton Porterunknownhttps://en.wikipedia.org/wiki/The_Little_Train_RobberyThe opening scene shows the interior of the robbers' den. The walls are decorated with the portraits of notorious criminals and pictures illustrating the exploits of famous bandits. Some of the gang are lounging about, while others are reading novels and illustrated papers. Although of youthful appearance, each is dressed like a typical Western desperado. The \"Bandit Queen,\" leading a blindfolded new recruit, now enters the room. He is led to the center of the room, raises his right hand and is solemnly sworn in. When the bandage is removed from his eyes he finds himself looking into the muzzles of a dozen or more 45's. The gang then congratulates the new member and heartily shake his hand. The \"Bandit Queen\" who is evidently the leader of the gang, now calls for volunteers to hold up a train. All respond, but she picks out seven for the job who immediately leave the cabin.\\r\\nThe next scene shows the gang breaking into a barn. They steal ponies and ride away. Upon reaching the place agreed upon they picket their ponies and leaving them in charge of a trusted member proceed to a wild mountain spot in a bend of the railroad, where the road runs over a steep embankment. The spot is an ideal one for holding up a train. Cross ties are now placed on the railroad track and the gang hide in some bushes close by and wait for the train. The train soon approaches and is brought to a stop. The engineer leaves his engine and proceeds to remove the obstruction on the track. While he is bending over one of the gang sneaks up behind them and hits him on the head with an axe, and knocks him senseless down the embankment, while the gang surround the train and hold up the passengers. 
After securing all the \"valuables,\" consisting principally of candy and dolls, the robbers uncouple the engine and one car and make their escape just in time to avoid a posse of police who appear on the scene. Further up the road they abandon the engine and car, take to the woods and soon reach their ponies.\\r\\nIn the meantime the police have learned the particulars of the hold-up from the frightened passengers and have started up the railroad tracks after the fleeing robbers. The robbers are next seen riding up the bed of a shallow stream and finally reach their den, where the remainder of the gang have been waiting for them. Believing they have successfully eluded their pursuers, they proceed to divide the \"plunder.\" The police, however, have struck the right trail and are in close pursuit. While the \"plunder\" is being divided a sentry gives the alarm and the entire gang, abandoning everything, rush from the cabin barely in time to escape capture. The police make a hurried search and again start in pursuit. The robbers are so hard pressed that they are unable to reach their ponies, and are obliged to take chances on foot. The police now get in sight of the fleeing robbers and a lively chase follows through tall weeds, over a bridge and up a steep hill. Reaching a pond the police are close on their heels. The foremost robbers jump in clothes and all and strike out for the opposite bank. Two hesitate and are captured. Boats are secured and after an exciting tussle the entire gang is rounded up. In the mix up one of the police is dragged overboard. The final scene shows the entire gang of bedraggled and crestfallen robbers tied together with a rope and being led away by the police. Two of the police are loaded down with revolvers, knives and cartridge belts, and resemble walking aresenals. As a fitting climax a confederate steals out of the woods, cuts the rope and gallantly rescues the \"Bandit Queen.\"
    1905The Night Before ChristmasAmericanEdwin Stanton Porterunknownhttps://en.wikipedia.org/wiki/The_Night_Before_Christmas_(1905_film)Scenes are introduced using lines of the poem.[2] Santa Claus, played by Harry Eytinge, is shown feeding real reindeer[4] and finishes his work in the workshop. Meanwhile, the children of a city household hang their stockings and go to bed, but unable to sleep they engage in a pillow fight. Santa Claus leaves his home on a sleigh with his reindeer. He enters the children's house through the chimney, and leaves the presents. The children come down the stairs and enjoy their presents.
    1906Dream of a Rarebit FiendAmericanWallace McCutcheon and Edwin S. Portershorthttps://en.wikipedia.org/wiki/Dream_of_a_Rarebit_Fiend_(1906_film)The Rarebit Fiend gorges on Welsh rarebit at a restaurant. When he leaves, he begins to get dizzy as he starts to hallucinate. He desperately tries to hang onto a lamppost as the world spins all around him. A man helps him get home. He falls into bed and begins having more hallucinatory dreams. During a dream sequence, the furniture begins moving around the room. Imps emerge from a floating Welsh rarebit container and begin poking his head as he sleeps. His bed then begins dancing and spinning wildly around the room before flying out the window with the Fiend in it. The bed floats across the city as the Fiend floats up and off the bed. He hangs off the back and eventually gets caught on a weathervane atop a steeple. His bedclothes tear and he falls from the sky, crashing through his bedroom ceiling. The Fiend awakens from the dream after falling out of his bed.
    1906From Leadville to Aspen: A Hold-Up in the RockiesAmericanFrancis J. Marion and Wallace McCutcheonshort action/crime westernhttps://en.wikipedia.org/wiki/From_Leadville_to_Aspen:_A_Hold-Up_in_the_RockiesThe film features a train traveling through the Rockies and a hold up created by two thugs placing logs on the line. They systematically rob the wealthy occupants at gunpoint and then make their getaway along the tracks and later by a hi-jacked horse and cart.
    1906Kathleen MavourneenAmericanEdwin S. Portershort filmhttps://en.wikipedia.org/wiki/Kathleen_Mavourneen_(1906_film)Irish villager Kathleen is a tenant of Captain Clearfield, who controls local judges and criminals. Her father owes Clearfield a large debt. Terence O'More saves the village from Clearfield, causing a large celebration.\\r\\nFilm historian Charles Musser writes of Porter's adaptation, \"O'More not only rescues Kathleen from the villain but, through marriage, renews the family for another generation.\"[1]
    1907Daniel BooneAmericanWallace McCutcheon and Ediwin S. PorterWilliam Craven, Florence Lawrencebiographicalhttps://en.wikipedia.org/wiki/Daniel_Boone_(1907_film)Boone's daughter befriends an Indian maiden as Boone and his companion start out on a hunting expedition. While he is away, Boone's cabin is attacked by the Indians, who set it on fire and abduct Boone's daughter. Boone returns, swears vengeance, then heads out on the trail to the Indian camp. His daughter escapes but is chased. The Indians encounter Boone, which sets off a huge fight on the edge of a cliff. A burning arrow gets shot into the Indian camp. Boone gets tied to the stake and tortured. The burning arrow sets the Indian camp on fire, causing panic. Boone is rescued by his horse, and Boone has a knife fight in which he kills the Indian chief.[2]
    1907How Brown Saw the Baseball GameAmericanUnknownUnknowncomedyhttps://en.wikipedia.org/wiki/How_Brown_Saw_the_Baseball_GameBefore heading out to a baseball game at a nearby ballpark, sports fan Mr. Brown drinks several highball cocktails. He arrives at the ballpark to watch the game, but has become so inebriated that the game appears to him in reverse, with the players running the bases backwards and the baseball flying back into the pitcher's hand. After the game is over, Mr. Brown is escorted home by one of his friends. When they arrive at Brown's house, they encounter his wife who becomes furious with the friend and proceeds to physically assault him, believing he is responsible for her husband's severe intoxication.[1]
    1907Laughing GasAmericanEdwin Stanton PorterBertha Regustus, Edward Bouldencomedyhttps://en.wikipedia.org/wiki/Laughing_Gas_(film)#1907_FilmThe plot is that of a black woman going to the dentist for a toothache and being given laughing gas. On her way walking home, and in other situations, she can't stop laughing, and everyone she meets \"catches\" the laughter from her, including a vendor and police officers.
    1908The Adventures of DollieAmericanD. W. GriffithArthur V. Johnson, Linda Arvidsondramahttps://en.wikipedia.org/wiki/The_Adventures_of_DollieOn a beautiful summer day a father and mother take their daughter Dollie on an outing to the river. The mother refuses to buy a gypsy's wares. The gypsy tries to rob the mother, but the father drives him off. The gypsy returns to the camp and devises a plan. They return and kidnap Dollie while her parents are distracted. A rescue crew is organized, but the gypsy takes Dollie to his camp. They gag Dollie and hide her in a barrel before the rescue party gets to the camp. Once they leave the gypsies and escapes in their wagon. As the wagon crosses the river, the barrel falls into the water. Still sealed in the barrel, Dollie is swept downstream in dangerous currents. A boy who is fishing in the river finds the barrel, and Dollie is reunited safely with her parents.
    1908The Black ViperAmericanD. W. GriffithD. W. Griffithdramahttps://en.wikipedia.org/wiki/The_Black_ViperA thug accosts a girl as she leaves her workplace but a man rescues her. The thug vows revenge and, with the help of two friends, attacks the girl and her rescuer again as they're going for a walk. This time they succeed in kidnapping the rescuer. He is bound and gagged and taken away in a cart. The girl runs home and gets help from several neighbors. They track the ruffians down to a cabin in the mountains where the gang has trapped their victim and set the cabin on fire. A thug and Rescuer fight on the roof of the house.
    1908A Calamitous ElopementAmericanD.W. GriffithHarry Solter, Linda Arvidsoncomedyhttps://en.wikipedia.org/wiki/A_Calamitous_ElopementA young couple decides to elope after being caught in the midst of a romantic moment by the woman's angry father. They make plans to leave, but a thief discovers their plans and hides in their trunk and waits for the right moment to steal their belongings.
    1908The Call of the WildAmericanD. W. GriffithCharles Insleeadventurehttps://en.wikipedia.org/wiki/The_Call_of_the_Wild_(1908_film)A white girl (Florence Lawrence) rejects a proposal from an Indian brave (Charles Inslee) in this early one-reel Western melodrama. Despite the rejection, the Indian still comes to the girl's defense when she is abducted by his warring tribe. In her first year in films, Florence Lawrence was already the most popular among the Biograph Company's anonymous stock company players. By 1909, she was known the world over as \"The Biograph Girl.\"
    1908A Christmas CarolAmericanUnknownTom Rickettsdramahttps://en.wikipedia.org/wiki/A_Christmas_Carol_(1908_film)No prints of the first American film adaptation of A Christmas Carol are known to exist,[1] but The Moving Picture World magazine provided a scene-by-scene description before the film's release.[2] Scrooge goes into his office and begins working. His nephew, along with three women who wish for Scrooge to donate enter. However, Scrooge dismisses them. On the night of Christmas Eve, his long-dead partner Jacob Marley comes as a ghost, warning him of a horrible fate if he does not change his ways. Scrooge meets three spirits that show Scrooge the real meaning of Christmas, along with his grave, the result of his parsimonious ways. The next morning, he wakes and realizes the error of his ways. Scrooge was then euphoric and generous for the rest of his life.
    1908The Fight for FreedomAmericanD. W. GriffithFlorence Auer, John G. Adolfiwesternhttps://en.wikipedia.org/wiki/The_Fight_for_FreedomThe film opens in a town on the Mexican border. A poker game is going on in the local saloon. One of the players cheats and is shot dead by another of the players, a Mexican named Pedro. In the uproar that follows Pedro is wounded as he escapes from the saloon. The sheriff is called, who tracks Pedro to his home but Pedro kills the sherriff too. While Pedro hides, his wife Juanita, is arrested on suspicion of murdering the sheriff. Pedro rescues her from the town jail and the two head for the Mexican border. Caught by the posse before they reach the border, Juanita is killed and the film ends with Pedro being arrested and taken back to town.
    ", "languages": [ "eng" ], diff --git a/test_unstructured_ingest/src/against-api.sh b/test_unstructured_ingest/src/against-api.sh index 7f2d6a9446..9667e01a1d 100755 --- a/test_unstructured_ingest/src/against-api.sh +++ b/test_unstructured_ingest/src/against-api.sh @@ -37,11 +37,12 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ --chunking-strategy by_page \ --chunk-max-characters 10000 \ --reprocess \ - --output-dir "$OUTPUT_DIR" \ --verbose \ --num-processes "$max_processes" \ --input-path "example-docs/pdf/$TEST_FILE_NAME" \ - --work-dir "$WORK_DIR" + --work-dir "$WORK_DIR" \ + local \ + --output-dir "$OUTPUT_DIR" RESULT_FILE_PATH="$OUTPUT_DIR/$TEST_FILE_NAME.json" # validate that there is at least one table with text_as_html in the results diff --git a/test_unstructured_ingest/src/airtable-diff.sh b/test_unstructured_ingest/src/airtable-diff.sh index 3cd81eff77..3fd3005c29 100755 --- a/test_unstructured_ingest/src/airtable-diff.sh +++ b/test_unstructured_ingest/src/airtable-diff.sh @@ -45,8 +45,9 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ --num-processes "$max_processes" \ --preserve-downloads \ --reprocess \ - --output-dir "$OUTPUT_DIR" \ --work-dir "$WORK_DIR" \ - --verbose + --verbose \ + local \ + --output-dir "$OUTPUT_DIR" "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/src/airtable-large.sh b/test_unstructured_ingest/src/airtable-large.sh index c0bf06fe4e..0e60199591 100755 --- a/test_unstructured_ingest/src/airtable-large.sh +++ b/test_unstructured_ingest/src/airtable-large.sh @@ -48,8 +48,9 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ --num-processes "$max_processes" \ --preserve-downloads \ --reprocess \ - --output-dir "$OUTPUT_DIR" \ - --work-dir "$WORK_DIR" + --work-dir "$WORK_DIR" \ + local \ + --output-dir "$OUTPUT_DIR" # We are expecting fifteen directories: fourteen bases and the parent directory "$SCRIPT_DIR"/check-num-dirs-output.sh 15 "$OUTPUT_FOLDER_NAME" diff --git 
a/test_unstructured_ingest/src/astradb.sh b/test_unstructured_ingest/src/astradb.sh index 1ea211a6bb..1b7843be49 100755 --- a/test_unstructured_ingest/src/astradb.sh +++ b/test_unstructured_ingest/src/astradb.sh @@ -34,8 +34,9 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ --strategy hi_res \ --preserve-downloads \ --reprocess \ - --output-dir "$OUTPUT_DIR" \ --verbose \ - --work-dir "$WORK_DIR" + --work-dir "$WORK_DIR" \ + local \ + --output-dir "$OUTPUT_DIR" "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/src/azure.sh b/test_unstructured_ingest/src/azure.sh index 6744805d6b..9c64353e9a 100755 --- a/test_unstructured_ingest/src/azure.sh +++ b/test_unstructured_ingest/src/azure.sh @@ -30,11 +30,12 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ --strategy hi_res \ --preserve-downloads \ --reprocess \ - --output-dir "$OUTPUT_DIR" \ --verbose \ --account-name azureunstructured1 \ --remote-url abfs://container1/ \ - --work-dir "$WORK_DIR" + --work-dir "$WORK_DIR" \ + local \ + --output-dir "$OUTPUT_DIR" set +e "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/src/biomed-api.sh b/test_unstructured_ingest/src/biomed-api.sh index 82b29f887a..d8a0c6001e 100755 --- a/test_unstructured_ingest/src/biomed-api.sh +++ b/test_unstructured_ingest/src/biomed-api.sh @@ -33,12 +33,13 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ --preserve-downloads \ --re-download \ --reprocess \ - --output-dir "$OUTPUT_DIR" \ --verbose \ --api-from "2019-01-02" \ --api-until "2019-01-02+00:03:10" \ --max-request-time 30 \ --max-retries 5 \ - --work-dir "$WORK_DIR" + --work-dir "$WORK_DIR" \ + local \ + --output-dir "$OUTPUT_DIR" "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/src/biomed-path.sh b/test_unstructured_ingest/src/biomed-path.sh index 12401ed8ab..b45106b506 100755 --- a/test_unstructured_ingest/src/biomed-path.sh +++ 
b/test_unstructured_ingest/src/biomed-path.sh @@ -32,11 +32,12 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ --strategy hi_res \ --preserve-downloads \ --reprocess \ - --output-dir "$OUTPUT_DIR" \ --verbose \ --max-request-time 30 \ --max-retries 5 \ --path "oa_pdf/07/07/sbaa031.073.PMC7234218.pdf" \ - --work-dir "$WORK_DIR" + --work-dir "$WORK_DIR" \ + local \ + --output-dir "$OUTPUT_DIR" "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/src/box.sh b/test_unstructured_ingest/src/box.sh index 3ab2f44b46..6ff5a3dc96 100755 --- a/test_unstructured_ingest/src/box.sh +++ b/test_unstructured_ingest/src/box.sh @@ -39,18 +39,21 @@ if [ -z "$BOX_APP_CONFIG_PATH" ]; then fi RUN_SCRIPT=${RUN_SCRIPT:-unstructured-ingest} + +# shellcheck disable=SC2046 PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ box \ --download-dir "$DOWNLOAD_DIR" \ - --box-app-config "$BOX_APP_CONFIG_PATH" \ + --box-app-config $(cat "$BOX_APP_CONFIG_PATH") \ --remote-url box://utic-test-ingest-fixtures \ --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.data_source.filesize_bytes,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ - --output-dir "$OUTPUT_DIR" \ --num-processes "$max_processes" \ --preserve-downloads \ --recursive \ --reprocess \ --verbose \ - --work-dir "$WORK_DIR" + --work-dir "$WORK_DIR" \ + local \ + --output-dir "$OUTPUT_DIR" "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/src/confluence-diff.sh b/test_unstructured_ingest/src/confluence-diff.sh index dc0f71cd12..f69f90f4e8 100755 --- a/test_unstructured_ingest/src/confluence-diff.sh +++ b/test_unstructured_ingest/src/confluence-diff.sh @@ -39,12 +39,13 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ --num-processes "$max_processes" \ --preserve-downloads \ --reprocess \ - --output-dir "$OUTPUT_DIR" \ --verbose \ --url 
https://unstructured-ingest-test.atlassian.net \ --user-email "$CONFLUENCE_USER_EMAIL" \ --api-token "$CONFLUENCE_API_TOKEN" \ --spaces testteamsp,MFS \ - --work-dir "$WORK_DIR" + --work-dir "$WORK_DIR" \ + local \ + --output-dir "$OUTPUT_DIR" "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/src/confluence-large.sh b/test_unstructured_ingest/src/confluence-large.sh index 790d675b9d..41ac1e3c46 100755 --- a/test_unstructured_ingest/src/confluence-large.sh +++ b/test_unstructured_ingest/src/confluence-large.sh @@ -45,7 +45,6 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ --num-processes "$max_processes" \ --preserve-downloads \ --reprocess \ - --output-dir "$OUTPUT_DIR" \ --verbose \ --url https://unstructured-ingest-test.atlassian.net \ --user-email "$CONFLUENCE_USER_EMAIL" \ @@ -53,7 +52,9 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ --max-num-of-spaces 10 \ --spaces testteamsp1 \ --max-num-of-docs-from-each-space 250 \ - --work-dir "$WORK_DIR" + --work-dir "$WORK_DIR" \ + local \ + --output-dir "$OUTPUT_DIR" OUTPUT_SUBFOLDER_NAME=testteamsp1 diff --git a/test_unstructured_ingest/src/delta-table.sh b/test_unstructured_ingest/src/delta-table.sh index d8ac971456..6fcf0c8cd0 100755 --- a/test_unstructured_ingest/src/delta-table.sh +++ b/test_unstructured_ingest/src/delta-table.sh @@ -38,10 +38,11 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.data_source.date_created,metadata.last_modified,metadata.date_created,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ --download-dir "$DOWNLOAD_DIR" \ --table-uri s3://utic-dev-tech-fixtures/sample-delta-lake-data/deltatable/ \ - --output-dir "$OUTPUT_DIR" \ --storage_options "{\"AWS_REGION\":\"us-east-2\",\"AWS_ACCESS_KEY_ID\":\"$AWS_ACCESS_KEY_ID\",\"AWS_SECRET_ACCESS_KEY\":\"$AWS_SECRET_ACCESS_KEY\"}" \ --preserve-downloads \ --verbose \ - 
--work-dir "$WORK_DIR" + --work-dir "$WORK_DIR" \ + local \ + --output-dir "$OUTPUT_DIR" "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/src/discord.sh b/test_unstructured_ingest/src/discord.sh index ca986e3b0a..e074a145d8 100755 --- a/test_unstructured_ingest/src/discord.sh +++ b/test_unstructured_ingest/src/discord.sh @@ -37,10 +37,11 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ --download-dir "$DOWNLOAD_DIR" \ --preserve-downloads \ --reprocess \ - --output-dir "$OUTPUT_DIR" \ --verbose \ --channels 1099442333440802930,1099601456321003600 \ --token "$DISCORD_TOKEN" \ - --work-dir "$WORK_DIR" + --work-dir "$WORK_DIR" \ + local \ + --output-dir "$OUTPUT_DIR" "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/src/dropbox.sh b/test_unstructured_ingest/src/dropbox.sh index ff2c82998f..5d53e11c57 100755 --- a/test_unstructured_ingest/src/dropbox.sh +++ b/test_unstructured_ingest/src/dropbox.sh @@ -42,11 +42,12 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.data_source.filesize_bytes,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ --preserve-downloads \ --reprocess \ - --output-dir "$OUTPUT_DIR" \ --verbose \ --token "$DROPBOX_ACCESS_TOKEN" \ --recursive \ --remote-url "dropbox://test-input/" \ - --work-dir "$WORK_DIR" + --work-dir "$WORK_DIR" \ + local \ + --output-dir "$OUTPUT_DIR" "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/src/elasticsearch.sh b/test_unstructured_ingest/src/elasticsearch.sh index 9141cde57f..2596eefabd 100755 --- a/test_unstructured_ingest/src/elasticsearch.sh +++ b/test_unstructured_ingest/src/elasticsearch.sh @@ -45,7 +45,6 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ --num-processes "$max_processes" \ --preserve-downloads \ 
--reprocess \ - --output-dir "$OUTPUT_DIR" \ --verbose \ --index-name movies \ --hosts http://localhost:9200 \ @@ -53,6 +52,8 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ --password "$ELASTIC_PASSWORD" \ --fields 'ethnicity,director,plot' \ --work-dir "$WORK_DIR" \ - --batch-size 2 + --batch-size 2 \ + local \ + --output-dir "$OUTPUT_DIR" "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/src/gcs.sh b/test_unstructured_ingest/src/gcs.sh index 5261c11697..a3dec286ca 100755 --- a/test_unstructured_ingest/src/gcs.sh +++ b/test_unstructured_ingest/src/gcs.sh @@ -42,11 +42,12 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.data_source.filesize_bytes,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ --preserve-downloads \ --reprocess \ - --output-dir "$OUTPUT_DIR" \ --verbose \ --service-account-key "$GCP_INGEST_SERVICE_KEY_FILE" \ --recursive \ --remote-url gs://utic-test-ingest-fixtures/ \ - --work-dir "$WORK_DIR" + --work-dir "$WORK_DIR" \ + local \ + --output-dir "$OUTPUT_DIR" "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/src/github.sh b/test_unstructured_ingest/src/github.sh index bea75f3590..87158195d3 100755 --- a/test_unstructured_ingest/src/github.sh +++ b/test_unstructured_ingest/src/github.sh @@ -29,7 +29,7 @@ GH_READ_ONLY_ACCESS_TOKEN=${GH_READ_ONLY_ACCESS_TOKEN:-none} ACCESS_TOKEN_FLAGS="" # to update test fixtures, "export OVERWRITE_FIXTURES=true" and rerun this script if [[ "$GH_READ_ONLY_ACCESS_TOKEN" != "none" ]]; then - ACCESS_TOKEN_FLAGS="--git-access-token $GH_READ_ONLY_ACCESS_TOKEN" + ACCESS_TOKEN_FLAGS="--access-token $GH_READ_ONLY_ACCESS_TOKEN" elif [[ "$CI" == "true" ]]; then echo "Warning: GH_READ_ONLY_ACCESS_TOKEN is not defined in the CI environment." 
echo "This can lead to intermittent failures in test-ingest-github.sh, as non-auth'ed" @@ -47,11 +47,12 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ --strategy hi_res \ --preserve-downloads \ --reprocess \ - --output-dir "$OUTPUT_DIR" \ --verbose \ --url dcneiner/Downloadify \ - --git-file-glob '*.html,*.txt' \ + --file-glob '*.html,*.txt' \ --work-dir "$WORK_DIR" \ - $ACCESS_TOKEN_FLAGS + $ACCESS_TOKEN_FLAGS \ + local \ + --output-dir "$OUTPUT_DIR" "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/src/gitlab.sh b/test_unstructured_ingest/src/gitlab.sh index 1bd01b4882..4bbed043fc 100755 --- a/test_unstructured_ingest/src/gitlab.sh +++ b/test_unstructured_ingest/src/gitlab.sh @@ -33,11 +33,12 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ --strategy hi_res \ --preserve-downloads \ --reprocess \ - --output-dir "$OUTPUT_DIR" \ --verbose \ --git-branch 'v0.0.7' \ --git-file-glob '*.md,*.txt' \ --url https://gitlab.com/gitlab-com/content-sites/docsy-gitlab \ - --work-dir "$WORK_DIR" + --work-dir "$WORK_DIR" \ + local \ + --output-dir "$OUTPUT_DIR" "$SCRIPT_DIR"/check-num-files-output.sh 2 $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/src/google-drive.sh b/test_unstructured_ingest/src/google-drive.sh index 7e580e8a19..a1bc46d3a4 100755 --- a/test_unstructured_ingest/src/google-drive.sh +++ b/test_unstructured_ingest/src/google-drive.sh @@ -44,13 +44,14 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ --strategy hi_res \ --preserve-downloads \ --reprocess \ - --output-dir "$OUTPUT_DIR" \ --verbose \ --drive-id 1OQZ66OHBE30rNsNa7dweGLfRmXvkT_jr \ --service-account-key-path "$GCP_INGEST_SERVICE_KEY_FILE" \ --recursive \ --extensions "pdf,docx" \ - --work-dir "$WORK_DIR" + --work-dir "$WORK_DIR" \ + local \ + --output-dir "$OUTPUT_DIR" set +e "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/src/hubspot.sh b/test_unstructured_ingest/src/hubspot.sh index 
d5b617569a..d4ed043c3c 100755 --- a/test_unstructured_ingest/src/hubspot.sh +++ b/test_unstructured_ingest/src/hubspot.sh @@ -45,12 +45,13 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ --metadata-exclude file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.date_created,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ --num-processes "$max_processes" \ --download-dir "$DOWNLOAD_DIR" \ - --output-dir "$OUTPUT_DIR" \ --api-token "$HUBSPOT_API_TOKEN" \ --object-types "calls,communications,emails,notes,products,tickets" \ --custom-properties '{"products":["my_custom_property"],"tickets":["another_custom_property"]}' \ --work-dir "$WORK_DIR" \ --preserve-downloads \ - --verbose + --verbose \ + local \ + --output-dir "$OUTPUT_DIR" "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/src/jira.sh b/test_unstructured_ingest/src/jira.sh index ce6b4e0494..8e11647b71 100755 --- a/test_unstructured_ingest/src/jira.sh +++ b/test_unstructured_ingest/src/jira.sh @@ -58,7 +58,6 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ --num-processes "$max_processes" \ --preserve-downloads \ --reprocess \ - --output-dir "$OUTPUT_DIR" \ --verbose \ --url https://unstructured-jira-connector-test.atlassian.net \ --user-email "$JIRA_INGEST_USER_EMAIL" \ @@ -66,6 +65,8 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ --projects "JCTP3" \ --boards "1" \ --issues "JCTP2-4,JCTP2-7,JCTP2-8,10012,JCTP2-11" \ - --work-dir "$WORK_DIR" + --work-dir "$WORK_DIR" \ + local \ + --output-dir "$OUTPUT_DIR" "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/src/kafka-local.sh b/test_unstructured_ingest/src/kafka-local.sh index 36b21754fa..9e78fba544 100755 --- a/test_unstructured_ingest/src/kafka-local.sh +++ b/test_unstructured_ingest/src/kafka-local.sh @@ -67,10 +67,11 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ --port 29092 \ --metadata-exclude 
coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ --reprocess \ - --output-dir "$OUTPUT_DIR" \ --verbose \ --work-dir "$WORK_DIR" \ - --confluent false + --confluent false \ + local \ + --output-dir "$OUTPUT_DIR" set +e "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/src/local-embed-bedrock.sh b/test_unstructured_ingest/src/local-embed-bedrock.sh index 285d15a56c..da4ee60c46 100755 --- a/test_unstructured_ingest/src/local-embed-bedrock.sh +++ b/test_unstructured_ingest/src/local-embed-bedrock.sh @@ -29,14 +29,15 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ local \ --num-processes "$max_processes" \ --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_created,metadata.data_source.date_modified,metadata.data_source.date_processed,metadata.data_source.filesize_bytes,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ - --output-dir "$OUTPUT_DIR" \ --verbose \ --reprocess \ --input-path example-docs/book-war-and-peace-1p.txt \ --work-dir "$WORK_DIR" \ --embedding-provider "aws-bedrock" \ --embedding-aws-access-key-id "$AWS_ACCESS_KEY_ID" \ - --embedding-aws-secret-access-key "$AWS_SECRET_ACCESS_KEY" + --embedding-aws-secret-access-key "$AWS_SECRET_ACCESS_KEY" \ + local \ + --output-dir "$OUTPUT_DIR" set +e diff --git a/test_unstructured_ingest/src/local-embed-mixedbreadai.sh b/test_unstructured_ingest/src/local-embed-mixedbreadai.sh index 99168d7ddc..91823d0e9a 100755 --- a/test_unstructured_ingest/src/local-embed-mixedbreadai.sh +++ b/test_unstructured_ingest/src/local-embed-mixedbreadai.sh @@ -28,14 +28,15 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ local \ --num-processes "$max_processes" \ --metadata-exclude 
coordinates,filename,file_directory,metadata.data_source.record_locator.path,metadata.data_source.date_created,metadata.data_source.date_modified,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ - --output-dir "$OUTPUT_DIR" \ --verbose \ --reprocess \ --input-path example-docs/book-war-and-peace-1p.txt \ --work-dir "$WORK_DIR" \ --embedding-provider "mixedbread-ai" \ --embedding-api-key "$MXBAI_API_KEY" \ - --embedding-model-name "mixedbread-ai/mxbai-embed-large-v1" + --embedding-model-name "mixedbread-ai/mxbai-embed-large-v1" \ + local \ + --output-dir "$OUTPUT_DIR" set +e diff --git a/test_unstructured_ingest/src/local-embed-octoai.sh b/test_unstructured_ingest/src/local-embed-octoai.sh index 54ff3e2a08..92291ae8db 100755 --- a/test_unstructured_ingest/src/local-embed-octoai.sh +++ b/test_unstructured_ingest/src/local-embed-octoai.sh @@ -30,13 +30,14 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ local \ --num-processes "$max_processes" \ --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_created,metadata.data_source.date_modified,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ - --output-dir "$OUTPUT_DIR" \ --verbose \ --reprocess \ --input-path example-docs/book-war-and-peace-1p.txt \ --work-dir "$WORK_DIR" \ --embedding-provider "octoai" \ - --embedding-api-key "$OCTOAI_API_KEY" + --embedding-api-key "$OCTOAI_API_KEY" \ + local \ + --output-dir "$OUTPUT_DIR" set +e diff --git a/test_unstructured_ingest/src/local-embed-vertexai.sh b/test_unstructured_ingest/src/local-embed-vertexai.sh index 4ef499bc5b..a83dd798f2 100755 --- a/test_unstructured_ingest/src/local-embed-vertexai.sh +++ b/test_unstructured_ingest/src/local-embed-vertexai.sh @@ -30,14 +30,15 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ local \ --num-processes "$max_processes" \ --metadata-exclude 
coordinates,filename,file_directory,metadata.data_source.date_created,metadata.data_source.date_modified,metadata.data_source.date_processed,metadata.data_source.filesize_bytes,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ - --output-dir "$OUTPUT_DIR" \ --verbose \ --reprocess \ --input-path example-docs/book-war-and-peace-1p.txt \ --work-dir "$WORK_DIR" \ --embedding-provider "vertexai" \ --embedding-api-key "$GCP_INGEST_SERVICE_KEY" \ - --embedding-model-name "textembedding-gecko@001" + --embedding-model-name "textembedding-gecko@001" \ + local \ + --output-dir "$OUTPUT_DIR" set +e diff --git a/test_unstructured_ingest/src/local-embed-voyageai.sh b/test_unstructured_ingest/src/local-embed-voyageai.sh index 83fe3586a4..7eea0c9e0e 100755 --- a/test_unstructured_ingest/src/local-embed-voyageai.sh +++ b/test_unstructured_ingest/src/local-embed-voyageai.sh @@ -30,14 +30,15 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ local \ --num-processes "$max_processes" \ --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_created,metadata.data_source.date_modified,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ - --output-dir "$OUTPUT_DIR" \ --verbose \ --reprocess \ --input-path example-docs/book-war-and-peace-1p.txt \ --work-dir "$WORK_DIR" \ --embedding-provider "voyageai" \ --embedding-api-key "$VOYAGE_API_KEY" \ - --embedding-model-name "voyage-3-large" + --embedding-model-name "voyage-3-large" \ + local \ + --output-dir "$OUTPUT_DIR" set +e diff --git a/test_unstructured_ingest/src/local-embed.sh b/test_unstructured_ingest/src/local-embed.sh index 210a7111c2..3d25844095 100755 --- a/test_unstructured_ingest/src/local-embed.sh +++ b/test_unstructured_ingest/src/local-embed.sh @@ -24,12 +24,13 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ local \ --num-processes "$max_processes" \ --metadata-exclude 
coordinates,filename,file_directory,metadata.data_source.date_created,metadata.data_source.date_modified,metadata.data_source.date_processed,metadata.data_source.filesize_bytes,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ - --output-dir "$OUTPUT_DIR" \ --verbose \ --reprocess \ --input-path example-docs/book-war-and-peace-1p.txt \ --work-dir "$WORK_DIR" \ - --embedding-provider "huggingface" + --embedding-provider "huggingface" \ + local \ + --output-dir "$OUTPUT_DIR" set +e diff --git a/test_unstructured_ingest/src/local-failed-partition.sh b/test_unstructured_ingest/src/local-failed-partition.sh index a230888b30..976693433b 100755 --- a/test_unstructured_ingest/src/local-failed-partition.sh +++ b/test_unstructured_ingest/src/local-failed-partition.sh @@ -45,9 +45,10 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ --strategy fast \ --reprocess \ - --output-dir "$OUTPUT_DIR" \ --verbose \ --input-path "$SCRIPT_DIR"/failed-partition-docs \ - --work-dir "$WORK_DIR" + --work-dir "$WORK_DIR" \ + local \ + --output-dir "$OUTPUT_DIR" check diff --git a/test_unstructured_ingest/src/local-single-file-basic-chunking.sh b/test_unstructured_ingest/src/local-single-file-basic-chunking.sh index 575bd876f8..12da9e1dde 100755 --- a/test_unstructured_ingest/src/local-single-file-basic-chunking.sh +++ b/test_unstructured_ingest/src/local-single-file-basic-chunking.sh @@ -31,10 +31,11 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ --input-path "$ABS_INPUT_PATH" \ --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_created,metadata.data_source.date_modified,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ --num-processes 
"$max_processes" \ - --output-dir "$OUTPUT_DIR" \ --reprocess \ --verbose \ - --work-dir "$WORK_DIR" + --work-dir "$WORK_DIR" \ + local \ + --output-dir "$OUTPUT_DIR" set +e "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/src/local-single-file-chunk-no-orig-elements.sh b/test_unstructured_ingest/src/local-single-file-chunk-no-orig-elements.sh index 051c5fba29..fc8b0a41df 100755 --- a/test_unstructured_ingest/src/local-single-file-chunk-no-orig-elements.sh +++ b/test_unstructured_ingest/src/local-single-file-chunk-no-orig-elements.sh @@ -43,10 +43,11 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ --input-path "$ABS_INPUT_PATH" \ --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_created,metadata.data_source.date_modified,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ --num-processes "$max_processes" \ - --output-dir "$OUTPUT_DIR" \ --reprocess \ --verbose \ - --work-dir "$WORK_DIR" + --work-dir "$WORK_DIR" \ + local \ + --output-dir "$OUTPUT_DIR" set +e "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/src/local-single-file-with-encoding.sh b/test_unstructured_ingest/src/local-single-file-with-encoding.sh index 3cf91223e5..9034abcfbd 100755 --- a/test_unstructured_ingest/src/local-single-file-with-encoding.sh +++ b/test_unstructured_ingest/src/local-single-file-with-encoding.sh @@ -25,12 +25,13 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ local \ --num-processes "$max_processes" \ --metadata-exclude filename,file_directory,metadata.data_source.date_created,metadata.data_source.date_modified,metadata.data_source.date_processed,metadata.data_source.filesize_bytes,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ - --output-dir "$OUTPUT_DIR" \ --encoding cp1252 \ --verbose \ --reprocess \ --input-path 
example-docs/fake-html-cp1252.html \ - --work-dir "$WORK_DIR" + --work-dir "$WORK_DIR" \ + local \ + --output-dir "$OUTPUT_DIR" set +e "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/src/local-single-file-with-pdf-infer-table-structure.sh b/test_unstructured_ingest/src/local-single-file-with-pdf-infer-table-structure.sh index 4c0ab5b36d..1597ffe83a 100755 --- a/test_unstructured_ingest/src/local-single-file-with-pdf-infer-table-structure.sh +++ b/test_unstructured_ingest/src/local-single-file-with-pdf-infer-table-structure.sh @@ -25,13 +25,14 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ local \ --num-processes "$max_processes" \ --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_created,metadata.data_source.date_modified,metadata.data_source.date_processed,metadata.data_source.filesize_bytes,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ - --output-dir "$OUTPUT_DIR" \ --skip-infer-table-types "xls,xlsx" \ --strategy hi_res \ --verbose \ --reprocess \ --input-path "$SCRIPT_DIR"/example-docs/ \ - --work-dir "$WORK_DIR" + --work-dir "$WORK_DIR" \ + local \ + --output-dir "$OUTPUT_DIR" set +e "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/src/local-single-file.sh b/test_unstructured_ingest/src/local-single-file.sh index 249746ed8a..d39cccc8c3 100755 --- a/test_unstructured_ingest/src/local-single-file.sh +++ b/test_unstructured_ingest/src/local-single-file.sh @@ -27,12 +27,13 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ local \ --num-processes "$max_processes" \ --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_created,metadata.data_source.date_modified,metadata.data_source.date_processed,metadata.data_source.filesize_bytes,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ - --output-dir "$OUTPUT_DIR" 
\ --additional-partition-args '{"strategy":"ocr_only", "languages":["ind", "est"]}' \ --verbose \ --reprocess \ --input-path "$ABS_INPUT_PATH" \ - --work-dir "$WORK_DIR" + --work-dir "$WORK_DIR" \ + local \ + --output-dir "$OUTPUT_DIR" set +e "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/src/local.sh b/test_unstructured_ingest/src/local.sh index 3c7139cebe..eb4eed4e6b 100755 --- a/test_unstructured_ingest/src/local.sh +++ b/test_unstructured_ingest/src/local.sh @@ -26,11 +26,12 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ --strategy hi_res \ --reprocess \ - --output-dir "$OUTPUT_DIR" \ --verbose \ --file-glob "*.html" \ --input-path example-docs \ - --work-dir "$WORK_DIR" + --work-dir "$WORK_DIR" \ + local \ + --output-dir "$OUTPUT_DIR" "$SCRIPT_DIR"/check-num-files-output.sh 15 $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/src/mongodb.sh b/test_unstructured_ingest/src/mongodb.sh index 8429d7e1fd..a2afdaee88 100755 --- a/test_unstructured_ingest/src/mongodb.sh +++ b/test_unstructured_ingest/src/mongodb.sh @@ -33,7 +33,6 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ --metadata-exclude file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.date_created,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ --num-processes "$max_processes" \ --download-dir "$DOWNLOAD_DIR" \ - --output-dir "$OUTPUT_DIR" \ --uri "$MONGODB_URI" \ --database "$MONGODB_DATABASE_NAME" \ --collection "$SOURCE_MONGO_COLLECTION" \ @@ -41,6 +40,8 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ --preserve-downloads \ --reprocess \ --batch-size 2 \ - --verbose + --verbose \ + local \ + --output-dir "$OUTPUT_DIR" "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git 
a/test_unstructured_ingest/src/notion.sh b/test_unstructured_ingest/src/notion.sh index e80a11bfad..91b790f74b 100755 --- a/test_unstructured_ingest/src/notion.sh +++ b/test_unstructured_ingest/src/notion.sh @@ -35,12 +35,13 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ --metadata-exclude coordinates,filename,file_directory,metadata.last_modified,metadata.data_source.date_processed,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ --download-dir "$DOWNLOAD_DIR" \ --notion-api-key "$NOTION_API_KEY" \ - --output-dir "$OUTPUT_DIR" \ --database-ids "122b2c22996b435b9de2ee0e9d2b04bc" \ --num-processes "$max_processes" \ --recursive \ --verbose \ --work-dir "$WORK_DIR" \ - --max-retry-time 30 + --max-retry-time 30 \ + local \ + --output-dir "$OUTPUT_DIR" "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/src/onedrive.sh b/test_unstructured_ingest/src/onedrive.sh index d38b7ab80c..fb4e8e7f51 100755 --- a/test_unstructured_ingest/src/onedrive.sh +++ b/test_unstructured_ingest/src/onedrive.sh @@ -38,7 +38,6 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ --strategy hi_res \ --preserve-downloads \ --reprocess \ - --output-dir "$OUTPUT_DIR" \ --verbose \ --client-cred "$MS_CLIENT_CRED" \ --client-id "$MS_CLIENT_ID" \ @@ -46,6 +45,8 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ --user-pname "$MS_USER_PNAME" \ --path '/utic-test-ingest-fixtures' \ --recursive \ - --work-dir "$WORK_DIR" + --work-dir "$WORK_DIR" \ + local \ + --output-dir "$OUTPUT_DIR" "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/src/opensearch.sh b/test_unstructured_ingest/src/opensearch.sh index f1d7c150ed..5d76a8ba2f 100755 --- a/test_unstructured_ingest/src/opensearch.sh +++ b/test_unstructured_ingest/src/opensearch.sh @@ -43,7 +43,6 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ --num-processes "$max_processes" \ --preserve-downloads \ --reprocess \ - --output-dir 
"$OUTPUT_DIR" \ --verbose \ --index-name movies \ --hosts http://localhost:9247 \ @@ -52,6 +51,8 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ --use-ssl \ --fields 'ethnicity,director,plot' \ --work-dir "$WORK_DIR" \ - --batch-size 2 + --batch-size 2 \ + local \ + --output-dir "$OUTPUT_DIR" "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/src/outlook.sh b/test_unstructured_ingest/src/outlook.sh index a1a5a48784..77bfeeb197 100755 --- a/test_unstructured_ingest/src/outlook.sh +++ b/test_unstructured_ingest/src/outlook.sh @@ -37,7 +37,6 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ --num-processes "$max_processes" \ --preserve-downloads \ --reprocess \ - --output-dir "$OUTPUT_DIR" \ --verbose \ --client-cred "$MS_CLIENT_CRED" \ --client-id "$MS_CLIENT_ID" \ @@ -45,6 +44,8 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ --user-email "$MS_USER_EMAIL" \ --outlook-folders IntegrationTest \ --recursive \ - --work-dir "$WORK_DIR" + --work-dir "$WORK_DIR" \ + local \ + --output-dir "$OUTPUT_DIR" "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/src/pdf-fast-reprocess.sh b/test_unstructured_ingest/src/pdf-fast-reprocess.sh index b27e32e8ef..1f22cab06c 100755 --- a/test_unstructured_ingest/src/pdf-fast-reprocess.sh +++ b/test_unstructured_ingest/src/pdf-fast-reprocess.sh @@ -35,11 +35,12 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ --num-processes "$max_processes" \ --strategy fast \ --reprocess \ - --output-dir "$OUTPUT_DIR" \ --verbose \ --file-glob "*.pdf" \ --input-path "$INPUT_PATH" \ --recursive \ - --work-dir "$WORK_DIR" + --work-dir "$WORK_DIR" \ + local \ + --output-dir "$OUTPUT_DIR" "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/src/s3-compression.sh b/test_unstructured_ingest/src/s3-compression.sh index 7ee066f3a3..aded270857 100755 --- a/test_unstructured_ingest/src/s3-compression.sh +++ 
b/test_unstructured_ingest/src/s3-compression.sh @@ -29,12 +29,13 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ --strategy fast \ --preserve-downloads \ --reprocess \ - --output-dir "$OUTPUT_DIR" \ --verbose \ --remote-url s3://utic-dev-tech-fixtures/small-pdf-set-w-compression/ \ --anonymous \ --work-dir "$WORK_DIR" \ - --uncompress + --uncompress \ + local \ + --output-dir "$OUTPUT_DIR" "$SCRIPT_DIR"/check-num-files-output.sh 12 $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/src/s3-minio.sh b/test_unstructured_ingest/src/s3-minio.sh index 85dd8f85d0..3a63def407 100755 --- a/test_unstructured_ingest/src/s3-minio.sh +++ b/test_unstructured_ingest/src/s3-minio.sh @@ -42,11 +42,12 @@ AWS_SECRET_ACCESS_KEY=$secret_key AWS_ACCESS_KEY_ID=$access_key \ --strategy hi_res \ --preserve-downloads \ --reprocess \ - --output-dir "$OUTPUT_DIR" \ --verbose \ --remote-url s3://utic-dev-tech-fixtures/ \ --endpoint-url http://localhost:9000 \ - --work-dir "$WORK_DIR" + --work-dir "$WORK_DIR" \ + local \ + --output-dir "$OUTPUT_DIR" set +e "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/src/s3.sh b/test_unstructured_ingest/src/s3.sh index bfdc72c1cb..228f2b9b25 100755 --- a/test_unstructured_ingest/src/s3.sh +++ b/test_unstructured_ingest/src/s3.sh @@ -32,11 +32,12 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ --strategy hi_res \ --preserve-downloads \ --reprocess \ - --output-dir "$OUTPUT_DIR" \ --verbose \ --remote-url s3://utic-dev-tech-fixtures/small-pdf-set/ \ --anonymous \ - --work-dir "$WORK_DIR" + --work-dir "$WORK_DIR" \ + local \ + --output-dir "$OUTPUT_DIR" set +e "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/src/salesforce.sh b/test_unstructured_ingest/src/salesforce.sh index 54ebd05558..d726b8e9c0 100755 --- a/test_unstructured_ingest/src/salesforce.sh +++ b/test_unstructured_ingest/src/salesforce.sh @@ -55,8 +55,9 @@ PYTHONPATH=${PYTHONPATH:-.} 
"$RUN_SCRIPT" \ --num-processes "$max_processes" \ --preserve-downloads \ --reprocess \ - --output-dir "$OUTPUT_DIR" \ --verbose \ - --work-dir "$WORK_DIR" + --work-dir "$WORK_DIR" \ + local \ + --output-dir "$OUTPUT_DIR" "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/src/sftp.sh b/test_unstructured_ingest/src/sftp.sh index e3312224df..50325902e4 100755 --- a/test_unstructured_ingest/src/sftp.sh +++ b/test_unstructured_ingest/src/sftp.sh @@ -41,12 +41,13 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ --metadata-exclude file_directory,metadata.data_source.date_processed,metadata.data_source.filesize_bytes,metadata.data_source.date_created,metadata.data_source.date_modified,metadata.last_modified,metadata.data_source.version \ --preserve-downloads \ --reprocess \ - --output-dir "$OUTPUT_DIR" \ --verbose \ --recursive \ --username foo \ --password bar \ --remote-url sftp://localhost:47474/upload/ \ - --work-dir "$WORK_DIR" + --work-dir "$WORK_DIR" \ + local \ + --output-dir "$OUTPUT_DIR" "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/src/sharepoint-with-permissions.sh b/test_unstructured_ingest/src/sharepoint-with-permissions.sh index cc16c1135c..766fcfd08a 100755 --- a/test_unstructured_ingest/src/sharepoint-with-permissions.sh +++ b/test_unstructured_ingest/src/sharepoint-with-permissions.sh @@ -48,7 +48,6 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ --strategy hi_res \ --preserve-downloads \ --reprocess \ - --output-dir "$OUTPUT_DIR" \ --verbose \ --client-cred "$SHAREPOINT_CRED" \ --client-id "$SHAREPOINT_CLIENT_ID" \ @@ -57,6 +56,8 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ --permissions-client-cred "$SHAREPOINT_PERMISSIONS_APP_CRED" \ --permissions-tenant "$SHAREPOINT_PERMISSIONS_TENANT" \ --recursive \ - --work-dir "$WORK_DIR" + --work-dir "$WORK_DIR" \ + local \ + --output-dir "$OUTPUT_DIR" "$SCRIPT_DIR"/check-diff-expected-output.sh 
$OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/src/sharepoint.sh b/test_unstructured_ingest/src/sharepoint.sh index ea07410d2f..9ac1444252 100755 --- a/test_unstructured_ingest/src/sharepoint.sh +++ b/test_unstructured_ingest/src/sharepoint.sh @@ -40,13 +40,14 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ --strategy hi_res \ --preserve-downloads \ --reprocess \ - --output-dir "$OUTPUT_DIR" \ --verbose \ --client-cred "$SHAREPOINT_CRED" \ --client-id "$SHAREPOINT_CLIENT_ID" \ --site "$SHAREPOINT_SITE" \ --recursive \ - --work-dir "$WORK_DIR" + --work-dir "$WORK_DIR" \ + local \ + --output-dir "$OUTPUT_DIR" set +e "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/src/slack.sh b/test_unstructured_ingest/src/slack.sh index 503e67240b..0fb4a710e8 100755 --- a/test_unstructured_ingest/src/slack.sh +++ b/test_unstructured_ingest/src/slack.sh @@ -38,12 +38,13 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ --strategy hi_res \ --preserve-downloads \ --reprocess \ - --output-dir "$OUTPUT_DIR" \ --verbose \ --channels C07ABKJ83C6 \ --token "${SLACK_TOKEN}" \ --start-date 2023-04-01 \ --end-date 2024-07-01T07:47:00-07:00 \ - --work-dir "$WORK_DIR" + --work-dir "$WORK_DIR" \ + local \ + --output-dir "$OUTPUT_DIR" "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/src/wikipedia.sh b/test_unstructured_ingest/src/wikipedia.sh index 21a55e5725..657853c9ab 100755 --- a/test_unstructured_ingest/src/wikipedia.sh +++ b/test_unstructured_ingest/src/wikipedia.sh @@ -32,9 +32,10 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ --num-processes "$max_processes" \ --strategy hi_res \ --preserve-downloads \ - --output-dir "$OUTPUT_DIR" \ --verbose \ --page-title "Open Source Software" \ - --work-dir "$WORK_DIR" + --work-dir "$WORK_DIR" \ + local \ + --output-dir "$OUTPUT_DIR" "$SCRIPT_DIR"/check-num-files-output.sh 3 $OUTPUT_FOLDER_NAME diff --git 
a/test_unstructured_ingest/test-ingest-src.sh b/test_unstructured_ingest/test-ingest-src.sh index 7fca5ede6c..3a0305b781 100755 --- a/test_unstructured_ingest/test-ingest-src.sh +++ b/test_unstructured_ingest/test-ingest-src.sh @@ -18,57 +18,17 @@ EVAL_OUTPUT_ROOT=${EVAL_OUTPUT_ROOT:-$SCRIPT_DIR} export OMP_THREAD_LIMIT=1 all_tests=( - # NOTE(scanny): This test is disabled because it routinely flakes on OCR differencs - # 's3.sh' 's3-minio.sh' 'astradb.sh' 'azure.sh' - 'biomed-api.sh' - 'biomed-path.sh' # NOTE(yuming): The pdf-fast-reprocess test should be put after any tests that save downloaded files 'pdf-fast-reprocess.sh' - 'salesforce.sh' - 'box.sh' - 'discord.sh' - 'dropbox.sh' - 'github.sh' - 'gitlab.sh' - 'google-drive.sh' - 'wikipedia.sh' 'local.sh' - # 'slack.sh' - 'against-api.sh' - 'gcs.sh' - 'kafka-local.sh' - #'onedrive.sh' - #'outlook.sh' - 'elasticsearch.sh' - 'confluence-diff.sh' - 'confluence-large.sh' - # NOTE(christine): This test is disabled because it is triggering 404 client errors to the API - # 'airtable-diff.sh' - # # NOTE(ryan): This test is disabled because it is triggering too many requests to the API - # 'airtable-large.sh' 'local-single-file.sh' 'local-single-file-basic-chunking.sh' 'local-single-file-chunk-no-orig-elements.sh' 'local-single-file-with-encoding.sh' 'local-single-file-with-pdf-infer-table-structure.sh' - 'notion.sh' - 'delta-table.sh' - 'jira.sh' - # 'sharepoint.sh' - # 'sharepoint-with-permissions.sh' - 'hubspot.sh' - 'local-embed.sh' - 'local-embed-bedrock.sh' - 'local-embed-octoai.sh' - 'local-embed-vertexai.sh' - 'local-embed-voyageai.sh' - 'local-embed-mixedbreadai.sh' - 'sftp.sh' - 'opensearch.sh' - 'mongodb.sh' ) full_python_matrix_tests=( @@ -79,8 +39,6 @@ full_python_matrix_tests=( 'local-single-file-with-pdf-infer-table-structure.sh' # NOTE(scanny): This test is disabled because it routinely flakes on OCR differences # 's3.sh' - 'google-drive.sh' - 'gcs.sh' 'azure.sh' ) diff --git a/unstructured/__version__.py 
b/unstructured/__version__.py index 1c6678160c..657c99ab3b 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.17.6-dev1" # pragma: no cover +__version__ = "0.17.6-dev2" # pragma: no cover From b585df15881219ff3b1dcb06208b4e5cd3987ae8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Po=C5=82om?= <124889668+mpolomdeepsense@users.noreply.github.com> Date: Tue, 29 Apr 2025 15:29:44 +0200 Subject: [PATCH 12/15] fix: Add missing diffstat command to test_json_to_html CI job (#3992) Removed some additional html fixtures. The original json fixtures from which html ones were generated, were removed some time ago. --- .github/workflows/ci.yml | 1 + Makefile | 1 + .../biomed-api/65/11/main.PMC6312790.pdf.html | 563 ------------------ .../biomed-api/75/29/main.PMC6312793.pdf.html | 329 ---------- .../07/07/sbaa031.073.PMC7234218.pdf.html | 53 -- 5 files changed, 2 insertions(+), 945 deletions(-) delete mode 100644 test_unstructured_ingest/expected-structured-output-html/pdf-fast-reprocess/biomed-api/65/11/main.PMC6312790.pdf.html delete mode 100644 test_unstructured_ingest/expected-structured-output-html/pdf-fast-reprocess/biomed-api/75/29/main.PMC6312793.pdf.html delete mode 100644 test_unstructured_ingest/expected-structured-output-html/pdf-fast-reprocess/biomed-path/07/07/sbaa031.073.PMC7234218.pdf.html diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 81a9f20cfb..94e2d08612 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -345,6 +345,7 @@ jobs: PYTHONPATH: ${{ github.workspace }} run: | source .venv/bin/activate + sudo apt-get install diffstat ./test_unstructured_ingest/check-diff-expected-output-html.sh test_unstructured_api_unit: diff --git a/Makefile b/Makefile index 80600a051a..fe1350d5f5 100644 --- a/Makefile +++ b/Makefile @@ -340,4 +340,5 @@ run-jupyter: .PHONY: html-fixtures-update html-fixtures-update: + rm -r test_unstructured_ingest/expected-structured-output-html && 
\ test_unstructured_ingest/structured-json-to-html.sh test_unstructured_ingest/expected-structured-output-html diff --git a/test_unstructured_ingest/expected-structured-output-html/pdf-fast-reprocess/biomed-api/65/11/main.PMC6312790.pdf.html b/test_unstructured_ingest/expected-structured-output-html/pdf-fast-reprocess/biomed-api/65/11/main.PMC6312790.pdf.html deleted file mode 100644 index 5dfa8c4b41..0000000000 --- a/test_unstructured_ingest/expected-structured-output-html/pdf-fast-reprocess/biomed-api/65/11/main.PMC6312790.pdf.html +++ /dev/null @@ -1,563 +0,0 @@ - - - - - - Codestin Search App - - -
    - Data in Brief 22 (2019) 451–457 -
    -

    - Contents lists available at ScienceDirect -

    -

    - Data in Brief -

    -

    - journal homepage: www.elsevier.com/locate/dib -

    -

    - Data Article -

    -

    - Data on environmental sustainable corrosion inhibitor for stainless steel in aggressive environment -

    -

    - Omotayo Sanni n, Abimbola Patricia I. Popoola -

    -

    - Department of Chemical, Metallurgical and Materials Engineering, Tshwane University of Technology, Pretoria, South Africa -

    -

    - a r t i c l e i n f o -

    -

    - a b s t r a c t -

    -

    - Article history: Received 31 August 2018 Received in revised form 17 November 2018 Accepted 27 November 2018 Available online 30 November 2018 -

    -

    - Keywords: Corrosion Stainless steel Inhibitor Sulphuric acid -

    -

    - This data article contains data related to the research article entitled “enhanced corrosion resistance of stainless steel Type 316 in sulphuric acid solution using eco-friendly waste product” (Sanni et al., 2018). In this data article, a comprehensive effect of waste product and optimized process parameter of the inhibitor in 0.5 M H2SO4 solution was presented using weight loss and potentiody- the inhibitor namic polarization techniques. The presence of (egg shell powder) influenced corrosion resistance of stainless steel. Inhibition efficiency value of 94.74% was recorded as a result of inhibition of the steel by the ionized molecules of the inhibiting compound of the egg shell powder influencing the redox mechan- ism reactions responsible for corrosion and surface deterioration. -

    -

    - & 2018 Published by Elsevier Inc. This is an open access article under the CC BY-NC-ND license (http://creativecommons.org/licenses/by-nc-nd/4.0/). -

    -

    - Specification table -

    -

    - Subject area More specific subject area Surface science and engineering Type of data -

    -

    - Materials engineering -

    -

    - Table and figure -

    -

    - n Corresponding author. tayo.sanni@yahoo.com; SanniO@tut.ac.za -

    -

    - E-mail address: tayo.sanni@yahoo.com (O. Sanni). -

    -

    - https://doi.org/10.1016/j.dib.2018.11.134 2352-3409/& 2018 Published by Elsevier Inc. This is an open access article under the CC BY-NC-ND license (http://creativecommons.org/licenses/by-nc-nd/4.0/). -

    -

    - 452 -

    -
    - O. Sanni, A.P.I. Popoola / Data in Brief 22 (2019) 451–457 -
    -

    - How data were acquired -

    -

    - Data format Experimental factors -

    -

    - Experimental features Data source location -

    -

    - Accessibility Related research article -

    -

    - The cleaned and weighed specimen was suspended in beakers con- taining 0.5 M H2SO4 solution of different concentrations of egg shell powder. The pre-weighed stainless steel samples were retrieved from the test solutions after every 24 h, cleaned appropriately, dried and reweighed. Raw, analyzed The difference between the weight at a given time and the initial weight of the specimen was taken as the weight loss, which was used to calculate the corrosion rate and inhibition efficiency. Inhibitor concentration, exposure time Department of Chemical, Metallurgical and Materials Engineering, Tshwane University of Technology, Pretoria, South Africa Data are available within this article O. Sanni, A. P. I. Popoola, and O. S. I. Fayomi, Enhanced corrosion resistance of stainless steel type 316 in sulphuric acid solution using eco-friendly waste product, Results in Physics, 9 (2018) 225–230. -

    -

    - Value of the data -

    -

    - (cid:1) Data presented here provide optimum conditions of waste material as inhibitor for stainless steel Type 316 in 0.5 M H2SO4 medium. The given data describe the inhibitive performance of eco-friendly egg shell powder on austenitic stainless steel Type 316 corrosion in sulphuric acid environment. -

    -

    - (cid:1) The data obtained for the inhibition of waste product (egg shell powder) on stainless steel Type 316 can be used as basis in determining the inhibitive performance of the same inhibitor in other environments. -

    -

    - (cid:1) The data can be used to examine the relationship between the process variable as it affect the -

    -

    - nature of inhibition of metals. -

    -
  • - 1. Data -
  • -

    - The results of the experiment are presented in this session. The results obtained from weight loss method for stainless steel Type 316 immersed in 0.5 M H2SO4 solution in the absence and presence of different concentrations of egg shell powder (ES) are presented in Figs.1–3 respectively. It can be seen clearly from these Figures that the efficiency of egg shell powder increase with the inhibitor con- centration, The increase in its efficiency could be as a result of increase in the constituent molecule -

    -

    - ) g m -

    -

    - ( -

    -

    - s s o -

    -

    - l -

    -

    - t h g e W -

    -

    - i -

    -

    - 30 -

    -

    - 20 -

    -

    - 10g 8g 6g 4g 2g Control -

    -

    - 10 -

    -

    - 48 -

    -

    - 96 -

    -

    - 144 -

    -

    - 192 -

    -

    - Exposure Time (Hours) -

    -

    - Fig. 1. Weight loss versus exposure time for stainless steel presence of ES. -

    -

    - immersed in 0.5 M H2SO4 solution in the absence and -

    -
    - O. Sanni, A.P.I. Popoola / Data in Brief 22 (2019) 451–457 -
    -

    - 2.7 -

    -

    - ) r a e y / m m -

    -

    - ( e t a r n o s o r r o C -

    -

    - i -

    -

    - 1.8 -

    -

    - 0.9 -

    -

    - 10g 8g 6g 4g 2g Control -

    -

    - 24 -

    -

    - 48 -

    -

    - 72 -

    -

    - 96 -

    -

    - 120 -

    -

    - 144 -

    -

    - 168 -

    -

    - 192 -

    -

    - Exposure time -

    -

    - Fig. 2. Corrosion rate versus exposure time for stainless steel immersed in 0.5 M H2SO4 solution in the absence and presence of ES. -

    -

    - 100 -

    -

    - 90 -

    -

    - ) -

    -

    - % -

    -

    - ( -

    -

    - y c n e c i f f -

    -

    - i -

    -

    - E n o i t i b h n I -

    -

    - i -

    -

    - 80 -

    -

    - 70 -

    -

    - 60 -

    -

    - 50 -

    -

    - 40 -

    -

    - 30 -

    -

    - 2g 4g 6g 8g 10g -

    -

    - 20 -

    -

    - 10 -

    -

    - 0 -

    -

    - 20 -

    -

    - 40 -

    -

    - 60 -

    -

    - 80 -

    -

    - 100 -

    -

    - 120 -

    -

    - 140 -

    -

    - 160 -

    -

    - 180 -

    -

    - Exposure Time (Hours) -

    -

    - Fig. 3. Inhibition efficiency versus exposure time for stainless steel immersed in 0.5 M H2SO4 solution in the presence of ES. -

    -

    - number of inhibitor adsorbed on the surface of stainless steel at higher concentration, in order for the active sites of the stainless steel to be protected with the inhibitor molecules. Cathodic and anodic polarized potential are measured in the presence and absence of ES. Fig. 4 shows the cathodic and anodic polarization curves for stainless steel in 0.5 M H2SO4 solution at different ES concentrations. The electrochemical variables such as polarization resistance (PR), corrosion potential (Ecorr), cor- rosion current (icorr), anodic Tafel constant (ba), cathodic Tafel constant (bc) and corrosion rate (mm/ year) values are presented in Table 1. From the polarization curves and electrochemical parameter, icorr value decreased with the addition of inhibitor in 0.5 M H2SO4. Conversely, the icorr further decrease with an increase in inhibitor concentration indicating that the inhibition effects increase with an increase in the egg shell concentration. The process of egg shell inhibition could be attributed to the formation of egg shell powder adsorbed on stainless steel surface protecting corrosion of stainless steel in H2SO4 medium. The likely mechanism is the egg shell adsorption on stainless steel surface through the heteroatoms electron pair and the conjugated systems in egg shell molecular structure as shown in Fig. 1. When the concentration of inhibitor was increased from 2 to 10 g, the corrosion rate values drastically decreased this result show that waste egg shell powder is an effective corrosion inhibitor for stainless steel in H2SO4 solution. The shift in corrosion potential of stainless steel from Tafel curves and electrochemical data indicate that the inhibitor is a mixed-type corrosion inhibitor. -

    -

    - 453 -

    -

    - 454 -

    -
    - O. Sanni, A.P.I. Popoola / Data in Brief 22 (2019) 451–457 -
    -

    - Fig. 4. Anodic and cathodic polarization curve of stainless steel in 0.5 M H2SO4 solution in the presence and absence of ES. -

    -

    - Table 1 Potentiodynamic polarization data for stainless steel in the absence and presence of ES in 0.5 M H2SO4 solution. -

    -

    - Inhibitor concentration (g) -

    -

    - bc (V/dec) -

    -

    - ba (V/dec) -

    -

    - Ecorr (V) -

    -

    - icorr (A/cm2) -

    -

    - Polarization resistance (Ω) -

    -

    - Corrosion rate (mm/year) -

    -

    - 0 2 4 6 8 10 -

    -

    - 0.0335 1.9460 0.0163 0.3233 0.1240 0.0382 -

    -

    - 0.0409 0.0596 0.2369 0.0540 0.0556 0.0086 -

    -

    - (cid:3)0.9393 (cid:3)0.8276 (cid:3)0.8825 (cid:3)0.8027 (cid:3)0.5896 (cid:3)0.5356 -

    -

    - 0.0003 0.0002 0.0001 5.39E-05 5.46E-05 1.24E-05 -

    -

    - 24.0910 121.440 42.121 373.180 305.650 246.080 -

    -

    - 2.8163 1.5054 0.9476 0.4318 0.3772 0.0919 -

    -

    - The plot of inhibitor concentration over degree of surface coverage versus inhibitor concentration gives a straight line as shown in Fig. 5. The strong correlation reveals that egg shell adsorption on stainless surface in 0.5 M H2SO4 follow Langmuir adsorption isotherm. Figs. 6–8 show the SEM/EDX surface morphology analysis of stainless steel. Figs. 7 and 8 are the SEM/EDX images of the stainless steel specimens without and with inhibitor after weight loss experiment in sulphuric acid medium. The stainless steel surface corrosion product layer in the absence of inhibitor was porous and as a result gives no corrosion protection. With the presence of ES, corrosion damage was minimized, with an evidence of ES present on the metal surface as shown in Fig. 8. -

    -

    - 12 -

    -

    - C/0 -

    -

    - 10 -

    -

    - 8 -

    -

    - 0 / C -

    -

    - 6 -

    -

    - 4 -

    -

    - 2 -

    -

    - 2 -

    -

    - 4 -

    -

    - 6 -

    -

    - 8 -

    -

    - 10 -

    -

    - Concentration (g) -

    -

    - Fig. 5. Langmuir adsorption isotherm of ES. -

    -
    - O. Sanni, A.P.I. Popoola / Data in Brief 22 (2019) 451–457 -
    -

    - Fig. 6. SEM/EDX image of as-received stainless steel. -

    -

    - Fig. 7. SEM/EDX image of stainless steel immersed in 0.5 M H2SO4 solution without inhibitor. -

    -

    - Fig. 8. SEM/EDX image of stainless steel immersed in 0.5 M H2SO4 solution with the presence of inhibitor. -

    -

    - 455 -

    -

    - 456 -

    -
    - O. Sanni, A.P.I. Popoola / Data in Brief 22 (2019) 451–457 -
    -
  • - 2. Experimental design, materials and methods -
  • -

    - 2.1. Material -

    -

    - Austenitic stainless steel Type 316 was used in this study with chemical composition reported in [1,2]. The chemicals used were of annular grade. The inhibitor concentrations are in the range of 2, 4, 6, 8 and 10 g [3–5]. The structural formula of egg shell powder is shown in Fig. 9. -

    -

    - Fig. 9. Chemical structure of egg shell powder. -

    -

    - 2.2. Weight loss method -

    -

    - This physical measurement was carried out in order to provide direct result on how the corrosive environment affects the test sample. The cleaned and weighed specimen was suspended in beakers with the aid of glass hooks and rods with the test solution of ES at different concentration (2, 4, 6, 8 and 10 g). The pre-weighed specimen was retrieved from the test solution after every 24 h, cleaned, dried and reweighed. The difference between the weight at a given time and the initial weight of the specimen was taken as the weight loss which was used to calculate corrosion rate and inhibition efficiency. -

    -

    - The corrosion rate (CR) was calculated using Eq. (1) [1–5] -

    -

    - (cid:1) Þ ¼ 87:6W DAT -

    -

    - (cid:3) -

    -

    - Corrosion rate CRð -

    -

    - where: W is weight loss in mg, A is specimen surface area, T is immersion period in hours and D is the specimen density. From the corrosion rate, the surface coverage (θ) and inhibition efficiencies (IE %) were determined using Eqs. (2) and (3) respectively -

    -

    - θ ¼ CRo(cid:3)CR -

    -

    - CRo -

    -

    - IE ð%Þ ¼ CRo(cid:3)CR -

    -

    - CRo -

    -

    - x -

    -

    - 100 1 -

    -

    - where: CRo and CR are the corrosion rate in absence and presence of inhibitor respectively. -

    -

    - 2.3. Potentiodynamic polarization method -

    -

    - The potentiodynamic polarization method was performed on the prepared test samples immersed in 0.5 M H2SO4 solution in the presence and absence of different ES concentrations. A three electrode system was used; stainless steel Type 316 plate as working electrode with an exposed area of 1.0 cm2, platinum rod as counter electrode and silver chloride electrode as reference electrode. The electrode was polished, degreased in acetone and thoroughly rinsed with distilled water before the experiment. Current density against applied potential was plotted. The slope of the linear part in anodic and cathodic plots gives anodic and cathodic constants according to the Stern–Geary equation, and the -

    -

    - ð1Þ -

    -

    - ð2Þ -

    -

    - ð3Þ -

    -
    - O. Sanni, A.P.I. Popoola / Data in Brief 22 (2019) 451–457 -
    -

    - steps of the linear polarization plot are substituted to get corrosion current. Nova software was used with linear polarization resistance (LPR) and the current was set to 10 mA (maximum) and 10 nA (minimum). LSV staircase parameter start potential (cid:3)1.5 v, step potential 0.001 m/s and stop potential of þ1.5 v set was used in this study. -

    -

    - Acknowledgements -

    -

    - This work was supported by the National Research Foundation of South Africa and the Tshwane -

    -

    - University of Technology Pretoria South Africa. -

    -

    - Transparency document. Supporting information -

    -

    - Transparency document associated with this article can be found in the online version at https://doi. -

    -

    - org/10.1016/j.dib.2018.11.134. -

    -

    - References -

    -

    - [1] O. Sanni, A.P.I. Popoola, O.S.I. Fayomi, Enhanced corrosion resistance of stainless steel type 316 in sulphuric acid solution -

    -

    - using eco-friendly waste product, Results Phys. 9 (2018) 225–230. -

    -

    - [2] O. Sanni, A.P.I. Popoola, A. Kolesnikov, Constitutive modeling for prediction of optimal process parameters in corrosion -

    -

    - inhibition of austenitic stainless steel (Type 316)/acidic medium, Mater. Res. Express. 5 (10) (2018) 1–15. -

    -

    - [3] O. Sanni, A.P.I. Popoola, O.S.I. Fayomi, The inhibitive study of egg shell powder on UNS N08904 austenitic stainless steel -

    -

    - corrosion in chloride solution, Def. Technol. 14 (2018) 463–468. -

    -

    - [4] O. Sanni, A.P.I. Popoola, O.S.I. Fayomi, C.A. Loto, A comparative study of inhibitive effect of waste product on stainless steel corrosion in sodium chloride/sulfuric acid environments, Metallogr. Microstruct. Anal. (2018) 1–17. https://doi.org/10.1007/ s13632-018-0495-5. -

    -

    - [5] O. Sanni, A.P.I. Popoola, O.S.I. Fayomi, Inhibition of engineering material in sulphuric acid solution using waste product, Contributed Papers from Materials Science and Technology (MS&T18), 2018. 〈https://doi.org/10.7449/2018/MST_2018_254_261〉. -

    -

    - 457 -

    - - diff --git a/test_unstructured_ingest/expected-structured-output-html/pdf-fast-reprocess/biomed-api/75/29/main.PMC6312793.pdf.html b/test_unstructured_ingest/expected-structured-output-html/pdf-fast-reprocess/biomed-api/75/29/main.PMC6312793.pdf.html deleted file mode 100644 index 9c6a0058ce..0000000000 --- a/test_unstructured_ingest/expected-structured-output-html/pdf-fast-reprocess/biomed-api/75/29/main.PMC6312793.pdf.html +++ /dev/null @@ -1,329 +0,0 @@ - - - - - - Codestin Search App - - -
    - Data in Brief 22 (2019) 484–487 -
    -

    - Contents lists available at ScienceDirect -

    -

    - Data in Brief -

    -

    - journal homepage: www.elsevier.com/locate/dib -

    -

    - Data Article -

    -

    - A benchmark dataset for the multiple depot vehicle scheduling problem -

    -

    - Sarang Kulkarni a,b,c,n, Mohan Krishnamoorthy d,e, Abhiram Ranade f, Andreas T. Ernst c, Rahul Patil b -

    -

    - a IITB-Monash Research Academy, IIT Bombay, Powai, Mumbai 400076, India b SJM School of Management, IIT Bombay, Powai, Mumbai 400076, India c School of Mathematical Sciences, Monash University, Clayton, VIC 3800, Australia d Department of Mechanical and Aerospace Engineering, Monash University, Clayton, VIC 3800, Australia e School of Information Technology and Electrical Engineering, The University of Queensland, QLD 4072, Australia f Department of Computer Science and Engineering, IIT Bombay, Powai, Mumbai 400076, India -

    -

    - a r t i c l e i n f o -

    -

    - a b s t r a c t -

    -

    - Article history: Received 21 November 2018 Received in revised form 13 December 2018 Accepted 15 December 2018 Available online 18 December 2018 -

    -

    - This data article presents a description of a benchmark dataset for the multiple depot vehicle scheduling problem (MDVSP). The MDVSP is to assign vehicles from different depots to timetabled trips to minimize the total cost of empty travel and waiting. The dataset has been developed to evaluate the heuristics of the MDVSP that are presented in “A new formulation and a column generation-based heuristic for the multiple depot vehicle scheduling problem” (Kulkarni et al., 2018). The dataset contains 60 problem instances of varying size. Researchers can use the dataset to evaluate the future algorithms for the MDVSP and compare the performance with the existing algorithms. The dataset includes a program that can be used to generate new problem instances of the MDVSP. -

    -

    - © 2018 Published by Elsevier Inc. This is an open access article under the CC BY-NC-ND license (http://creativecommons.org/licenses/by-nc-nd/4.0/). -

    -

    - DOI of original article: https://doi.org/10.1016/j.trb.2018.11.007 * Corresponding author at: IITB-Monash Research Academy, IIT Bombay, Powai, Mumbai 400076, India. -

    -

    - E-mail address: sarangkulkarni@iitb.ac.in (S. Kulkarni). -

    -

    - https://doi.org/10.1016/j.dib.2018.12.055 2352-3409/© 2018 Published by Elsevier Inc. This is an open access article under the CC BY-NC-ND license (http://creativecommons.org/licenses/by-nc-nd/4.0/). -

    -
    - S. Kulkarni et al. / Data in Brief 22 (2019) 484–487 -
    -

    - 485 -

    -

    - Specifications table -

    -

    - Subject area Operations research More specific subject area Vehicle scheduling Type of data How data were acquired -

    -

    - Tables, text files Artificially generated by a C++ program on Intel® Xeon® CPU E5-2670 v2 with Linux operating system. Raw Sixty randomly generated instances of the MDVSP with the number of depots in (8,12,16) and the number of trips in (1500, 2000, 2500, 3000) Randomly generated instances IITB-Monash Research Academy, IIT Bombay, Powai, Mumbai, India. Data can be downloaded from https://orlib.uqcloud.net/ Kulkarni, S., Krishnamoorthy, M., Ranade, A., Ernst, A.T. and Patil, R., 2018. A new formulation and a column generation-based heuristic for the multiple depot vehicle scheduling problem. Transportation Research Part B: Methodological, 118, pp. 457–487 [3]. -

    -

    - Data format Experimental factors -

    -

    - Experimental features Data source location Data accessibility Related research article -

    -

    - Value of the data -

    -

    - • The dataset contains 60 different problem instances of the MDVSP that can be used to evaluate the performance of the algorithms for the MDVSP. -

    -

    - • The data provide all the information that is required to model the MDVSP by using the existing mathematical formulations. -

    -

    - • All the problem instances are available for use without any restrictions. • The benchmark solutions and solution time for the problem instances are presented in [3] and can be used for the comparison. -

    -

    - • The dataset includes a program that can generate similar problem instances of different sizes. -

    -
  • - 1. Data -
  • -

    - The dataset contains 60 different problem instances of the multiple depot vehicle scheduling problem (MDVSP). Each problem instance is provided in a separate file. Each file is named as ‘RN-m-n-k.dat’, where ‘m’, ‘n’, and ‘k’ denote the number of depots, the number of trips, and the instance number for the size, ‘$(m, n)$’, respectively. For example, the problem instance, ‘RN-8-1500-01.dat’, is the first problem instance with 8 depots and 1500 trips. For the number of depots, m, we used three values, 8, 12, and 16. The four values for the number of trips, n, are 1500, 2000, 2500, and 3000. For each size, $(m, n)$, five instances are provided. The dataset can be downloaded from https://orlib.uqcloud.net. -

    -

    - For each problem instance, the following information is provided: • The number of depots $(m)$, • The number of trips $(n)$, • The number of locations $(l)$, • The number of vehicles at each depot, • For each trip $i \in \{1, 2, \ldots, n\}$, a start time, $t_i^s$, a start location, $l_i^s$, an end time, $t_i^e$, and an end location, $l_i^e$, and • The travel time, $\delta_{ij}$, between any two locations $i, j \in \{1, \ldots, l\}$. -

    -

    - All times are in minutes and integers. The planning duration is from 5 a.m. to around midnight. Each instance has two classes of trips, short trips and long trips, with 40% short trips and 60% long trips. The duration of a short trip is less than a total of 45 min and the travel time between the start -

    -

    - 486 -

    -
    - S. Kulkarni et al. / Data in Brief 22 (2019) 484–487 -
    -

    - and end location of the trip. A long trip is about 3–5 h in duration and has the same start and end location. For all instances, $m \le l$ and the locations $1, \ldots, m$ correspond to depots, while the remaining locations only appear as trip start and end locations. -

    -

    - A trip $j$ can be covered after trip $i$ by the same vehicle, if $t_j^s \ge t_i^e + \delta_{l_i^e l_j^s}$. If $l_i^e \ne l_j^s$, the vehicle must travel empty from $l_i^e$ to $l_j^s$, otherwise, the vehicle may require waiting at $l_i^e$ for the duration of $(t_j^s - t_i^e)$. A schedule is given by the sequence in which a vehicle can cover the trips. The MDVSP is to determine the minimum number of schedules to cover all trips that minimizes total time in waiting and empty travel. The following requirements must be satisfied: -

    -
  • - 1. Each schedule should start and end at the same depot. 2. Each trip should be covered by only one vehicle. 3. The number of schedules that start from a depot should not exceed the number of vehicles at the depot. -
  • -

    - A sufficient number of vehicles are provided to maintain the feasibility of an instance. For each instance size $(m, n)$, Table 1 provides the average of the number of locations, the number of times, the number of vehicles, and the number of possible empty travels, over five instances. The number of locations includes m distinct locations for depots and the number of locations at which various trips start or end. The number of times includes the start and the end time of the planning horizon and the start/end times for the trips. The number of vehicles is the total number of vehicles from all the depots. The number of possible empty travels is the number of possible connections between trips that require a vehicle travelling empty between two consecutive trips in a schedule. -

    -

    - The description of the file for each problem instance is presented in Table 2. The first line in the file provides the number of depots $(m)$, the number of trips, $(n)$, and the number of locations $(l)$, in the problem instance. The next n lines present the information for n trips. Each line corresponds to a trip, $i \in \{1, \ldots, n\}$, and provides the start location, the start time, the end location, and the end time of trip i. The next l lines present the travel times between any two locations, $i, j \in \{1, \ldots, l\}$. -

    -

    - The dataset also includes a program ‘GenerateInstance.cpp’ that can be used to generate new instances. The program takes three inputs, the number of depots $(m)$, the number of trips $(n)$, and the number of instances for each size $(m, n)$. -

    -

    - Table 1 Average number of locations, times, vehicles and empty travels for each instance size. -

    -

    - Instance size (m, n) -

    -

    - Average number of -

    -

    - Locations -

    -

    - Times -

    -

    - Vehicles -

    -

    - Possible empty travels -

    -

    - (8, 1500) (8, 2000) (8, 2500) (8, 3000) (12, 1500) (12, 2000) (12, 2500) (12, 3000) (16, 1500) (16, 2000) (16, 2500) (16, 3000) -

    -

    - 568.40 672.80 923.40 977.00 566.00 732.60 875.00 1119.60 581.80 778.00 879.00 1087.20 -

    -

    - 975.20 1048.00 1078.00 1113.20 994.00 1040.60 1081.00 1107.40 985.40 1040.60 1083.20 1101.60 -

    -

    - 652.20 857.20 1082.40 1272.80 642.00 861.20 1096.00 1286.20 667.80 872.40 1076.40 1284.60 -

    -

    - 668,279.40 1,195,844.80 1,866,175.20 2,705,617.00 674,191.00 1,199,659.80 1,878,745.20 2,711,180.40 673,585.80 1,200,560.80 1,879,387.00 2,684,983.60 -

    -
    - S. Kulkarni et al. / Data in Brief 22 (2019) 484–487 -
    -

    - Table 2 Description of file format for each problem instance. -

    -

    - Number of lines -

    -

    - Number of columns in each line -

    -

    - Description -

    -

    - 1 1 n -

    -

    - l -

    -

    - 3 m 4 -

    -

    - l -

    -

    - The number of depots, the number of trips, and the number of locations. The number of vehicles $r_d$ at each depot $d$. One line for each trip, $i = 1, 2, \ldots, n$. Each line provides the start location $l_i^s$, the start time $t_i^s$, the end location $l_i^e$ and the end time $t_i^e$ for the corresponding trip. Each element, $\delta_{ij}$, where $i, j \in \{1, 2, \ldots, l\}$, refers to the travel time between location $i$ and location $j$. -

    -
  • - 2. Experimental design, materials, and methods -
  • -

    - The procedure presented by Carpaneto et al. in [1] is used to generate the problem instances. The same procedure has been used by Pepin et al. in [4] to generate the benchmark dataset of the MDVSP. A detailed description of the procedure is presented in [3]. -

    -

    - Our dataset provides start/end location and time of trips as well as the travel time between any two locations. The location and time information is required to model the MDVSP on a time-space network. The feasible connections and the cost of connections between the trips can be obtained as discussed in [3]. Thus, the dataset has all the information that is required to model the MDVSP on the time-space network (see [2]) as well as the connection-network (see [5]). The benchmark solutions for all the problem instances are presented in [3]. -

    -

    - Transparency document. Supporting information -

    -

    - Transparency document associated with this article can be found in the online version at https://doi.org/10.1016/j.dib.2018.12.055. -

    -

    - References -

    -

    - [1] G. Carpaneto, M. Dell'Amico, M. Fischetti, P. Toth, A branch and bound algorithm for the multiple depot vehicle scheduling -

    -

    - problem, Networks 19 (5) (1989) 531–548. -

    -

    - [2] N. Kliewer, T. Mellouli, L. Suhl, A time–space network based exact optimization model for multi-depot bus scheduling, Eur. -

    -

    - J. Oper. Res. 175 (3) (2006) 1616–1627. -

    -

    - [3] S. Kulkarni, M. Krishnamoorthy, A. Ranade, A.T. Ernst, R. Patil, A new formulation and a column generation-based heuristic -

    -

    - for the multiple depot vehicle scheduling problem, Transp. Res. Part B Methodol. 118 (2018) 457–487. -

    -

    - [4] A.S. Pepin, G. Desaulniers, A. Hertz, D. Huisman, A comparison of five heuristics for the multiple depot vehicle scheduling -

    -

    - problem, J. Sched. 12 (1) (2009) 17. -

    -

    - [5] C.C. Ribeiro, F. Soumis, A column generation approach to the multiple-depot vehicle scheduling problem, Oper. Res. 42 (1) -

    -

    - (1994) 41–52. -

    -

    - 487 -

    - - diff --git a/test_unstructured_ingest/expected-structured-output-html/pdf-fast-reprocess/biomed-path/07/07/sbaa031.073.PMC7234218.pdf.html b/test_unstructured_ingest/expected-structured-output-html/pdf-fast-reprocess/biomed-path/07/07/sbaa031.073.PMC7234218.pdf.html deleted file mode 100644 index fc5c096764..0000000000 --- a/test_unstructured_ingest/expected-structured-output-html/pdf-fast-reprocess/biomed-path/07/07/sbaa031.073.PMC7234218.pdf.html +++ /dev/null @@ -1,53 +0,0 @@ - - - - - - Codestin Search App - - -
    - S32 -
    -

    - ns; 40 mg/day=3.6%, p<0.05; 80 mg/day=4.9%, p<0.01; 120 mg/day=9.3%, p<0.001; PM dosing group: 20 mg/day=-0.4%, ns; 40 mg/day=2.8%, p<0.05; 80 mg/day=0.2%, ns; 160 mg/day=5.8%, p<0.05). There was no clear dose-dependent trend associated with nausea and RD was similar between AM and PM dosing groups (AM dosing group: 20 mg/day=0.2%, ns; 40 mg/day=3.8%, p<0.05; 80 mg/day=3.8%, ns; 120 mg/day=6.6%, ns; PM dosing group: 20 mg/day=-1.6%, ns; 40 mg/day=-1.7%, ns; 80 mg/day=5.5%, p<0.01; 160 mg/day=2.8%, ns). Discussion: The risk of adverse events in the treatment of schizophrenia with lurasidone can vary depending on the timing of administration. In particular, for akathisia and somnolence, the incidence risks were reduced when lurasidone was administered in the PM. Unlike with AM administration, dose-dependence in the risks of these adverse events was not observed with lurasidone PM administration. The timing of lurasidone administration could be considered in an effort to minimize potential adverse events. -

    -

    - S6. SLEEP ENDOPHENOTYPES OF SCHIZOPHRENIA: A HIGH-DENSITY EEG STUDY IN DRUG-NAÏVE, FIRST EPISODE PSYCHOSIS PATIENTS -

    -

    - Anna Castelnovo1, Cecilia Casetta2, Francesco Donati3, Renata del Giudice3, Caroline Zangani3, Simone Sarasso3, Armando D’Agostino*3 1Faculty of Biomedical Sciences, Università della Svizzera Italiana, Switzerland; 2Institute of Psychiatry, Psychology and Neuroscience, King’s College London, England; 3Università degli Studi di Milano, Italy -

    -

    - Background: Slow waves, the hallmark of the deep nonrapid eye movement sleep electroencephalogram (EEG), are critical for restorative sleep and brain plasticity. They arise from the synchronous depolarization and hyperpolarization of millions of cortical neurons and their proper generation and propagation relies upon the integrity of widespread cortico-thalamic networks. Slow wave abnormalities have been reported in patients with Schizophrenia, although with partially contradictory results, probably related to antipsychotic and sedative medications. Recently, their presence and delineation have been convincingly shown in first-episode psychosis patients (FEP). However, clear evidence of this biomarker at the onset of the disease, prior to any psychopharmacological intervention, remains limited. Moreover, no attempt has been made to elucidate the prognostic meaning of this finding. Methods: We collected whole night sleep high-density electroencephalography recordings (64-channel BrainAmp, Brain Products GmbH, Gilching, Germany) in 20 drug-naive FEP patients and 20 healthy control subjects (HC). Several clinical psychometric scales as well as neurocognitive tests were administered to all subjects in order to better define psychopathological status and vulnerability. EEG slow wave activity (SWA, spectral power between 1 and 4 Hz) and several slow wave parameters, including density and amplitude, were computed at each electrode location. Along with a group analysis between FEP and HC, a subgroup analysis was also computed between patients who showed a progression of symptoms to full-blown Schizophrenia (SCZ, n = 10) over the next 12-month follow-up and those who did not (OTH, n = 10). Results: Sleep macro-architecture was globally preserved in FEP patients. SWA (1–4 Hz) was lower in FEP compared to HC but this difference didn’t reach statistical significance. 
Slow wave density was decreased in FEP compared to HC, with a significance that survived multiple comparison correction over a large fronto-central cluster. Mean amplitude was preserved. At the subgroup analysis, these results were largely driven by the subgroup of patients with a confirmed diagnosis of SCZ at a 12-month follow-up. Indeed, no difference could be found between OTH and HC, while a strong significance was still evident between SCZ and HC. -

    - -
    - Poster Session I -
    -

    - Discussion: Our data confirm previous findings on reduced slow wave density in FEP, and expand them to acute subjects, before any treatment is prescribed. This is in line with available data on diffuse abnormalities of cortico-cortical and cortico-thalamic networks in these patients. Interestingly, our data also offer preliminary evidence that this deficit is specific for SCZ, as it appears to differentiate patients who developed SCZ from those with other diagnoses at follow-up. Given the traveling properties of slow waves, future research should establish their potential as markers of connectivity in SCZ. -

    -

    - S7. INVESTIGATING THE LINK BETWEEN THE PERIPHERAL ENDOCANNABINOID SYSTEM AND CENTRAL GLUTAMATERGIC NEUROTRANSMISSION IN EARLY PSYCHOSIS: A 7T-MRS STUDY -

    -

    - Amedeo Minichino*1, Beata Godlewska1, Philip Cowen1, Philip Burnet1, Belinda Lennox1 1University of Oxford -

    -

    - Background: Meta-analytic evidence showed increased levels of peripheral endocannabinoid metabolites in psychotic illness. Alterations in the endocannabinoid system are believed to compromise glutamate and dopamine transmission, which play a central role in pathophysiological models of psychosis. I will present preliminary data from an ongoing high-field proton magnetic resonance spectroscopy (MRS) study aimed at investigating the association between peripheral levels of endocannabinoid system metabolites and central glutamate metabolism in individuals at their first non-affective psychotic episode (NA-FEP) and healthy controls. Methods: We expect to recruit 17 NA-FEP and 20 healthy controls by January 2020. Currently, we have recruited 12 NA-FEP and 18 healthy controls from two different research facilities (Imperial College London and University of Oxford) as part of a cross-sectional study. Participants underwent MRS scanning at 7-T with voxels placed in right dorsolateral prefrontal cortex (right-DLPFC), anterior cingulate cortex (ACC), and occipital cortex. Neuro-metabolites will be calculated using the unsuppressed water signal as reference. Endocannabinoid metabolites were quantified from serum samples, collected during the same imaging session. Results: Analyses are ongoing. Based on previous evidence, expected findings are: (i) reduced glutamate levels in the ACC and right-DLPFC of NA-FEP compared to controls; (ii) increased peripheral endocannabinoid metabolites in NA-FEP compared to controls; and (iii) inverse association between peripheral endocannabinoid metabolites and glutamate levels in ACC and right-DLPFC in NA-FEP. Discussion: This study will help clarify the contribution of the peripheral endocannabinoid system to central brain mechanisms of key relevance for psychotic illness. It will also add further evidence to the limited literature on high-resolution characterisation of brain metabolites in early psychosis. 
Strengths of the study include: (i) use of high-field MRS, which allows the estimation of glutamate-related compounds at higher precision than at lower field strength; (ii) reduced heterogeneity of the clinical sample (only male and NA-FEP). Limitations: small sample size and cross-sectional design. -

    -

    - S8. GRIN1 PROMOTER METHYLATION CHANGES IN BLOOD OF EARLY-ONSET PSYCHOTIC PATIENTS AND UNAFFECTED SIBLINGS WITH CHILDHOOD TRAUMA -

    -

    - Camila Loureiro*1, Corsi-Zuelli Fabiana1, Fachim Helene Aparecida1, Shuhama Rosana1, Menezes Paulo Rossi1, Dalton Caroline F2, -

    -

    - AQ3 -

    - - From 604c4a7c5e06381f6480c811bcd9caea949a1366 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Po=C5=82om?= <124889668+mpolomdeepsense@users.noreply.github.com> Date: Tue, 29 Apr 2025 15:29:58 +0200 Subject: [PATCH 13/15] fix: failing build (#3993) Successful build and test: https://github.com/Unstructured-IO/unstructured/actions/runs/14730300234/job/41342657532 Failing test_json_to_html CI job fix here: https://github.com/Unstructured-IO/unstructured/pull/3992 --- scripts/docker-smoke-test.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/docker-smoke-test.sh b/scripts/docker-smoke-test.sh index 1d0950e923..0e66e05ae4 100755 --- a/scripts/docker-smoke-test.sh +++ b/scripts/docker-smoke-test.sh @@ -41,7 +41,7 @@ await_container docker cp test_unstructured_ingest $CONTAINER_NAME:/app docker cp requirements/ingest $CONTAINER_NAME:/app/requirements/ingest docker exec -u root "$CONTAINER_NAME" /bin/bash -c "chown -R notebook-user:notebook-user /app/test_unstructured_ingest" -docker exec "$CONTAINER_NAME" /bin/bash -c "/app/test_unstructured_ingest/src/wikipedia.sh" +docker exec "$CONTAINER_NAME" /bin/bash -c "/app/test_unstructured_ingest/src/local.sh" result=$? exit $result From b814ece39f5a66e459f4ddcd21d7dafa882bb572 Mon Sep 17 00:00:00 2001 From: Yao You Date: Mon, 5 May 2025 13:08:11 -0500 Subject: [PATCH 14/15] fix: properly handle the case when an element's text is None (#3995) Some elements, like `Image`, can have `None` as its `text` attribute's value. In that case current chunking logic fails because it expects the field to always have a length or can be split. The fix is to update the logic as `element.text or ""` for checking length and add flow control to early exit to avoid calling split on `None`. 
--- CHANGELOG.md | 5 +++-- test_unstructured/chunking/test_base.py | 11 +++++++++++ unstructured/__version__.py | 2 +- unstructured/chunking/base.py | 4 +++- 4 files changed, 18 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ad5dea531f..a9b4c3ca53 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.17.6-dev2 +## 0.17.6 ### Enhancements @@ -10,6 +10,7 @@ Two executions of the same code, on the same file, produce different results. Th This makes it impossible to write stable unit tests, for example, or to obtain reproducible results. - **Do not use NLP to determine element types for extracted elements with hi_res.** This avoids extraneous Title elements in hi_res outputs. This only applies to *extracted* elements, meaning text objects that are found outside of Object Detection objects which get mapped to *inferred* elements. (*extracted* and *inferred* elements get merged together to form the list of `Element`s returned by `pdf_partition()`) - Resolve open CVEs +- Properly handle the case when an element's `text` attribute is None ## 0.17.5 @@ -48,7 +49,7 @@ This makes it impossible to write stable unit tests, for example, or to obtain r ### Features ### Fixes -- **Fixes wrong detection of office files** certain office files wrongly identified as .ZIP when office(.docx,.xlsx and .pptx) files containing files other than word/document.xml, xl/workbook.xml and ppt/presentation.xml respectively will now be identified correctly by looking for word/document\*.xml, xl/workbook\*.xml and ppt/presentation\*.xml +- **Fixes wrong detection of office files** certain office files wrongly identified as .ZIP when office(.docx,.xlsx and .pptx) files containing files other than word/document.xml, xl/workbook.xml and ppt/presentation.xml respectively will now be identified correctly by looking for word/document\*.xml, xl/workbook\*.xml and ppt/presentation\*.xml ## 0.17.2 diff --git a/test_unstructured/chunking/test_base.py 
b/test_unstructured/chunking/test_base.py index f63e738a7c..ffaa699cac 100644 --- a/test_unstructured/chunking/test_base.py +++ b/test_unstructured/chunking/test_base.py @@ -31,6 +31,7 @@ CompositeElement, Element, ElementMetadata, + Image, PageBreak, Table, TableChunk, @@ -234,6 +235,10 @@ def it_accumulates_elements_added_to_it(self): assert builder._text_length == 112 assert builder._remaining_space == 36 + def it_will_fit_when_element_has_none_as_text(self): + builder = PreChunkBuilder(opts=ChunkingOptions()) + assert builder.will_fit(Image(None)) + def it_will_fit_an_oversized_element_when_empty(self): builder = PreChunkBuilder(opts=ChunkingOptions()) assert builder.will_fit(Text("abcd " * 200)) @@ -405,6 +410,12 @@ def and_it_knows_it_is_NOT_equal_to_an_object_that_is_not_a_PreChunk(self): pre_chunk = PreChunk([], overlap_prefix="", opts=ChunkingOptions()) assert pre_chunk != 42 + def it_can_handle_element_with_none_as_text(self): + pre_chunk = PreChunk( + [Image(None), Text("hello")], overlap_prefix="", opts=ChunkingOptions() + ) + assert pre_chunk._text == "hello" + @pytest.mark.parametrize( ("max_characters", "combine_text_under_n_chars", "expected_value"), [ diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 657c99ab3b..29149d1540 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.17.6-dev2" # pragma: no cover +__version__ = "0.17.6" # pragma: no cover diff --git a/unstructured/chunking/base.py b/unstructured/chunking/base.py index 695393c55c..17ece85a47 100644 --- a/unstructured/chunking/base.py +++ b/unstructured/chunking/base.py @@ -387,7 +387,7 @@ def will_fit(self, element: Element) -> bool: if self._text_length > self._opts.soft_max: return False # -- don't add an element if it would increase total size beyond the hard-max -- - return not self._remaining_space < len(element.text) + return not self._remaining_space < len(element.text or "") @property def 
_remaining_space(self) -> int: @@ -503,6 +503,8 @@ def _iter_text_segments(self) -> Iterator[str]: if self._overlap_prefix: yield self._overlap_prefix for e in self._elements: + if e.text is None: + continue text = " ".join(e.text.strip().split()) if not text: continue From e3417d7e98b8ffba47ed75c65be6cff3fc465764 Mon Sep 17 00:00:00 2001 From: Austin Walker Date: Thu, 8 May 2025 17:57:05 -0400 Subject: [PATCH 15/15] fix: Fix for Pillow error when extracting PNG images (#3998) When I tried to partition a PNG file and extract images, I got an error from Pillow: ``` WARNING unstructured:pdf_image_utils.py:230 Image Extraction Error: Skipping the failed image Traceback (most recent call last): File "/Users/austin/.pyenv/versions/unstructured/lib/python3.10/site-packages/PIL/JpegImagePlugin.py", line 666, in _save rawmode = RAWMODE[im.mode] KeyError: 'RGBA' ``` The issue is that a PNG has an additional layer that cannot be saved off in jpeg format. We can fix this with a quick conversion. I added a png test case that is now passing with this fix. --- CHANGELOG.md | 9 +++++++++ .../partition/pdf_image/test_pdf_image_utils.py | 1 + unstructured/__version__.py | 2 +- unstructured/partition/pdf_image/pdf_image_utils.py | 5 +++++ 4 files changed, 16 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a9b4c3ca53..20a4bcaf71 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,12 @@ +## 0.17.7-dev0 + +### Enhancements + +### Features + +### Fixes +- **Fix image extraction for PNG files.** When `extract_image_block_to_payload` is True, and the image is a PNG, we get a Pillow error. We need to remove the PNG transparency layer before saving the image. 
+ ## 0.17.6 ### Enhancements diff --git a/test_unstructured/partition/pdf_image/test_pdf_image_utils.py b/test_unstructured/partition/pdf_image/test_pdf_image_utils.py index bfb09b762a..1be79e92a0 100644 --- a/test_unstructured/partition/pdf_image/test_pdf_image_utils.py +++ b/test_unstructured/partition/pdf_image/test_pdf_image_utils.py @@ -73,6 +73,7 @@ def test_convert_pdf_to_image_raises_error(filename=example_doc_path("embedded-i [ (example_doc_path("pdf/layout-parser-paper-fast.pdf"), False), (example_doc_path("img/layout-parser-paper-fast.jpg"), True), + (example_doc_path("img/english-and-korean.png"), True), ], ) @pytest.mark.parametrize("element_category_to_save", [ElementType.IMAGE, ElementType.TABLE]) diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 29149d1540..d53993104e 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.17.6" # pragma: no cover +__version__ = "0.17.7-dev0" # pragma: no cover diff --git a/unstructured/partition/pdf_image/pdf_image_utils.py b/unstructured/partition/pdf_image/pdf_image_utils.py index a7e98aa2fa..4365b8dba5 100644 --- a/unstructured/partition/pdf_image/pdf_image_utils.py +++ b/unstructured/partition/pdf_image/pdf_image_utils.py @@ -204,6 +204,11 @@ def save_elements( image_path = image_paths[page_index] image = Image.open(image_path) cropped_image = image.crop(padded_bbox) + + # PNG images with transparency need to be converted before saving + if cropped_image.mode == "RGBA": + cropped_image = cropped_image.convert("RGB") + if extract_image_block_to_payload: buffered = BytesIO() cropped_image.save(buffered, format="JPEG")