From a4bf8918424da9699c1a92ac9e1f955ec91f3712 Mon Sep 17 00:00:00 2001 From: Frank Sauerburger Date: Fri, 9 Jul 2021 21:29:32 +0200 Subject: [PATCH 1/2] Test support of chars beyond BMP in PDF output Test that character beyond the Basic Multilingual Plane are supported in PDF files with Type 42 fonts. The example tests STIX Sans math chars and a char from an old script. --- .../test_text/text_pdf_chars_beyond_bmp.pdf | Bin 0 -> 9428 bytes lib/matplotlib/tests/test_text.py | 8 ++++++++ 2 files changed, 8 insertions(+) create mode 100644 lib/matplotlib/tests/baseline_images/test_text/text_pdf_chars_beyond_bmp.pdf diff --git a/lib/matplotlib/tests/baseline_images/test_text/text_pdf_chars_beyond_bmp.pdf b/lib/matplotlib/tests/baseline_images/test_text/text_pdf_chars_beyond_bmp.pdf new file mode 100644 index 0000000000000000000000000000000000000000..8890790d2ea21383ef3f54dd03102d898d7ad544 GIT binary patch literal 9428 zcmc(Fc|4Te`#-728by{OjmlD4W-(*NlI&Zu??w$KyTJ@ulFE{?hLD|X*-{}TOGTrU zt?Wxg_H5a+{q9kEl+Wk+KHoonukZZhp38Ngb8Y9G_w_!*t0}K21Q8acasg=9K;U zSMKLu6%ZOEqGOG*v~jcoA%47Ws9|o4wIG1Rwg&&(9LP=wWE>rF1Uv`={$YU}U=7w0 z5TJg1SI1h~U}SObfDL2~Q3O~R34)2jg%Kb$U?9c?P~LW58;i%ex>x}5AUpgk9^?@1 z2nKoJ?dU1m*aQB8L=^3TfXHJlaF#&)&SM>|2-YCTkDvl(;|VTUj02r}V(4ut+$->b zfVgRc9_3p|D~XLas#FK%sE_20AIaq5RXulGd*fPo#`rBm4zsZeugQ^yu%hpHgVx5C zJ;&$+ee!H6h(dWXuVP$fh^Rcuf!td)kuzV*aM>&;1P$f~KT$q0MO@9nqL-Z-Z2R9B zz6yLAXDGv+sdN6mr=i~&wBa;EfLNdbGaS?L>gwYB^|XChzoo8~JNZzC&vJ7{D8&!- z-YoldYUp!XyoQF*2g4%tWmVoy`YCOM$54iq^P{^(599V4B_ZudvmZHgxd)?zf*xM0 zpYZlR{d_^Gb24{S5SABn_ox$IF7gBIkhWS7gHOZ7J0XwDgf2EHJUez*CVnpB5#{op z1DqCiXm!Czp6)$pNp$OD=;caD=EHeSJwXH`khkug*=+Bb)kq-z+6bXlt^Y|+r ztp;AEy6s7uNvjoClHY@2jQ3fGDH)lDSPw9q9Z&{W?x%Y+5FnI(eYu|}mB@S?5-6ko zNL}{cBa(Une4J0zjx}AK6Mh*j*WOl(=yq$hZM$qx5K1bRXIgzuWgNBs)orQpny2tc zPv6p&YtyKuL@L_jF0-kTIP(oZC2NYfKzTZRIzOAF@!hJcT~N|Wh&K!Uv3F`cXQyZe zp`@c9YuVMI;qPoQR}*Fnl;}Ug*f(dX-^$milZr+R{B0HDvD62%AfHe^9=_phq+#f= z2hSN>c3q*PkFFk!$a*(zb|ANq_k^@%2MKBZGLn_`s8wtw$(@OMG*6HzI!=|bl-cO) 
zmz6}-%N=Ggm#g~GvrTom0sQNobtWNE^X{=N@0oba4%fOzCS&zR-x|BdVNMO!F071e z>Uj*bKDxyr{dzX>K|v1AFeAiR&GJnDN>5kWx2|SVAIzrv1w=p6ERSZ~j_!n0zPQFT zTM!Qu`*?(nf~kxY_xR*w>}0|wqC$CUX01*WZOvWcH=diDgEd22^OU@2(rceHG%~9l z3@YPod7N4vFt*QiC^#n*s#_0@9-6ybgw#EHFNQbmNXl8nL;qY&Io=Xokv0^ZURJL~ zyEH$Zm-BQk`;1{u_ok}+Qmt$9HMM=Bd^ba?82O*X+o9`LlE+v+jIDN~1y8Sz)s8kI zFjmjsZI*M@Pe!%&Esd_)UnNQ}oi$zJH}<$a8tj%;+oPjCASkp19o8xd4$W{5GwR|C zNUjvy@IQTs-Q?jBil|%x&w9UmcFW%xSar_}q=QyGz4kkfu_c7+4Va%Ie!^dw$hEt^ zs(J5WZS7$qSW#Mk?Bz@%(ctzgw``rj@#?B~Qe{i#(;J^AD@5j%y4%0E^%h^tSg8Fn zu_RVeHL)SLj*iWlp8ks&>@N|aWvRoS)ITECrC?ISsL1;MVqKy`U3YOn3}hq_$&f)K>p{(<(Zw#xZ5!C_|aj8g=iL(OJSBF^+2Z@1P|NlKg2v&un_V@nbga~Et& zjdR>Os{MV$hoztwWJ^}fv+ga9OF5D}&1UlYRM^zfS;3?OLnNE`Yq&MriQzb={^<`mcvibcz3)xzw4ELa`(dC^GUX#jaQqZ9z1ZY?MtqUBT2L? z>-zk6hGB(Hj8Wp|h&lc*w?+urNygVV`C{F^4;y@ML$$pQHy#l<7M)w@s<5fQOy69U zF<5@5q8z5s6z+SXUo<`>NJoHgecx+)2z~Zz^M)c0*1P9$aryC`HAGuBxsdO-c>{D4 z1{b5d4HBvF$5Q7ZN`;ot=$BfDk9O&$SGV+)J{+>q7t{*sM!twl zJ;~us+8f*meRbEt=)Pp8=n(_M_~=$rYfPwW@#%aHPyom*1Jr!SlADKg=}At9siU!_ znWHi0nxbTQY)zh1tma#aDx>A3*$W3n5@%}_a;=&3*~rOG|wy+A@veC>#Nvl!^N zjP!-9@WoJ&Ro-Z$$o`qMi&r-IZju57O?Fl&B2LHq_2Pbuhg?T`0XE{Y3uRZ^Sg! zNZz;N_UFnZq-+fdk^FBHzN#%UsoR$n8=IUkc6cxom(C&h*O}v&e2+#rT>@7I@C0gE z)Dy?v4n|f*l%IZv^04^!AdQ zVMivcz9x3JA*NqDD^)G6hzxTZX*&xRd&;d$^-mczY!;grici&NWZb*gZ&~Htk@ZAq zbzrcszCkz{RRTJIn$K)CS#qD($xL6mhk5Sid)!0f@cS#t?(3ZJ`;}?5LDkW)mTbl6 z@{FPL)nnm&W`kE3k?XXU9hKiQBt^pC&hZ;OkUVwO{LW00sYmwALnn_6@4{j)KegmT zDcmvbcSdTmxv$k<>Lih#KR*9qW#cWpd5rt>iNlE_@naz6rNk1Lqe+d=}njLLC$gMbFG)|X9==K>U|5#yzPq^Giw##E!zyU z{=!K=ZI>J84|`?x>;iqzuarClKfcJ8qf(2Vt4|BkzAF)zYjid=XSUey*s=OD ztGu%zbt~uw@pm4A7Ao&HU)nWD6)JqSL9Kq!cx)TfHc0cRVYAWVtWK~Y4$)_H^=nR~ zO|P+s;fX^Hftw?jlEY7!kSb%r@s0OG*7YCxon6wC@W-54;L_SJ9UPbERbbT_|1k6X zVv<7V#Sdk9>GMK2KV?@hHE(`8|JgNdrZD>D_ac9D!^Lxo`T^HYebtWlt+=Q>|8%yN zV&gWkQNrn6SmQ}Y)gGFD9f@mwHzp|89DCg-uQ0cZJC#rwc+R9qEi(}{6hD`GJQbXs zj$2;VPFr-Q7Jk&9DQmi>Ugq|Qx=N*XA90R(1UidzTq~ohid@d5-|X@WXB=Q>wlZ3? 
zXK6@GzMo|CYK(|nnB(a>MNDCO*?97T>RM5a25*_f%xBv0#k3k)rj1i9wY*^k&bGDY zGuK4JRJkIMg0smr8Yiy_&OWFKKgp=Vb;rGyNL4^Nd0oDJ!`C>rX2YAeYs0srFMnzk zbFSNf`RnV-)bG_4i>hmK)SqIOZ^&oYm@wmFK2J|}6iK8`o~oWg3p-q)cy8+M>w7Jo zT7tJybMGT<&hZnZ^hs1++b{{QpN}PVL5p27jj0R=<%4UA z@{+oWnDdg{74I3DN6SB48q4CFv>|5kN$LfV?gC z3nW`K2YG>OZx$*swN)k)JDI~DjnsbAJmfF?xrvS%6T=D1W@x&sddP70eoe?2qeDOq zfjUd&{7_sDvksp`@hK`Wsx3?1?Na1^7GCbL7k-Ri)FksWY?+oo(;$+Df~I+5jhqF# zOU^OmC`(SLE9Sj*0)nrDlh}+1iF_07?v!ET^^p1mo~uh)%;j-`aTryne#i{yUBG+l zeB-PdDt*fglM)8b4+Y9q4@D5`n17RmuXpo<4%S+a@3`M z%cwg8qI>WEB{)w`N1KNWMBB_;*Cr_vun8Cau0}mm^`v8SkEOb~`|eh3t8*f^STGn& z>{s7K?#lm7-vvRServNP8)WDxKnzqbZ?sC zKmF|N3)K`XyTbtax2-<(kH!+MDTS*6G4!smau-$;)xMX=M_g#~-}WR`GkZGO z-{&A~Pvd|^w)Ur*$-9<5tj7|R?&D3PBQkWJy7NG2BzWvDj1@PH5{v`%*WdNAx87g+ z7GFotu&+?QFXuc9!#>!$CN}<(z}>OhZt~Z{mbKFT^)hPR>UD6xFCsVu=Lt2^K;_kG z$(+$^RF79~ZZ+<|xAoASWR2_(CPoMX2E;<52p9+=h7tv+5*Uok|NIl30)xTnM08xu z30oYMBF@EOo3GlYiFUU3@CdcAy*vzZORhntmrgQ&yD?WR8n3XlUTQ(@sUQ$(RlAUn zvb_9eWK1<%kkfqZ@n08&JfunjtoP?QX%Se(-3E9JRO2(KIhehO-|&8QjdwWsq$$J; zkr%_M-r6Pypw<`+4^iq;U1*ndU+DF;En0AWKS6V}*s$p6+>8B+JXt*Mu05UHxqB%O zP+p-ReJZ#e6=n=o8wHh?W@@$e?Z=h~Zz6)!G`?&rRF8m`kHZfL(MAn{$f&#@@jT_IO5^zqG*VBS*f zH$JC)DobZGR^!ssxBzsPhZ$>kx-0*afKatGxeKcuv2VtE^#an&eadJ@Se{34C|qfw zp*j|C)-mov-td$K=Hy$7X8Gv!b0umipCntNX1DT!!leZJ{NV+&B6?t`x69}@!5CR1Q zuzm7ayoHO669MM}g8ZZ*f0YUF4q1iFYeMOM(wjd_H2^_=kbJ<<+r$!?wfq-Ofkent z0xw}43FN`>0JjMFL52d9qJks90+QVm*>+Dzne5Ha!X=vm1}Yb)=V-G<7jA7bzqrON z<93)tc4*rq8SrK56cEBKf_Ix5gdpLF-_EQ8y=k`Y;xlS@OL+0%$O-G(dQp=O5jXx4N*QnKG))=5%8`2NRpU3(n#QBP#}1|J zeR=D{O#xZ?z_P*VG;V%hTsOJnb39X3)G&#pRKuisOhB(ktxYRLr%Fyr z{{gGRH?4v6FU>*TIbk-YLmPHwmAPo=#9N^vPkr{&ydE#fl33KQs$F~6bL__1GhXmJV%kL zd*#R6g2;X0?a61sEfp{}$kL?pQK`p;eO-b_SMw&m^w>{N_}AQZMZ}g|m~WUAPkJXGwp95M zFK+Pm{_`=G!9k@+i4?nDZIO9-2i)JnJk*C-9-RXa9MXgz*_CGg{wylsKAi+0tp+HXIb2;v8_7Hz$jurPTEf*E7x*j9T%g zwwf2Iped?YEa&1OJ30IB3^B8XQZ$rY0`X$D0;Qk3uh_Y$DLfjNXcik( 
zUL4L^`@R=$?26drTbJk^?>%{YczUS6mjPdyUzL#f_v-$UKBIWY84N$zD{vO<8To*RLwXujPv<&tVHe#l!ib9MiPfZD;o#CVj9<_?@` zo5!v7F-%oYE0yoBryXAZHiVt#d%pIRHRjai7rFr@cke3m@?#%(c;1{?;q1Lj$TGaS zNpYci{c9lv;+l2+!Q)B&=0S~f$Gc;vyKjx8)s$Kkwf7AZkrlhorLA>M-Y-!ATKKuG zLw9cL|Aa6gqJQAlmk>$-n7&F~(3r&RPyIi|s1y{A6x)I-cXXZZDxU4QlAc89oTQE-#l&d_Eer>@> zvwHK2+R4fNVKO$B3Q|F~3Aq7vS-JZnS89DslhBv+uQnPb@~`qXv*78pNFu%Xi%QF@ zY5j6;LkoW;lsP8v4iAic2ef^FZl8rHF)&C728IKtHc&7C7X2T=CvdR-FTkgemKw7h z4^6ihT^PtOJUJVdBy2Wa>bvp2;b~Sq^WiYfI+~BHv~v2Hw+F$uh~kiC!}w*J7MA_y3rt{cPWnUUlpT zi`=XYQIr@^c6eRPtD43tsNxBL){vxfp_(*iJ?&kG!XX_MJEUO4?iQ@A2uW zzD7W!qCuwbyU}=l1IO3j%q6Vd9)E3}dl7n(y1$|9UMHjd!8!Zdn}S&!O^hMspr2)RPQFfbCd)c}(# z5h}E!{C~s+(2++#6+m0t)J~(wWxAbBIh-p1`yn6^H5<#VUW43G0~NF#vL8R;_}e1b zj*k0psk;mIG98%Q7l40#KnOS-1_xaR{gROzEAj{AxGh5>$l?A&CJMZ@OC|=?=bw5g z0BHXqgOk~%KV_nT&A-dQV8pIDz%Uq)tv~b-s9p1dp+MYs>Fu@;4o3ii{(U^ayxlTU zasmCWCkg@T?hhGQjNDZIt_K%G?$Q$#-EE&JM0EEYVgN7l$9PBxVpj~r#E`q@MS_8~ z_Q$&j6!`}gf7plI9cQE{Pyv7Fi9vu4?GG6OiQ26vO73WXp96_N?x-Px3kK*oUABIH vw8aW+6)dnqv~f6coo}sF9UD(9xhDh)7*D{s5Vpz}DF#N-@$xEYDAN5ui7Nu< literal 0 HcmV?d00001 diff --git a/lib/matplotlib/tests/test_text.py b/lib/matplotlib/tests/test_text.py index ab4a2f07df64..9b7d1ce047bb 100644 --- a/lib/matplotlib/tests/test_text.py +++ b/lib/matplotlib/tests/test_text.py @@ -748,3 +748,11 @@ def test_pdf_font42_kerning(): plt.rcParams['pdf.fonttype'] = 42 plt.figure() plt.figtext(0.1, 0.5, "ATAVATAVATAVATAVATA", size=30) + + +@image_comparison(['text_pdf_chars_beyond_bmp.pdf'], style='mpl20') +def test_pdf_chars_beyond_bmp(): + plt.rcParams['pdf.fonttype'] = 42 + plt.rcParams['mathtext.fontset'] = 'stixsans' + plt.figure() + plt.figtext(0.1, 0.5, "Mass $m$ \U00010308", size=30) From a4067a0b2ff710b7eaaeba857cc56be33dbe6c64 Mon Sep 17 00:00:00 2001 From: Frank Sauerburger Date: Thu, 15 Jul 2021 09:18:35 +0200 Subject: [PATCH 2/2] Emit Type 42 chars beyond BMP as XObjects Currently, the CID maps only support 2-byte fixed-width characters. 
Unicode points beyond the Basic Multilingual Plane cannot be used. This commit follows the strategy taken for Type 3 fonts. Any char with a code point > 65535 is emitted as an XObject. --- lib/matplotlib/backends/backend_pdf.py | 81 +++++++++++++++++++------- 1 file changed, 59 insertions(+), 22 deletions(-) diff --git a/lib/matplotlib/backends/backend_pdf.py b/lib/matplotlib/backends/backend_pdf.py index d4cde3155af4..48c01570719c 100644 --- a/lib/matplotlib/backends/backend_pdf.py +++ b/lib/matplotlib/backends/backend_pdf.py @@ -321,6 +321,21 @@ def pdfRepr(obj): .format(type(obj))) +def _font_supports_char(fonttype, char): + """ + Returns True if the font is able to provide *char* in a PDF. + + For a Type 3 font, this method returns True only for single-byte + chars. For Type 42 fonts this method returns True if the char is from + the Basic Multilingual Plane. + """ + if fonttype == 3: + return ord(char) <= 255 + if fonttype == 42: + return ord(char) <= 65535 + raise NotImplementedError() + + class Reference: """ PDF reference object.
@@ -1268,6 +1283,11 @@ def embedTTFType42(font, characters, descriptor): unicode_bfrange = [] for start, end in unicode_groups: + # Ensure the CID map contains only chars from BMP + if start > 65535: + continue + end = min(65535, end) + unicode_bfrange.append( b"<%04x> <%04x> [%s]" % (start, end, @@ -1275,6 +1295,36 @@ def embedTTFType42(font, characters, descriptor): unicode_cmap = (self._identityToUnicodeCMap % (len(unicode_groups), b"\n".join(unicode_bfrange))) + # Add XObjects for unsupported chars + glyph_ids = [] + for ccode in characters: + if not _font_supports_char(fonttype, chr(ccode)): + gind = font.get_char_index(ccode) + glyph_ids.append(gind) + + bbox = [cvt(x, nearest=False) for x in font.bbox] + rawcharprocs = _get_pdf_charprocs(filename, glyph_ids) + for charname in sorted(rawcharprocs): + stream = rawcharprocs[charname] + charprocDict = {'Length': len(stream)} + charprocDict['Type'] = Name('XObject') + charprocDict['Subtype'] = Name('Form') + charprocDict['BBox'] = bbox + # Each glyph includes bounding box information, + # but xpdf and ghostscript can't handle it in a + # Form XObject (they segfault!!!), so we remove it + # from the stream here. It's not needed anyway, + # since the Form XObject includes it in its BBox + # value. 
+ stream = stream[stream.find(b"d1") + 2:] + charprocObject = self.reserveObject('charProc') + self.beginStream(charprocObject.id, None, charprocDict) + self.currentstream.write(stream) + self.endStream() + + name = self._get_xobject_symbol_name(filename, charname) + self.multi_byte_charprocs[name] = charprocObject + # CIDToGIDMap stream cid_to_gid_map = "".join(cid_to_gid_map).encode("utf-16be") self.beginStream(cidToGidMapObject.id, @@ -2106,16 +2156,17 @@ def draw_mathtext(self, gc, x, y, s, prop, angle): self.check_gc(gc, gc._rgb) prev_font = None, None oldx, oldy = 0, 0 - type3_multibytes = [] + unsupported_chars = [] self.file.output(Op.begin_text) for font, fontsize, num, ox, oy in glyphs: - self.file._character_tracker.track(font, chr(num)) + char = chr(num) + self.file._character_tracker.track(font, char) fontname = font.fname - if fonttype == 3 and num > 255: - # For Type3 fonts, multibyte characters must be emitted - # separately (below). - type3_multibytes.append((font, fontsize, ox, oy, num)) + if not _font_supports_char(fonttype, char): + # Unsupported chars (i.e. multibyte in Type 3 or beyond BMP in + # Type 42) must be emitted separately (below). + unsupported_chars.append((font, fontsize, ox, oy, num)) else: self._setup_textpos(ox, oy, 0, oldx, oldy) oldx, oldy = ox, oy @@ -2127,7 +2178,7 @@ def draw_mathtext(self, gc, x, y, s, prop, angle): Op.show) self.file.output(Op.end_text) - for font, fontsize, ox, oy, num in type3_multibytes: + for font, fontsize, ox, oy, num in unsupported_chars: self._draw_xobject_glyph( font, fontsize, font.get_char_index(num), ox, oy) @@ -2236,20 +2287,6 @@ def encode_string(self, s, fonttype): return s.encode('cp1252', 'replace') return s.encode('utf-16be', 'replace') - @staticmethod - def _font_supports_char(fonttype, char): - """ - Returns True if the font is able to provided the char in a PDF - - For a Type 3 font, this method returns True only for single-byte - chars. 
For Type 42 fonts this method always returns True. - """ - if fonttype == 3: - return ord(char) <= 255 - if fonttype == 42: - return True - raise NotImplementedError() - def draw_text(self, gc, x, y, s, prop, angle, ismath=False, mtext=None): # docstring inherited @@ -2313,7 +2350,7 @@ def draw_text(self, gc, x, y, s, prop, angle, ismath=False, mtext=None): prev_was_multibyte = True for item in _text_helpers.layout( s, font, kern_mode=KERNING_UNFITTED): - if self._font_supports_char(fonttype, item.char): + if _font_supports_char(fonttype, item.char): if prev_was_multibyte: singlebyte_chunks.append((item.x, [])) if item.prev_kern: