fixed /Encoding in the output of ConvertType1FontsToType1C; this fixes #50

Peter Szabo · Peter Szabo · commit cbb88d495ae9 · 2017-10-04T09:14:02.000+02:00
diff --git a/lib/pdfsizeopt/main.py b/lib/pdfsizeopt/main.py
@@ -5370,8 +5370,34 @@ def GetFonts(self, font_type=None,
   @classmethod
   def GenerateType1CFontsFromType1(cls, objs, ref_objs, ps_tmp_file_name,
                                    pdf_tmp_file_name):
+    """Converts objs with Type1 font programs to Type1C font programs.
+
+    Uses Ghostscript to do the conversion.
+
+    Removes the temporary files it creates.
+
+    As a side effect, the /Encoding field in the returned Type1C font
+    programs is useless (it's not the same as in the input Type1 font
+    programs). This is a limitation of Ghostscript. As a workaround, the
+    correct /Encoding values are returned in `encodings'.
+
+    Fonts with more than 256 glyphs in their /CharStrings are not converted.
+    This is a limitation of Ghostscript.
+
+    Args:
+      objs: dict mapping obj numbers to PdfObj objects containing a Type1
+        font program.
+      ref_objs: dict containing objs to be used when following references.
+      ps_tmp_file_name: Temporary .ps filename to create. Will get removed
+        unless an exception is raised.
+      pdf_tmp_file_name: Temporary .pdf filename to create. Will get removed
+        unless an exception is raised.
+    Returns:
+      (type1c_objs, encodings), where keys in both type1c_objs and encodings
+      are the keys in objs (obj numbers).
+    """
     if not objs:
-      return {}
+      return {}, {}
     output = ['%!PS-Adobe-3.0\n',
               '% Ghostscript helper for converting Type1 fonts to Type1C\n',
               '%% autogenerated by %s at %s\n' % ('pdfsizeopt', time.time())]
@@ -5425,16 +5451,42 @@ def GenerateType1CFontsFromType1(cls, objs, ref_objs, ps_tmp_file_name,
         'info: executing Type1CConverter with Ghostscript: %s' % gs_cmd)
     sys.stdout.flush()
     p = os.popen(gs_cmd, 'rb')
+    encoding_prefix = 'obj encoding '
     skip_prefix = 'skipping big-CharStrings font obj '
     big_charstrings_obj_nums = set()
+    encodings = {}
     try:
       for line in iter(p.readline, ''):
         if line.startswith(skip_prefix):
           obj_num = int(line[len(skip_prefix):])
           big_charstrings_obj_nums.add(obj_num)
+        elif line.startswith(encoding_prefix):
+          obj_num, data = line[len(encoding_prefix):].split(' ', 1)
+          obj_num = int(obj_num)
+          data = data.strip().replace('#', '#23')
+          ## This escapes eg. * to #2A.
+          data = PdfObj.PDF_HEXTOKENS_SAFE_HEX_ESCAPE_RE.sub(
+              lambda match: '#%02X' % ord(match.group()), data)
+          encoding = PdfObj.ParseArray(data)
+          for i in xrange(len(encoding)):
+            char_name = encoding[i]
+            if char_name is None:
+              encoding[i] = '/.notdef'
+            else:
+              char_name = str(char_name)
+              assert char_name.startswith('/'), [char_name]
+              encoding[i] = str(char_name)
+          encoding.extend('/.notdef' for i in xrange(len(encoding), 256))
+          if len(encoding) > 256:
+            raise ValueError('Encoding for obj %d too long.' % obj_num)
+          encodings[obj_num] = encoding
         else:
           sys.stdout.write(line)
     finally:
+      try:
+        p.read()
+      except IOError:
+        pass
       status = p.close()
     sys.stdout.flush()
     if status:
@@ -5468,16 +5520,18 @@ def GenerateType1CFontsFromType1(cls, objs, ref_objs, ps_tmp_file_name,
     for obj_num in type1c_objs:
       # TODO(pts): Also cross-check /FontFile3 with pdf.GetFonts.
       if type1c_objs[obj_num].Get('Subtype') != '/Type1C':
-        raise ValueError('Could not convert font %s to Type1C.' % obj_num)
+        raise ValueError('Could not convert font obj %d to Type1C.' % obj_num)
       type1c_size += type1c_objs[obj_num].size
+      if obj_num not in encodings:
+        raise ValueError('Missing encoding for font obj %d.' % obj_num)
     # TODO(pts): Don't remove if command-line flag.
     os.remove(pdf_tmp_file_name)
     # TODO(pts): Undo if no reduction in size.
     print >>sys.stderr, (
         'info: optimized total Type1 font size %s to Type1C font size %s '
         '(%s)' %
         (type1_size, type1c_size, FormatPercent(type1c_size, type1_size)))
-    return type1c_objs
+    return type1c_objs, encodings
 
 
   @classmethod
@@ -5621,11 +5675,11 @@ def MoveToPrivate(parsed_font, key):
   def ConvertType1FontsToType1C(self):
     """Convert all Type1 fonts to Type1C in self, returns self."""
     # GenerateType1CFontsFromType1 removes the tmp files it creates.
-    type1c_objs = self.GenerateType1CFontsFromType1(
+    type1c_objs, encodings = self.GenerateType1CFontsFromType1(
         self.GetFonts('Type1'), self.objs,
         TMP_PREFIX + 'conv.tmp.ps', TMP_PREFIX + 'conv.tmp.pdf')
     for obj_num in type1c_objs:
-      obj = self.objs[obj_num]
+      obj = self.objs[obj_num]  # obj.get('Type') == 'FontDescriptor'.
       assert str(obj.Get('FontName')).startswith('/')
       type1c_obj = type1c_objs[obj_num]
       type1c_obj.FixFontNameInType1C(objs=self.objs)
@@ -5646,10 +5700,33 @@ def ConvertType1FontsToType1C(self):
              FormatPercent(new_size, old_size)))
       else:
         # TODO(pts): How to optimize/unify these?
+        # TODO(pts): Don't keep, prevents further optimizations.
         print >>sys.stderr, (
             'info: keeping original Type1 font XObject %s,%s, '
             'replacement too large: old size=%s, new size=%s' %
             (obj_num, font_file_obj_num, old_size, new_size))
+        encodings.pop(obj_num, None)
+
+    # Update encodings.
+    if encodings:
+      for obj_num in sorted(self.objs):
+        obj = self.objs[obj_num]
+        head = obj.head
+        if (head.startswith('<<') and
+            '/Font' in head and '/Type' in head and
+            '/Type1' in head and '/Subtype' in head and
+            '/FontDescriptor' in head and
+            obj.Get('Type') == '/Font' and
+            obj.Get('Subtype') == '/Type1'):
+          match = obj.PDF_REF_AT_EOS_RE.match(str(obj.Get('FontDescriptor')))
+          if match:
+            fd_obj_num = int(match.group(1))  # /Type/FontDescriptor.
+            if (fd_obj_num in encodings and
+                self.IsFontBuiltInEncodingUsed(
+                    obj.ResolveReferences(obj.Get('Encoding'),
+                    objs=self.objs)[0])):
+              obj.Set('Encoding', self.FormatEncoding(encodings[fd_obj_num]))
+
     return self
 
   @classmethod
@@ -6070,7 +6147,8 @@ def _ProcessType1CFonts(self, type1c_objs, do_unify_fonts,
     for obj_num in sorted(self.objs):
       obj = self.objs[obj_num]
       head = obj.head
-      if ('/Font' in head and '/Type' in head and
+      if (head.startswith('<<') and
+          '/Font' in head and '/Type' in head and
           '/Type1' in head and '/Subtype' in head and
           '/FontDescriptor' in head and
           obj.Get('Type') == '/Font' and
diff --git a/lib/pdfsizeopt/psproc.py b/lib/pdfsizeopt/psproc.py
@@ -345,6 +345,9 @@
   % As a workaround for `S1' above, we skip a font with too many
   % /CharStrings.
   dup /CharStrings get length 256 lt {
+    (obj encoding ) print _ObjNumber ===only ( ) print
+    dup /Encoding .knownget not {[]} if ===
+
     % Create /Encoding from sorted keys of /CharStrings.
     [1 index /CharStrings get {pop} forall] NameSort
     % Pad it to size 256.