From a01860688d2d1a01c1f808983c15a170aa90f099 Mon Sep 17 00:00:00 2001 From: Elliott Sales de Andrade Date: Tue, 3 Jun 2025 00:32:10 -0400 Subject: [PATCH] Add typing to AFM parser Also, check some expected conditions at parse time instead of somewhere during use of the data. --- lib/matplotlib/_afm.py | 231 ++++++++++++++------------ lib/matplotlib/backends/backend_ps.py | 2 +- lib/matplotlib/tests/test_afm.py | 49 +++--- 3 files changed, 154 insertions(+), 128 deletions(-) diff --git a/lib/matplotlib/_afm.py b/lib/matplotlib/_afm.py index 9094206c2d7c..352d3c42247e 100644 --- a/lib/matplotlib/_afm.py +++ b/lib/matplotlib/_afm.py @@ -27,9 +27,10 @@ being used. """ -from collections import namedtuple +import inspect import logging import re +from typing import BinaryIO, NamedTuple, TypedDict from ._mathtext_data import uni2type1 @@ -37,7 +38,7 @@ _log = logging.getLogger(__name__) -def _to_int(x): +def _to_int(x: bytes | str) -> int: # Some AFM files have floats where we are expecting ints -- there is # probably a better way to handle this (support floats, round rather than # truncate). But I don't know what the best approach is now and this @@ -46,7 +47,7 @@ def _to_int(x): return int(float(x)) -def _to_float(x): +def _to_float(x: bytes | str) -> float: # Some AFM files use "," instead of "." as decimal separator -- this # shouldn't be ambiguous (unless someone is wicked enough to use "," as # thousands separator...). @@ -57,27 +58,56 @@ def _to_float(x): return float(x.replace(',', '.')) -def _to_str(x): +def _to_str(x: bytes) -> str: return x.decode('utf8') -def _to_list_of_ints(s): +def _to_list_of_ints(s: bytes) -> list[int]: s = s.replace(b',', b' ') return [_to_int(val) for val in s.split()] -def _to_list_of_floats(s): +def _to_list_of_floats(s: bytes | str) -> list[float]: return [_to_float(val) for val in s.split()] -def _to_bool(s): +def _to_bool(s: bytes) -> bool: if s.lower().strip() in (b'false', b'0', b'no'): return False else: return True -def _parse_header(fh): +class FontMetricsHeader(TypedDict, total=False): + StartFontMetrics: float + FontName: str + FullName: str + FamilyName: str + Weight: str + ItalicAngle: float + IsFixedPitch: bool + FontBBox: list[int] + UnderlinePosition: float + UnderlineThickness: float + Version: str + # Some AFM files have non-ASCII characters (which are not allowed by the spec). + # Given that there is actually no public API to even access this field, just return + # it as straight bytes. + Notice: bytes + EncodingScheme: str + CapHeight: float # Is the second version a mistake, or + Capheight: float # do some AFM files contain 'Capheight'? -JKS + XHeight: float + Ascender: float + Descender: float + StdHW: float + StdVW: float + StartCharMetrics: int + CharacterSet: str + Characters: int + + +def _parse_header(fh: BinaryIO) -> FontMetricsHeader: """ Read the font metrics header (up to the char metrics). @@ -98,34 +128,15 @@ def _parse_header(fh): * '-168 -218 1000 898' -> [-168, -218, 1000, 898] """ header_converters = { - b'StartFontMetrics': _to_float, - b'FontName': _to_str, - b'FullName': _to_str, - b'FamilyName': _to_str, - b'Weight': _to_str, - b'ItalicAngle': _to_float, - b'IsFixedPitch': _to_bool, - b'FontBBox': _to_list_of_ints, - b'UnderlinePosition': _to_float, - b'UnderlineThickness': _to_float, - b'Version': _to_str, - # Some AFM files have non-ASCII characters (which are not allowed by - # the spec). Given that there is actually no public API to even access - # this field, just return it as straight bytes. - b'Notice': lambda x: x, - b'EncodingScheme': _to_str, - b'CapHeight': _to_float, # Is the second version a mistake, or - b'Capheight': _to_float, # do some AFM files contain 'Capheight'? -JKS - b'XHeight': _to_float, - b'Ascender': _to_float, - b'Descender': _to_float, - b'StdHW': _to_float, - b'StdVW': _to_float, - b'StartCharMetrics': _to_int, - b'CharacterSet': _to_str, - b'Characters': _to_int, + bool: _to_bool, + bytes: lambda x: x, + float: _to_float, + int: _to_int, + list[int]: _to_list_of_ints, + str: _to_str, } - d = {} + header_value_types = inspect.get_annotations(FontMetricsHeader) + d: FontMetricsHeader = {} first_line = True for line in fh: line = line.rstrip() @@ -147,14 +158,16 @@ def _parse_header(fh): else: val = b'' try: - converter = header_converters[key] - except KeyError: + key_str = _to_str(key) + value_type = header_value_types[key_str] + except (KeyError, UnicodeDecodeError): _log.error("Found an unknown keyword in AFM header (was %r)", key) continue try: - d[key] = converter(val) + converter = header_converters[value_type] + d[key_str] = converter(val) # type: ignore[literal-required] except ValueError: - _log.error('Value error parsing header in AFM: %s, %s', key, val) + _log.error('Value error parsing header in AFM: %r, %r', key, val) continue if key == b'StartCharMetrics': break @@ -163,8 +176,8 @@ def _parse_header(fh): return d -CharMetrics = namedtuple('CharMetrics', 'width, name, bbox') -CharMetrics.__doc__ = """ +class CharMetrics(NamedTuple): + """ Represents the character metrics of a single character. Notes @@ -172,13 +185,20 @@ def _parse_header(fh): The fields do currently only describe a subset of character metrics information defined in the AFM standard. """ + + width: float + name: str + bbox: tuple[int, int, int, int] + + CharMetrics.width.__doc__ = """The character width (WX).""" CharMetrics.name.__doc__ = """The character name (N).""" CharMetrics.bbox.__doc__ = """ The bbox of the character (B) as a tuple (*llx*, *lly*, *urx*, *ury*).""" -def _parse_char_metrics(fh): +def _parse_char_metrics(fh: BinaryIO) -> tuple[dict[int, CharMetrics], + dict[str, CharMetrics]]: """ Parse the given filehandle for character metrics information. @@ -198,12 +218,12 @@ def _parse_char_metrics(fh): """ required_keys = {'C', 'WX', 'N', 'B'} - ascii_d = {} - name_d = {} - for line in fh: + ascii_d: dict[int, CharMetrics] = {} + name_d: dict[str, CharMetrics] = {} + for bline in fh: # We are defensively letting values be utf8. The spec requires # ascii, but there are non-compliant fonts in circulation - line = _to_str(line.rstrip()) # Convert from byte-literal + line = _to_str(bline.rstrip()) if line.startswith('EndCharMetrics'): return ascii_d, name_d # Split the metric line into a dictionary, keyed by metric identifiers @@ -214,8 +234,9 @@ def _parse_char_metrics(fh): num = _to_int(vals['C']) wx = _to_float(vals['WX']) name = vals['N'] - bbox = _to_list_of_floats(vals['B']) - bbox = list(map(int, bbox)) + bbox = tuple(map(int, _to_list_of_floats(vals['B']))) + if len(bbox) != 4: + raise RuntimeError(f'Bad parse: bbox has {len(bbox)} elements, should be 4') metrics = CharMetrics(wx, name, bbox) # Workaround: If the character name is 'Euro', give it the # corresponding character code, according to WinAnsiEncoding (see PDF @@ -230,7 +251,7 @@ def _parse_char_metrics(fh): raise RuntimeError('Bad parse') -def _parse_kern_pairs(fh): +def _parse_kern_pairs(fh: BinaryIO) -> dict[tuple[str, str], float]: """ Return a kern pairs dictionary. @@ -242,12 +263,11 @@ def _parse_kern_pairs(fh): d['A', 'y'] = -50 """ - line = next(fh) if not line.startswith(b'StartKernPairs'): - raise RuntimeError('Bad start of kern pairs data: %s' % line) + raise RuntimeError(f'Bad start of kern pairs data: {line!r}') - d = {} + d: dict[tuple[str, str], float] = {} for line in fh: line = line.rstrip() if not line: @@ -257,21 +277,26 @@ def _parse_kern_pairs(fh): return d vals = line.split() if len(vals) != 4 or vals[0] != b'KPX': - raise RuntimeError('Bad kern pairs line: %s' % line) + raise RuntimeError(f'Bad kern pairs line: {line!r}') c1, c2, val = _to_str(vals[1]), _to_str(vals[2]), _to_float(vals[3]) d[(c1, c2)] = val raise RuntimeError('Bad kern pairs parse') -CompositePart = namedtuple('CompositePart', 'name, dx, dy') -CompositePart.__doc__ = """ - Represents the information on a composite element of a composite char.""" +class CompositePart(NamedTuple): + """Represents the information on a composite element of a composite char.""" + + name: bytes + dx: float + dy: float + + CompositePart.name.__doc__ = """Name of the part, e.g. 'acute'.""" CompositePart.dx.__doc__ = """x-displacement of the part from the origin.""" CompositePart.dy.__doc__ = """y-displacement of the part from the origin.""" -def _parse_composites(fh): +def _parse_composites(fh: BinaryIO) -> dict[bytes, list[CompositePart]]: """ Parse the given filehandle for composites information. @@ -292,11 +317,11 @@ def _parse_composites(fh): will be represented as:: - composites['Aacute'] = [CompositePart(name='A', dx=0, dy=0), - CompositePart(name='acute', dx=160, dy=170)] + composites[b'Aacute'] = [CompositePart(name=b'A', dx=0, dy=0), + CompositePart(name=b'acute', dx=160, dy=170)] """ - composites = {} + composites: dict[bytes, list[CompositePart]] = {} for line in fh: line = line.rstrip() if not line: @@ -306,6 +331,9 @@ def _parse_composites(fh): vals = line.split(b';') cc = vals[0].split() name, _num_parts = cc[1], _to_int(cc[2]) + if len(vals) != _num_parts + 2: # First element is 'CC', last is empty. + raise RuntimeError(f'Bad composites parse: expected {_num_parts} parts, ' + f'but got {len(vals) - 2}') pccParts = [] for s in vals[1:-1]: pcc = s.split() @@ -316,7 +344,8 @@ def _parse_composites(fh): raise RuntimeError('Bad composites parse') -def _parse_optional(fh): +def _parse_optional(fh: BinaryIO) -> tuple[dict[tuple[str, str], float], + dict[bytes, list[CompositePart]]]: """ Parse the optional fields for kern pair data and composites. @@ -329,44 +358,38 @@ def _parse_optional(fh): A dict containing composite information. May be empty. See `._parse_composites`. """ - optional = { - b'StartKernData': _parse_kern_pairs, - b'StartComposites': _parse_composites, - } - - d = {b'StartKernData': {}, - b'StartComposites': {}} + kern_data: dict[tuple[str, str], float] = {} + composites: dict[bytes, list[CompositePart]] = {} for line in fh: line = line.rstrip() if not line: continue - key = line.split()[0] - - if key in optional: - d[key] = optional[key](fh) + match line.split()[0]: + case b'StartKernData': + kern_data = _parse_kern_pairs(fh) + case b'StartComposites': + composites = _parse_composites(fh) - return d[b'StartKernData'], d[b'StartComposites'] + return kern_data, composites class AFM: - def __init__(self, fh): + def __init__(self, fh: BinaryIO): """Parse the AFM file in file object *fh*.""" self._header = _parse_header(fh) self._metrics, self._metrics_by_name = _parse_char_metrics(fh) self._kern, self._composite = _parse_optional(fh) - def get_str_bbox_and_descent(self, s): + def get_str_bbox_and_descent(self, s: str) -> tuple[int, int, float, int, int]: """Return the string bounding box and the maximal descent.""" if not len(s): return 0, 0, 0, 0, 0 - total_width = 0 - namelast = None - miny = 1e9 + total_width = 0.0 + namelast = '' + miny = 1_000_000_000 maxy = 0 left = 0 - if not isinstance(s, str): - s = _to_str(s) for c in s: if c == '\n': continue @@ -386,11 +409,11 @@ def get_str_bbox_and_descent(self, s): return left, miny, total_width, maxy - miny, -miny - def get_glyph_name(self, glyph_ind): # For consistency with FT2Font. + def get_glyph_name(self, glyph_ind: int) -> str: # For consistency with FT2Font. """Get the name of the glyph, i.e., ord(';') is 'semicolon'.""" return self._metrics[glyph_ind].name - def get_char_index(self, c): # For consistency with FT2Font. + def get_char_index(self, c: int) -> int: # For consistency with FT2Font. """ Return the glyph index corresponding to a character code point. @@ -398,38 +421,38 @@ def get_char_index(self, c): # For consistency with FT2Font. """ return c - def get_width_char(self, c): + def get_width_char(self, c: int) -> float: """Get the width of the character code from the character metric WX field.""" return self._metrics[c].width - def get_width_from_char_name(self, name): + def get_width_from_char_name(self, name: str) -> float: """Get the width of the character from a type1 character name.""" return self._metrics_by_name[name].width - def get_kern_dist_from_name(self, name1, name2): + def get_kern_dist_from_name(self, name1: str, name2: str) -> float: """ Return the kerning pair distance (possibly 0) for chars *name1* and *name2*. """ return self._kern.get((name1, name2), 0) - def get_fontname(self): + def get_fontname(self) -> str: """Return the font name, e.g., 'Times-Roman'.""" - return self._header[b'FontName'] + return self._header['FontName'] @property - def postscript_name(self): # For consistency with FT2Font. + def postscript_name(self) -> str: # For consistency with FT2Font. return self.get_fontname() - def get_fullname(self): + def get_fullname(self) -> str: """Return the font full name, e.g., 'Times-Roman'.""" - name = self._header.get(b'FullName') + name = self._header.get('FullName') if name is None: # use FontName as a substitute - name = self._header[b'FontName'] + name = self._header['FontName'] return name - def get_familyname(self): + def get_familyname(self) -> str: """Return the font family name, e.g., 'Times'.""" - name = self._header.get(b'FamilyName') + name = self._header.get('FamilyName') if name is not None: return name @@ -440,26 +463,26 @@ def get_familyname(self): return re.sub(extras, '', name) @property - def family_name(self): # For consistency with FT2Font. + def family_name(self) -> str: # For consistency with FT2Font. """The font family name, e.g., 'Times'.""" return self.get_familyname() - def get_weight(self): + def get_weight(self) -> str: """Return the font weight, e.g., 'Bold' or 'Roman'.""" - return self._header[b'Weight'] + return self._header['Weight'] - def get_angle(self): + def get_angle(self) -> float: """Return the fontangle as float.""" - return self._header[b'ItalicAngle'] + return self._header['ItalicAngle'] - def get_capheight(self): + def get_capheight(self) -> float: """Return the cap height as float.""" - return self._header[b'CapHeight'] + return self._header['CapHeight'] - def get_xheight(self): + def get_xheight(self) -> float: """Return the xheight as float.""" - return self._header[b'XHeight'] + return self._header['XHeight'] - def get_underline_thickness(self): + def get_underline_thickness(self) -> float: """Return the underline thickness as float.""" - return self._header[b'UnderlineThickness'] + return self._header['UnderlineThickness'] diff --git a/lib/matplotlib/backends/backend_ps.py b/lib/matplotlib/backends/backend_ps.py index ea5868387918..368564a1518d 100644 --- a/lib/matplotlib/backends/backend_ps.py +++ b/lib/matplotlib/backends/backend_ps.py @@ -779,7 +779,7 @@ def draw_text(self, gc, x, y, s, prop, angle, ismath=False, mtext=None): .decode("ascii")) scale = 0.001 * prop.get_size_in_points() thisx = 0 - last_name = None # kerns returns 0 for None. + last_name = '' # kerns returns 0 for ''. for c in s: name = uni2type1.get(ord(c), f"uni{ord(c):04X}") try: diff --git a/lib/matplotlib/tests/test_afm.py b/lib/matplotlib/tests/test_afm.py index 80cf8ac60feb..bc1d587baf6b 100644 --- a/lib/matplotlib/tests/test_afm.py +++ b/lib/matplotlib/tests/test_afm.py @@ -47,20 +47,20 @@ def test_parse_header(): fh = BytesIO(AFM_TEST_DATA) header = _afm._parse_header(fh) assert header == { - b'StartFontMetrics': 2.0, - b'FontName': 'MyFont-Bold', - b'EncodingScheme': 'FontSpecific', - b'FullName': 'My Font Bold', - b'FamilyName': 'Test Fonts', - b'Weight': 'Bold', - b'ItalicAngle': 0.0, - b'IsFixedPitch': False, - b'UnderlinePosition': -100, - b'UnderlineThickness': 56.789, - b'Version': '001.000', - b'Notice': b'Copyright \xa9 2017 No one.', - b'FontBBox': [0, -321, 1234, 369], - b'StartCharMetrics': 3, + 'StartFontMetrics': 2.0, + 'FontName': 'MyFont-Bold', + 'EncodingScheme': 'FontSpecific', + 'FullName': 'My Font Bold', + 'FamilyName': 'Test Fonts', + 'Weight': 'Bold', + 'ItalicAngle': 0.0, + 'IsFixedPitch': False, + 'UnderlinePosition': -100, + 'UnderlineThickness': 56.789, + 'Version': '001.000', + 'Notice': b'Copyright \xa9 2017 No one.', + 'FontBBox': [0, -321, 1234, 369], + 'StartCharMetrics': 3, } @@ -69,20 +69,23 @@ def test_parse_char_metrics(): _afm._parse_header(fh) # position metrics = _afm._parse_char_metrics(fh) assert metrics == ( - {0: (250.0, 'space', [0, 0, 0, 0]), - 42: (1141.0, 'foo', [40, 60, 800, 360]), - 99: (583.0, 'bar', [40, -10, 543, 210]), - }, - {'space': (250.0, 'space', [0, 0, 0, 0]), - 'foo': (1141.0, 'foo', [40, 60, 800, 360]), - 'bar': (583.0, 'bar', [40, -10, 543, 210]), - }) + { + 0: _afm.CharMetrics(250.0, 'space', (0, 0, 0, 0)), + 42: _afm.CharMetrics(1141.0, 'foo', (40, 60, 800, 360)), + 99: _afm.CharMetrics(583.0, 'bar', (40, -10, 543, 210)), + }, + { + 'space': _afm.CharMetrics(250.0, 'space', (0, 0, 0, 0)), + 'foo': _afm.CharMetrics(1141.0, 'foo', (40, 60, 800, 360)), + 'bar': _afm.CharMetrics(583.0, 'bar', (40, -10, 543, 210)), + } + ) def test_get_familyname_guessed(): fh = BytesIO(AFM_TEST_DATA) font = _afm.AFM(fh) - del font._header[b'FamilyName'] # remove FamilyName, so we have to guess + del font._header['FamilyName'] # remove FamilyName, so we have to guess assert font.get_familyname() == 'My Font'