monolisa-nerdfont-patch/bin/scripts/name_parser/FontnameTools.py

#!/usr/bin/env python
# coding=utf8

import re
import sys


class FontnameTools:
    """Deconstruct a font filename to get standardized name parts"""

    @staticmethod
    def front_upper(word):
        """Capitalize a string (but keep case of subsequent chars)"""
        return word[:1].upper() + word[1:]

    @staticmethod
    def camel_casify(word):
        """Remove blanks and use CamelCase for the new word"""
        return "".join(map(FontnameTools.front_upper, word.split(" ")))

    @staticmethod
    def camel_explode(word):
        """Explode CamelCase -> Camel Case"""
        # But do not explode "JetBrains" etc at string start...
        excludes = [
            "JetBrains",
            "DejaVu",
            "OpenDyslexicAlta",
            "OpenDyslexicMono",
            "OpenDyslexic",
            "DaddyTimeMono",
            "InconsolataGo",
            "ProFontWindows",
            "ProFont",
            "ProggyClean",
        ]
        m = re.match("(" + "|".join(excludes) + ")(.*)", word)
        (prefix, word) = m.group(1, 2) if m != None else ("", word)
        if len(word) == 0:
            return prefix
        parts = re.split("(?<=[a-z0-9])(?=[A-Z])", word)
        if len(prefix):
            parts.insert(0, prefix)
        return " ".join(parts)

    @staticmethod
    def drop_empty(l):
        """Remove empty strings from list of strings"""
        return [x for x in l if len(x) > 0]

    @staticmethod
    def concat(*all_things):
        """Flatten list of (strings or lists of strings) to a blank-separated string"""
        all = []
        for thing in all_things:
            if type(thing) is not list:
                all.append(thing)
            else:
                all += thing
        return " ".join(FontnameTools.drop_empty(all))

    @staticmethod
    def unify_style_names(style_name):
        """Substitude some known token with standard wording"""
        known_names = {
            # Source of the table is the current sourcefonts
            # Left side needs to be lower case
            "-": "",
            "book": "",
            "text": "",
            "ce": "CE",
            #'semibold':     'Demi',
            "ob": "Oblique",
            "it": "Italic",
            "i": "Italic",
            "b": "Bold",
            "normal": "Regular",
            "c": "Condensed",
            "r": "Regular",
            "m": "Medium",
            "l": "Light",
        }
        if style_name in known_names:
            return known_names[style_name.lower()]
        return style_name

    @staticmethod
    def find_in_dicts(key, dicts):
        """Find an entry in a list of dicts, return entry and in which list it was"""
        for i, d in enumerate(dicts):
            if key in d:
                return (d[key], i)
        return (None, 0)

    @staticmethod
    def get_shorten_form_idx(aggressive, prefix, form_if_prefixed):
        """Get the tuple index of known_* data tables"""
        if aggressive:
            return 0
        if len(prefix):
            return form_if_prefixed
        return 1

    @staticmethod
    def shorten_style_name(name, aggressive):
        """Substitude some known styles to short form"""
        # If aggressive is False create the mild short form
        # aggressive == True: Always use first form of everything
        # aggressive == False:
        #               - has no modifier: use the second form
        #               - has modifier: use second form of mod plus first form of weights2
        #               - has modifier: use second form of mod plus second form of widths
        name_rest = name
        name_pre = ""
        form = FontnameTools.get_shorten_form_idx(aggressive, "", 0)
        for mod in FontnameTools.known_modifiers:
            if name.startswith(mod) and len(name) > len(
                mod
            ):  # Second condition specifically for 'Demi'
                name_pre = FontnameTools.known_modifiers[mod][form]
                name_rest = name[len(mod) :]
                break
        subst, i = FontnameTools.find_in_dicts(
            name_rest, [FontnameTools.known_weights2, FontnameTools.known_widths]
        )
        form = FontnameTools.get_shorten_form_idx(aggressive, name_pre, i)
        if isinstance(subst, tuple):
            return name_pre + subst[form]
        if not len(name_pre):
            # The following sets do not allow modifiers
            subst, _ = FontnameTools.find_in_dicts(
                name_rest, [FontnameTools.known_weights1, FontnameTools.known_slopes]
            )
            if isinstance(subst, tuple):
                return subst[form]
        return name

    @staticmethod
    def short_styles(lists, aggressive):
        """Shorten all style names in a list or a list of lists"""
        if not len(lists) or not isinstance(lists[0], list):
            return list(
                map(lambda x: FontnameTools.shorten_style_name(x, aggressive), lists)
            )
        return [
            list(map(lambda x: FontnameTools.shorten_style_name(x, aggressive), styles))
            for styles in lists
        ]

    @staticmethod
    def make_oblique_style(weights, styles):
        """Move "Oblique" from weights to styles for font naming purposes"""
        if "Oblique" in weights:
            weights = list(weights)
            weights.remove("Oblique")
            styles = list(styles)
            styles.append("Oblique")
        return (weights, styles)

    @staticmethod
    def get_name_token(name, tokens, allow_regex_token=False):
        """Try to find any case insensitive token from tokens in the name, return tuple with found token-list and rest"""
        # The default mode (allow_regex_token = False) will try to find any verbatim string in the
        # tokens list (case insensitive matching) and give that tokens list item back with
        # unchanged case (i.e. [ 'Bold' ] will match "bold" and return it as [ 'Bold', ]
        # In the regex mode (allow_regex_token = True) it will use the tokens elements as
        # regexes and return the original (i.e. from name) case.
        #
        # Token are always used in a regex and may not capture, use non capturing
        # grouping if needed (?: ... )
        lower_tokens = [t.lower() for t in tokens]
        not_matched = ""
        all_tokens = []
        j = 1
        regex = re.compile("(.*?)(" + "|".join(tokens) + ")(.*)", re.IGNORECASE)
        while j:
            j = regex.match(name)
            if not j:
                break
            if len(j.groups()) != 3:
                sys.exit("Malformed regex in FontnameTools.get_name_token()")
            not_matched += (
                " " + j.groups()[0]
            )  # Blanc prevents unwanted concatenation of unmatched substrings
            tok = j.groups()[1].lower()
            if tok in lower_tokens:
                tok = tokens[lower_tokens.index(tok)]
            tok = FontnameTools.unify_style_names(tok)
            if len(tok):
                all_tokens.append(tok)
            name = j.groups()[2]  # Recurse rest
        not_matched += " " + name
        return (not_matched.strip(), all_tokens)

    @staticmethod
    def postscript_char_filter(name):
        """Filter out characters that are not allowed in Postscript names"""
        # The name string must be restricted to the printable ASCII subset, codes 33 to 126,
        # except for the 10 characters '[', ']', '(', ')', '{', '}', '<', '>', '/', '%'
        out = ""
        for c in name:
            if c in "[](){}<>/%" or ord(c) < 33 or ord(c) > 126:
                continue
            out += c
        return out

    # List of (regex pattern, replacement template) pairs; the replacements
    # use backreferences into the pattern's groups.
    # NOTE(review): the code applying this table is not in this file; the
    # lower-case patterns suggest case insensitive matching - confirm.
    # All patterns are raw strings so that regex escapes like \( are not
    # parsed as (invalid) Python string escapes.
    SIL_TABLE = [
        (r"(a)nonymous", r"\1nonymice"),
        (r"(b)itstream( ?)(v)era( ?sans ?mono)?", r"\1itstrom\2Wera"),
        (r"(s)ource", r"\1auce"),
        (r"(h)ermit", r"\1urmit"),
        (r"(h)asklig", r"\1asklug"),
        (r"(s)hare", r"\1hure"),
        (r"IBM[- ]?plex", r"Blex"),  # We do not keep the case here
        (r"(t)erminus", r"\1erminess"),
        (r"(l)iberation", r"\1iteration"),
        (r"iA([- ]?)writer", r"iM\1Writing"),
        (r"(a)nka/(c)oder", r"\1na\2onder"),
        (r"(c)ascadia( ?)(c)ode", r"\1askaydia\2\3ove"),
        (r"(c)ascadia( ?)(m)ono", r"\1askaydia\2\3ono"),
        (r"(m)( ?)plus", r"\1+"),  # Added this, because they use a plus symbol :->
        (r"Gohufont", r"GohuFont"),  # Correct to CamelCase
        # No one cares that font names starting with a digit are forbidden:
        (r"IBM 3270", r"3270"),  # for historical reasons and 'IBM' is a TM or something
        # Some name parts that are too long for us
        (r"(.*sans ?m)ono", r"\1"),  # Various SomenameSansMono fonts
        (r"(.*code ?lat)in Expanded", r"\1X"),  # for 'M PLUS Code Latin Expanded'
        (r"(.*code ?lat)in", r"\1"),  # for 'M PLUS Code Latin'
        (r"(b)ig( ?)(b)lue( ?)(t)erminal", r"\1ig\3lue\5erm"),  # Shorten BigBlueTerminal
        (r"(.*)437TT", r"\g<1>437"),  # Shorten BigBlueTerminal 437 TT even further
        (r"(.*dyslexic ?alt)a", r"\1"),  # Open Dyslexic Alta -> Open Dyslexic Alt
        (r"(.*dyslexic ?m)ono", r"\1"),  # Open Dyslexic Mono -> Open Dyslexic M
        (r"(overpass ?m)ono", r"\1"),  # Overpass Mono -> Overpass M
        (r"(proggyclean) ?tt", r"\1"),  # Remove TT from ProggyClean
        # Remove TTF from Terminus (after renamed to Terminess)
        (r"(terminess) ?\(ttf\)", r"\1"),
        (r"(im ?writing ?q)uattro", r"\1uat"),  # Rename iM Writing Quattro to Quat
        # Remove S from all iM Writing styles
        (r"(im ?writing ?(mono|duo|quat)) ?s", r"\1"),
    ]

    # From https://adobe-type-tools.github.io/font-tech-notes/pdfs/5088.FontNames.pdf
    # Each table below maps a long name to a tuple of two short variants.
    # The first short variant is from the linked table.
    # The second (longer) short variant is from diverse fonts like Noto.
    # We can
    # - use the long form
    # - use the very short form (first)
    # - use mild short form:
    #   - has no modifier: use the second form
    #   - has modifier: use second form of mod plus first form of weights2
    #   - has modifier: use second form of mod plus second form of widths
    # This is encoded in get_shorten_form_idx()
    known_weights1 = {  # can not take modifiers
        "Medium": ("Md", "Med"),
        "Nord": ("Nd", "Nord"),
        "Book": ("Bk", "Book"),
        "Poster": ("Po", "Poster"),
        "Demi": (
            "Dm",
            "Demi",
        ),  # Demi is sometimes used as a weight, sometimes as a modifier
        "Regular": ("Rg", "Reg"),
        "Display": ("DS", "Disp"),
        "Super": ("Su", "Sup"),
        "Retina": ("Rt", "Ret"),
    }
    known_weights2 = {  # can take modifiers
        "Black": ("Blk", "Black"),
        "Bold": ("Bd", "Bold"),
        "Heavy": ("Hv", "Heavy"),
        "Thin": ("Th", "Thin"),
        "Light": ("Lt", "Light"),
        # Placeholder without any short form (the tuple is empty, indexing it
        # raises IndexError). NOTE(review): parse_font_name() builds its
        # weight token list from the keys of this dict, blank included.
        " ": (),  # Just for CodeClimate :-/
    }
    known_widths = {  # can take modifiers
        "Compressed": ("Cm", "Comp"),
        "Extended": ("Ex", "Extd"),
        "Condensed": ("Cn", "Cond"),
        "Narrow": ("Nr", "Narrow"),
        "Compact": ("Ct", "Compact"),
    }
    known_slopes = {  # can not take modifiers
        "Inclined": ("Ic", "Incl"),
        "Oblique": ("Obl", "Obl"),
        "Italic": ("It", "Italic"),
        "Upright": ("Up", "Uprght"),
        "Kursiv": ("Ks", "Kurs"),
        "Sloped": ("Sl", "Slop"),
    }
    # Prefixes that can precede an entry of known_weights2 or known_widths
    # (like 'Semi' + 'Bold'); same tuple layout as the tables above
    known_modifiers = {
        "Demi": ("Dm", "Dem"),
        "Ultra": ("Ult", "Ult"),
        "Semi": ("Sm", "Sem"),
        "Extra": ("X", "Ext"),
    }

    @staticmethod
    def is_keep_regular(basename):
        """This has been decided by the font designers, we need to mimic that (for comparison purposes)"""
        KEEP_REGULAR = [
            "Agave",
            "Arimo",
            "Aurulent",
            "Cascadia",
            "Cousine",
            "Fantasque",
            "Fira",
            "Overpass",
            "Lilex",
            "Inconsolata$",  # not InconsolataGo
            "IAWriter",
            "Meslo",
            "Monoid",
            "Mononoki",
            "Hack",
            "JetBrains Mono",
            "Noto Sans",
            "Noto Serif",
            "Victor",
        ]
        for kr in KEEP_REGULAR:
            if (basename.rstrip() + "$").startswith(kr):
                return True
        return False

    @staticmethod
    def _parse_simple_font_name(name):
        """Parse a filename that does not follow the 'FontFamilyName-FontStyle' pattern"""
        # No dash in name, maybe we have blanc separated filename?
        if " " in name:
            return FontnameTools.parse_font_name(name.replace(" ", "-"))
        # Do we have a number-name boundary?
        p = re.split("(?<=[0-9])(?=[a-zA-Z])", name)
        if len(p) > 1:
            return FontnameTools.parse_font_name("-".join(p))
        # Or do we have CamelCase?
        n = FontnameTools.camel_explode(name)
        if n != name:
            return FontnameTools.parse_font_name(n.replace(" ", "-"))
        return (False, FontnameTools.camel_casify(name), [], [], [], "")

    @staticmethod
    def parse_font_name(name):
        """Expects a filename following the 'FontFamilyName-FontStyle' pattern and returns ... parts"""
        name = re.sub(
            r"\bsemi-condensed\b", "SemiCondensed", name, 1, re.IGNORECASE
        )  # Just for "3270 Semi-Condensed" :-/
        name = re.sub("[_\s]+", " ", name)
        matches = re.match(r"([^-]+)(?:-(.*))?", name)
        familyname = FontnameTools.camel_casify(matches.group(1))
        style = matches.group(2)

        if not style:
            return FontnameTools._parse_simple_font_name(name)

        # These are the FontStyle keywords we know, in three categories
        # Weights end up as Typographic Family parts ('after the dash')
        # Styles end up as Family parts (for classic grouping of four)
        # Others also end up in Typographic Family ('before the dash')
        weights = (
            [
                m + s
                for s in list(FontnameTools.known_weights2)
                + list(FontnameTools.known_widths)
                for m in list(FontnameTools.known_modifiers) + [""]
                if m != s
            ]
            + list(FontnameTools.known_weights1)
            + list(FontnameTools.known_slopes)
        )
        styles = [
            "Bold",
            "Italic",
            "Regular",
            "Normal",
        ]
        weights = [w for w in weights if w not in styles]
        # Some font specialities:
        other = [
            "-",
            "Book",
            "For",
            "Powerline",
            "Text",  # Plex
            "IIx",  # Profont IIx
            "LGC",  # Inconsolata LGC
            r"\bCE\b",  # ProggycleanTT CE
            r"[12][cmp]n?",  # MPlus
            r"(?:uni-)?1[14]",  # GohuFont uni
        ]

        # Sometimes used abbreviations
        weight_abbrevs = [
            "ob",
            "c",
            "m",
            "l",
        ]
        style_abbrevs = [
            "it",
            "r",
            "b",
            "i",
        ]

        (style, weight_token) = FontnameTools.get_name_token(style, weights)
        (style, style_token) = FontnameTools.get_name_token(style, styles)
        (style, other_token) = FontnameTools.get_name_token(style, other, True)
        if (
            len(style) < 4 and style.lower() != "pro"
        ):  # Prevent 'r' of Pro to be detected as style_abbrev
            (style, weight_token_abbrevs) = FontnameTools.get_name_token(
                style, weight_abbrevs
            )
            (style, style_token_abbrevs) = FontnameTools.get_name_token(
                style, style_abbrevs
            )
            weight_token += weight_token_abbrevs
            style_token += style_token_abbrevs
        while "Regular" in style_token and len(style_token) > 1:
            # Correct situation where "Regular" and something else is given
            style_token.remove("Regular")

        # Recurse to see if unmatched stuff between dashes can belong to familyname
        matches2 = re.match(r"(\w+)-(.*)", style)
        if matches2:
            return FontnameTools.parse_font_name(
                familyname + matches2.group(1) + "-" + matches2.group(2)
            )

        style = re.sub(
            r"(^|\s)\d+(\.\d+)+(\s|$)", r"\1\3", style
        )  # Remove (free standing) version numbers
        style_parts = FontnameTools.drop_empty(style.split(" "))
        style = " ".join(map(FontnameTools.front_upper, style_parts))
        familyname = FontnameTools.camel_explode(familyname)
        return (True, familyname, weight_token, style_token, other_token, style)