Source code for pattern_clustering.regexp

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""Functions used to craft automata from regular expressions."""

__author__ = "Marc-Olivier Buob"
__maintainer__ = "Marc-Olivier Buob"
__email__ = "marc-olivier.buob@nokia-bell-labs.com"
__copyright__ = "Copyright (C) 2020, Nokia"
__license__ = "BSD-3"

from string import printable
from pybgl.automaton import Automaton, add_edge, set_final
from pybgl.regexp import compile_dfa


def make_re_hex_digit(lower_case: bool = True, upper_case: bool = True) -> str:
    """
    Crafts the character class matching a single hexadecimal digit.

    Args:
        lower_case (bool): Pass ``False`` to reject the lower case
            letters ``a`` to ``f``.
        upper_case (bool): Pass ``False`` to reject the upper case
            letters ``A`` to ``F``.

    Returns:
        The regular expression, as a string.
    """
    # Decimal digits are always accepted; letters are opt-out.
    ranges = ["0-9"]
    if lower_case:
        ranges.append("a-f")
    if upper_case:
        ranges.append("A-F")
    return "[" + "".join(ranges) + "]"
def make_re_ipv6(lower_case: bool = True, upper_case: bool = True) -> str:
    """
    Crafts a permissive regular expression catching IPv6 addresses.

    The pattern is deliberately approximate (an optional group of up to
    four hexadecimal digits, followed by at least two ``:``-prefixed
    groups of up to four hexadecimal digits). It is not an exact match,
    contrary to ``make_re_ipv6_strict``, but the resulting automaton is
    significantly faster and should be accurate enough for most
    practical use cases.

    Args:
        lower_case (bool): Pass ``False`` to discard lower case values.
        upper_case (bool): Pass ``False`` to discard upper case values.

    Returns:
        The string storing the regular expression.
    """
    assert lower_case or upper_case
    letters = ("a-f" if lower_case else "") + ("A-F" if upper_case else "")
    group = "[" + letters + "0-9]{0,4}"
    colon_group = ":" + group
    return "".join(["((", group, ")?(", colon_group, ")+", colon_group, ")"])
# Avoid using it (slow to compile, slow to compute language_density)
def make_re_ipv6_strict(*args, **kwargs) -> str:
    r"""
    Builds the regular expression catching IPv6 addresses.

    The expression is the alternation of one sub-pattern per supported
    textual form: the full eight-segment form, the ``::``-compressed
    forms, the zone-indexed link-local form and the forms embedding an
    IPv4 address (which rely on the module-level ``RE_IPV4`` pattern,
    looked up when this function is called).

    Args:
        \*args: see ``make_re_hex_digit``.
        \*\*kwargs: see ``make_re_hex_digit``.

    Returns:
        The string storing the regular expression.
    """
    # One address segment: 1 to 4 hexadecimal digits.
    re_seg = make_re_hex_digit(*args, **kwargs) + r"{1,4}"
    return "(%s)" % "|".join([
        "(" + re_seg + ":){7,7}" + re_seg,                  # 1:2:3:4:5:6:7:8
        "(" + re_seg + ":){1,7}:",                          # 1::                 1:2:3:4:5:6:7::
        "(" + re_seg + ":){1,6}:" + re_seg,                 # 1::8                1:2:3:4:5:6::8
        "(" + re_seg + ":){1,5}(:" + re_seg + "){1,2}",     # 1::7:8              1:2:3:4:5::7:8   1:2:3:4:5::8
        "(" + re_seg + ":){1,4}(:" + re_seg + "){1,3}",     # 1::6:7:8            1:2:3:4::6:7:8   1:2:3:4::8
        "(" + re_seg + ":){1,3}(:" + re_seg + "){1,4}",     # 1::5:6:7:8          1:2:3::5:6:7:8   1:2:3::8
        "(" + re_seg + ":){1,2}(:" + re_seg + "){1,5}",     # 1::4:5:6:7:8        1:2::4:5:6:7:8   1:2::8
        re_seg + ":((:" + re_seg + "){1,6})",               # 1::3:4:5:6:7:8      1::8
        ":((:" + re_seg + "){1,7}|:)",                      # ::2:3:4:5:6:7:8     ::8              ::
        # fe80::7:8%eth0 fe80::7:8%1
        # (link-local IPv6 addresses with zone index)
        "fe80:(:" + re_seg + "){0,4}%[0-9a-zA-Z]{1,}",
        # ::255.255.255.255 ::ffff:255.255.255.255 ::ffff:0:255.255.255.255
        # (IPv4-mapped IPv6 addresses and IPv4-translated addresses)
        "::(ffff(:0{1,4}){0,1}:){0,1}" + RE_IPV4,
        # 2001:db8:3:4::192.0.2.33 64:ff9b::192.0.2.33
        # (IPv4-Embedded IPv6 Address)
        # The segment pattern must be interpolated here: spelling it as
        # the literal text "re_seg" inside the string would only match
        # the characters r, e, _, s, e, g.
        "(" + re_seg + ":){1,4}:" + RE_IPV4,
    ])
# --- Bounded decimal ranges -------------------------------------------------
RE_0_32 = r"(3[0-2]|[0-2]?[0-9])"
RE_0_128 = r"(12[0-8]|1[0-1][0-9]|([0-9]{1,2}))"
RE_0_255 = r"(25[0-5]|(2[0-4]|[0-1]{0,1}[0-9]){0,1}[0-9])"

# --- Generic textual patterns -----------------------------------------------
RE_ALNUM = r"[a-zA-Z0-9]+"
RE_LETTERS = r"[a-zA-Z]+"
RE_ANY = r"(\S|\s)+"  # The metacharacter "." is not yet supported in pybgl
RE_SPACES = r"\s+"
RE_WORD = r"\S+"
RE_DELIMITER = r"[-+=*@~#]+"
RE_PATH = r"(/[-/:._a-zA-Z0-9]+)"

# --- Numeric patterns -------------------------------------------------------
RE_BOOL = r"0|1"
RE_SIGN = r"(-|[+])?"
RE_UINT = r"[0-9]+"
RE_INT = "".join([RE_SIGN, RE_UINT])
RE_GENERAL_INT = "%s[0-9]{1,3}(,[0-9]{3})*" % RE_SIGN
RE_FLOAT = "%s%s([.]%s)?" % (RE_SIGN, RE_UINT, RE_UINT)
RE_APPROX_FLOAT = "~?%s" % RE_FLOAT
RE_HEXA = "%s+" % make_re_hex_digit()

# --- Network patterns -------------------------------------------------------
RE_IPV4 = "((%s[.]){3}%s)" % (RE_0_255, RE_0_255)
RE_IPV6 = make_re_ipv6()  # make_re_ipv6_strict()
RE_NET_IPV4 = RE_IPV4 + "/" + RE_0_32
RE_NET_IPV6 = RE_IPV6 + "/" + RE_0_128

# Default pattern collection: maps each pattern name to its regular
# expression.
MAP_NAME_RE = {
    "alnum": RE_ALNUM,
    "any": RE_ANY,
    "approx_float": RE_APPROX_FLOAT,
    "bool": RE_BOOL,
    "delimiter": RE_DELIMITER,
    "float": RE_FLOAT,
    "general_int": RE_GENERAL_INT,
    "hexa": RE_HEXA,
    "int": RE_INT,
    "ipv4": RE_IPV4,
    "ipv6": RE_IPV6,
    "letters": RE_LETTERS,
    "net_ipv4": RE_NET_IPV4,
    "net_ipv6": RE_NET_IPV6,
    "path": RE_PATH,
    "uint": RE_UINT,
    "spaces": RE_SPACES,
    "word": RE_WORD,
}
def get_pattern_names() -> list:
    """
    Lists the names of the patterns of the default pattern collection.

    Returns:
        A new list of strings, one per key of ``MAP_NAME_RE``, in the
        iteration order of that dictionary.
    """
    return [pattern_name for pattern_name in MAP_NAME_RE]
def make_map_name_dfa(
    map_name_re: dict = None,
    names: iter = None
) -> dict:
    """
    Builds a dictionary that maps each pattern name with the corresponding
    ``pybgl.Automaton`` instance, compiled from its regular expression.

    Args:
        map_name_re (dict): Maps each pattern name (``str``) with the
            corresponding regular expression (``str``).
            Defaults to ``None`` for the default pattern collection
            (``MAP_NAME_RE``).
        names (list): A list of strings, where each string identifies a
            pattern name. If omitted (or empty), every key of
            ``map_name_re`` is considered.

    Raises:
        Exception: If a name is missing from ``map_name_re`` or if its
            regular expression cannot be compiled. The message carries
            the offending name and the original exception is chained.

    Returns:
        The dictionary mapping each considered name with its automaton.
    """
    if map_name_re is None:
        map_name_re = MAP_NAME_RE
    if not names:
        # Take the default names from the effective collection, not from the
        # global one: a custom ``map_name_re`` may define other keys.
        names = list(map_name_re)
    map_name_dfa = dict()
    for name in names:
        try:
            regex = map_name_re[name]
            map_name_dfa[name] = compile_dfa(regex)
        except Exception as e:
            # Chain the cause so the original traceback is not lost.
            raise Exception("Error when processing %r: %s" % (name, e)) from e
    return map_name_dfa
def make_dfa_empty() -> Automaton:
    """
    Crafts the ``Automaton`` corresponding to the empty language.

    The automaton is created with a single state (``0``), which is
    explicitly flagged as non-final.

    Returns:
        The corresponding ``Automaton``.
    """
    automaton = Automaton(1)
    set_final(0, automaton, False)
    return automaton
def make_dfa_any(alphabet: iter = None, separator_alphabet: iter = None) -> Automaton:
    """
    Crafts the DFA whose transitions are labeled by every non-separator
    character.

    The automaton has two states: state ``1`` is final, and each
    non-separator character labels an edge from ``0`` to ``1`` and a
    loop on ``1``.

    Args:
        alphabet (iter): The characters involved in the alphabet.
            Defaults to ``string.printable`` when omitted or empty.
        separator_alphabet (iter): The characters corresponding to
            separators. Defaults to ``{" ", "\\t", "\\n"}`` when omitted
            or empty.

    Returns:
        The corresponding ``Automaton``.
    """
    symbols = set(alphabet) if alphabet else set(printable)
    separators = set(separator_alphabet) if separator_alphabet else {" ", "\t", "\n"}

    automaton = Automaton(2)
    set_final(1, automaton)
    # Sorted so that edges are always inserted in the same order.
    for symbol in sorted(symbols - separators):
        for source in (0, 1):
            add_edge(source, 1, symbol, automaton)
    return automaton