Source code for pattern_clustering.pattern_automaton

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# This file is part of the pattern-clustering project.
# https://github.com/nokia/pattern-clustering

"""A ``PatternAutomaton`` represents a string at the pattern level."""

__author__ = "Marc-Olivier Buob, Maxime Raynal"
__maintainer__ = "Marc-Olivier Buob, Maxime Raynal"
__email__ = "marc-olivier.buob@nokia-bell-labs.com, maxime.raynal@nokia.com"
__copyright__ = "Copyright (C) 2022, Nokia"
__license__ = "BSD-3"

from functools import partial
from pybgl.automaton import *
from pybgl.deterministic_inclusion import deterministic_inclusion
from pybgl.dijkstra_shortest_paths import dijkstra_shortest_path, make_path
from pybgl.property_map import make_assoc_property_map, make_func_property_map
from .multi_grep import *

# PatternAutomaton may possibly drop some arcs, e.g. "spaces" arcs
[docs]class PatternAutomaton(Automaton): """ A ``PatternAutomaton`` models a string at the pattern level using a automaton-like structure where each vertex corresponds to a string index; each arc corresponds to an infix and its corresponding pattern. """ def __init__( self, word: str, map_name_dfa: dict, make_mg: callable = None, filtered_patterns :set = None ): """ Constructs the ``PatternAutomaton`` related to an input word according to a collection of patterns and according to a ``multi_grep`` strategy. Args: word (str): The input string. map_name_dfa (dict): The pattern collection mapping each pattern name (``str``) with its corresponding ``Automaton`` instance. The ``"any"`` pattern is always ignored. filtered_patterns (set): A subset (possibly empty) of ``map_name_dfa.keys()`` keying the types that must be caught my ``multi_grep``, but not appearing in the arcs involved in the ``PatternAutomaton``. It may be used for instance to drop spaces and get a smaller ``PatternAutomaton``, but the position of spaces in the original lines will be lost. """ if filtered_patterns is None: filtered_patterns = set() if not make_mg: make_mg = MultiGrepFunctorLargest _map_name_dfa = {k : v for (k, v) in map_name_dfa.items() if k != "any"} mg = make_mg() # Add vertices n = len(word) super().__init__(n + 1) set_final(n, self) self.w = word # Add edges multi_grep(word, map_name_dfa, mg) if mg.indices(): vertices_with_successors = set() vertices_with_predecessors = set() for (name, jks) in mg.indices().items(): if name in filtered_patterns: continue for (j, k) in jks: add_edge(j, k, name, self) vertices_with_successors.add(j) vertices_with_predecessors.add(k) to_keep = sorted(vertices_with_successors | vertices_with_predecessors | {0, n}) # Remove isolated vertices for u in range(n): if u not in to_keep: remove_vertex(u, self) # Add missing "any" edges for (i, u) in enumerate(to_keep): if u != 0 and u not in vertices_with_predecessors: add_edge(to_keep[i - 1], u, "any", self) if u != n and u not in vertices_with_successors: add_edge(u, to_keep[i + 1], "any", self) else: # The PatternAutomaton involves a single "any" arc to_remove = set() for u in vertices(self): if u not in {0, n}: to_remove.add(u) for u in to_remove: remove_vertex(u, self) if len(word): add_edge(0, len(word), "any", self)
[docs] def get_slice(self, e :EdgeDescriptor) -> tuple: """ Retrieves the slice (pair of uint indices delimiting a substring) related to an edge. Args: e (EdgeDescriptor): The queried edge identifier. Returns: The slice related to an arbitrary edge of this ``PatternAutomaton`` instance. """ j = source(e, self) k = target(e, self) return (j, k)
[docs] def get_infix(self, e: EdgeDescriptor) -> str: """ Retrieves the infix (substring) related to an edge. Args: e (EdgeDescriptor): The queried edge identifier. Returns: The infix related to an arbitrary edge of this ``PatternAutomaton`` instance. """ (j, k) = self.get_slice(e) return self.w[j:k]
def __eq__(self, pa) -> bool: """ Equality operator. This implementation assumes that the PA is deterministic and minimal, e.g., by using the ``MultiGrepFunctorLargest`` functor in ``PatternAutomaton.__init__``. Args: pa (PatternAutomaton): The ``PatternAutomaton`` instance compared to ``self``. Returns: True iff ``self`` matches ``pa``. """ if num_vertices(self) != num_vertices(pa) or num_edges(self) != num_edges(pa): # MultiGrepFunctorLargest guarantees that two PatternAutomaton can # only be equal if they are of same size, because PatternAutomaton are # always minimal. return False return deterministic_inclusion(self, pa) == 0
[docs]def pattern_automaton_edge_weight( e: EdgeDescriptor, g: PatternAutomaton, map_name_density: dict = None ) -> float: """ Retrieves an edge weight (density). Args: e (EdgeDescriptor): The queried edge identifier. g (PatternAutomaton): The queried PatternAutomaton instance. map_name_density (dict): Maps each pattern name (``str``) with its corresponding density (``float``) Returns: The corresponding density. """ a = label(e, g) return map_name_density.get(a, 1.0) if map_name_density else 1.0
[docs]def pattern_automaton_to_path(g: PatternAutomaton, **kwargs) -> list: """ Computes the path having the lowest density (and hence describing the best pattern-based decomposition) of an input ``PatternAutomaton``. Args: g (PatternAutomaton): The queried ``PatternAutomaton`` instance. Returns: The path minimizing the density. """ s = initial(g) f = finals(g) assert len(f) == 1 t = f.pop() pmap_vpreds = kwargs.pop("pmap_vpreds", None) if pmap_vpreds is None: map_vpreds = defaultdict(set) pmap_vpreds = make_assoc_property_map(map_vpreds) pmap_vdist = kwargs.pop("pmap_vdist", None) if pmap_vdist is None: map_vdist = defaultdict() pmap_vdist = make_assoc_property_map(map_vdist) pmap_eweight = kwargs.pop("pmap_eweight", None) map_name_density = kwargs.pop("map_name_density", None) assert pmap_eweight or map_name_density if pmap_eweight is None: pmap_eweight = make_func_property_map( partial( pattern_automaton_edge_weight, g=g, map_name_density=map_name_density ) ) dijkstra_shortest_path( g, s, t, pmap_eweight, pmap_vpreds, pmap_vdist, **kwargs ) return make_path(g, s, t, pmap_vpreds)