Source code for arlib.automata.symautomata.cfggenerator

"""This modules generates a string from a CFG"""
import re
import string
import six



[docs] class CNFGenerator(object): """Use Chomsky to transform to CNF""" def __init__(self, grammar_rules): """Parse input grammar rules""" parsed_grammar = [] for grammar_rule in grammar_rules: if isinstance(grammar_rule, six.string_types): rule = grammar_rule else: rule = grammar_rule[0] """Split the rules based on : character""" m = re.compile(":").search(rule) if not m: continue else: if m.start() == 0: return else: left_hand_side = rule[0:m.start()].strip() if m.end() == len(rule): return else: rest = string.strip(rule[m.end():]) if rest == "[]": right_hand_side = [] else: combined_rulesets = string.split(rest, "|") right_hand_side = [] for i in combined_rulesets: l = string.split(i) if len(l) > 1: l = tuple(string.split(i)) else: l = l[0] parsed_grammar.append((left_hand_side, l)) """Assing the parsed rules as the new grammar rules""" self.grammar_rules = parsed_grammar self.grammar_terminals_map = {} self.grammar_nonterminals_map = {} self.new_grammar_nonterminals = {} self.init_symbol = self.grammar_rules[0][0] """The grammar rules can be used to obtain the nonterminals""" self.grammar_nonterminals = {r[0] for r in self.grammar_rules} """The nonterminals can be used to distinguish the terminals""" self.grammar_terminals = set() for rule in self.grammar_rules: """ Each rule can be a symbol ot a set of symbols If rule is a non terminal symbol, add it to the grammar terminals If rule is a set of symbols, add each non terminal symbol to the grammar terminals""" if isinstance(rule[1], six.string_types): if rule[1] not in self.grammar_nonterminals: self.grammar_terminals.add(rule[1]) else: for rule_symbol in rule[1]: if rule_symbol not in self.grammar_nonterminals: self.grammar_terminals.add(rule_symbol) for rule_index in range(len(self.grammar_rules)): if self.grammar_rules[rule_index][0] not in self.grammar_nonterminals_map: self.grammar_nonterminals_map[self.grammar_rules[rule_index][0]] = {rule_index} else: self.grammar_nonterminals_map[self.grammar_rules[rule_index][0]].add(rule_index) """Create a reverse map from symbols to rules for both terminals and nonterminals""" for a in self.grammar_terminals: for rule_index in range(len(self.grammar_rules)): if self.grammar_rules[rule_index][1] == a: if a not in self.grammar_terminals_map: self.grammar_terminals_map[a] = {rule_index} else: self.grammar_terminals_map[a].add(rule_index) for rule_index in range(len(self.grammar_rules)): if self.grammar_rules[rule_index][0] not in self.grammar_nonterminals_map: self.grammar_nonterminals_map[self.grammar_rules[rule_index][0]] = {rule_index} else: self.grammar_nonterminals_map[self.grammar_rules[rule_index][0]].add(rule_index) """Remove all unitary rules""" self.counter = 0 self.unitary = {} f = 1 while f: f = 0 self.unitary = set([rule for rule in self.grammar_rules if (isinstance(rule[1], six.string_types) and rule[1] in self.grammar_nonterminals) or (len(rule[1]) == 1 and rule[1][0] in self.grammar_nonterminals)]) for u in self.unitary: if isinstance(u[1], six.string_types): ui = u[1] else: ui = u[1][0] if ui in self.grammar_nonterminals_map: for rule_index in self.grammar_nonterminals_map[ui]: if (u[0], self.grammar_rules[rule_index][1]) not in self.grammar_rules: f = 1 self.grammar_rules.append((u[0], self.grammar_rules[rule_index][1])) self.grammar_nonterminals_map[u[0]].add(len(self.grammar_rules) - 1) for u in self.unitary: self.grammar_rules.remove(u) for a in self.grammar_terminals: for rule_index in range(len(self.grammar_rules)): if not isinstance(self.grammar_rules[rule_index][1], six.string_types) and len( self.grammar_rules[rule_index][1]) >= 2 and a in self.grammar_rules[rule_index][1]: if a not in self.new_grammar_nonterminals: self.new_grammar_nonterminals[a] = "A@" + a self.grammar_nonterminals.add(self.new_grammar_nonterminals[a]) self.grammar_rules.append((self.new_grammar_nonterminals[a], a)) symbolslist = list(self.grammar_rules[rule_index][1]) for symbol_index in range(len(symbolslist)): if symbolslist[symbol_index] == a: symbolslist[symbol_index] = self.new_grammar_nonterminals[a] self.grammar_rules[rule_index] = (self.grammar_rules[rule_index][0], tuple(symbolslist)) total_rules = len(self.grammar_rules) for rule_index in range(total_rules): if not isinstance(self.grammar_rules[rule_index][1], six.string_types) and len(self.grammar_rules[rule_index][1]) > 2: self.next_rule(self.grammar_rules[rule_index][0], self.grammar_rules[rule_index][1], rule_index)
[docs] def next_rule(self, left_hand_side, right_hand_side, rule_index): if not isinstance(right_hand_side, six.string_types) and len(right_hand_side) == 2: self.grammar_rules[rule_index] = (left_hand_side, right_hand_side) return nta = "A@" + "_" + str(self.counter) self.grammar_nonterminals.add(nta) self.counter += 1 self.grammar_rules.append((left_hand_side, (right_hand_side[0], nta))) self.next_rule(nta, right_hand_side[1:], rule_index)
[docs] class CFGGenerator(object): """This class generates a string from a CFG""" grammar = None resolved = None bfs_queue = None maxstate = 0 def __init__(self, cfgr=None, optimized=1, splitstring=0, maxstate=0): """ Object initialization Args: cfgr (CNF): grammar for the random objects optimized (bool): mode of operation - if enabled not all CNF rules are included (mitigate O(n^3)) splitstring (bool): A boolean for enabling or disabling the splitting of symbols using a space maxstate (int): The maxstate is used for generating in a dynamic way the CNF rules that were not included due to the optimization. As a result, the algorithm generates these rules only if required. Returns: None """ self.grammar = None self.resolved = None self.bfs_queue = None self.maxstate = 0 self.grammar = cfgr self.optimized = optimized self.splitstring = splitstring # Because of the optimization, there are some non # existing terminals on the generated list if self.optimized: self._clean_terminals() self.resolved = {} self.bfs_queue = [] self.maxstate = maxstate
[docs] def generate(self): """ Generates a new random string from the start symbol Args: None Returns: str: The generated string """ result = self._gen(self.optimized, self.splitstring) if self.splitstring and result is not None: result = result[1:] return result
def _clean_terminals(self): """ Because of the optimization, there are some non existing terminals on the generated list. Remove them by checking for terms in form Ax,x """ new_terminals = [] for term in self.grammar.grammar_terminals: x_term = term.rfind('@') y_term = term.rfind('A') if y_term > x_term: x_term = y_term ids = term[x_term + 1:].split(',') if len(ids) < 2: """It'input_string a normal terminal, not a state""" new_terminals.append(term) self.grammar.grammar_terminals = new_terminals def _check_self_to_empty(self, stateid): """ Because of the optimization, the rule for empty states is missing A check takes place live Args: stateid (int): The state identifier Returns: bool: A true or false response """ x_term = stateid.rfind('@') y_term = stateid.rfind('A') if y_term > x_term: x_term = y_term ids = stateid[x_term + 1:].split(',') if len(ids) < 2: return 0 if ids[0] == ids[1]: # print 'empty' return 1 return 0 def _check_intemediate(self, myntr, maxstate): """ For each state Apq which is a known terminal, this function searches for rules Apr -> Apq Aqr and Arq -> Arp Apq where Aqr is also a known terminal or Arp is also a known terminal. It is mainly used as an optimization in order to avoid the O(n^3) for generating all the Apq -> Apr Arq rules during the PDA to CFG procedure. Args: myntr (str): The examined non terminal that was poped out of the queue maxstate (int): The maxstate is used for generating in a dynamic way the CNF rules that were not included due to the optimization. As a result, the algorithm generates these rules only if required. Returns: bool: Returns true if the algorithm was applied at least one time """ # print 'BFS Dictionary Update - Intermediate' x_term = myntr.rfind('@') y_term = myntr.rfind('A') if y_term > x_term: x_term = y_term ids = myntr[x_term + 1:].split(',') if len(ids) < 2: return 0 i = ids[0] j = ids[1] r = 0 find = 0 while r < maxstate: if r != i and r != j: if 'A' + i + ',' + \ repr(r) not in self.resolved \ and 'A' + j + ',' + repr(r) in self.resolved: self.resolved[ 'A' + i + ',' + repr(r)] = self.resolved[myntr] \ + self.resolved['A' + j + ',' + repr(r)] if self._checkfinal('A' + i + ',' + repr(r)): return self.resolved['A' + i + ',' + repr(r)] if 'A' + i + ',' + repr(r) not in self.bfs_queue: self.bfs_queue.append('A' + i + ',' + repr(r)) find = 1 if 'A' + repr(r) + ',' + j not in self.resolved and 'A' + \ repr(r) + ',' + i in self.resolved: self.resolved[ 'A' + repr(r) + ',' + j] = self.resolved['A' + repr(r) + ',' + i] \ + self.resolved[myntr] if self._checkfinal('A' + repr(r) + ',' + j): return self.resolved['A' + repr(r) + ',' + j] if 'A' + repr(r) + ',' + j not in self.bfs_queue: self.bfs_queue.append('A' + repr(r) + ',' + j) find = 1 r = r + 1 if find == 1: return 1 return 0 def _check_self_replicate(self, myntr): """ For each Rule B -> c where c is a known terminal, this function searches for B occurences in rules with the form A -> B and sets A -> c. """ # print 'BFS Dictionary Update - Self Replicate' find = 0 for nonterm in self.grammar.grammar_nonterminals_map: for i in self.grammar.grammar_nonterminals_map[nonterm]: if self.grammar.grammar_rules[i][0] not in self.resolved and not isinstance( self.grammar.grammar_rules[i][1], (set, tuple)) \ and self.grammar.grammar_rules[i][1] == myntr: self.resolved[self.grammar.grammar_rules[i][0]] = self.resolved[myntr] if self._checkfinal(self.grammar.grammar_rules[i][0]): return self.resolved[self.grammar.grammar_rules[i][0]] if self.grammar.grammar_rules[i][0] not in self.bfs_queue: self.bfs_queue.append(self.grammar.grammar_rules[i][0]) find = 1 if find == 1: return 1 return 0 def _check_self_nonterminals(self, optimized): """ For each Rule A -> BC where B and C are known terminals (B -> c1 and C -> c2), this function searches replaces A to the corresponding terminals A -> c1c2 """ # print 'BFS Dictionary Update - Self Non Terminals' find = 0 for nt in self.grammar.grammar_nonterminals_map: for i in self.grammar.grammar_nonterminals_map[nt]: if (self.grammar.grammar_rules[i][0] not in self.resolved\ or self.grammar.grammar_rules[i][0] == 'S') \ and isinstance(self.grammar.grammar_rules[i][1], (set, tuple)): # All rules are in CNF form, so first check the A -> BC rules part_a = None if optimized and self._check_self_to_empty( self.grammar.grammar_rules[i][1][0]): part_a = '' elif self.grammar.grammar_rules[i][1][0] in self.resolved: part_a = self.resolved[self.grammar.grammar_rules[i][1][0]] part_b = None if optimized and self._check_self_to_empty( self.grammar.grammar_rules[i][1][1]): part_b = '' elif self.grammar.grammar_rules[i][1][1] in self.resolved: part_b = self.resolved[self.grammar.grammar_rules[i][1][1]] if part_a is not None and part_b is not None: self.resolved[self.grammar.grammar_rules[i][0]] = part_a + part_b # print 'Non Terminals Resolving # '+self.g.Rules[i][0]+": "+ # self.Resolved[self.g.Rules[i][0]] if self._checkfinal(self.grammar.grammar_rules[i][0]): return self.resolved[self.grammar.grammar_rules[i][0]] if self.grammar.grammar_rules[i][0] not in self.bfs_queue: self.bfs_queue.append(self.grammar.grammar_rules[i][0]) find = 1 if find == 1: return 1 return 0 def _checkfinal(self, nonterminal): if nonterminal == 'S': return 1 return 0 def _gen(self, optimized, splitstring): """Generates a new random object generated from the nonterminal Args: optimized (bool): mode of operation - if enabled not all CNF rules are included (mitigate O(n^3)) splitstring (bool): A boolean for enabling or disabling Returns: str: The generated string """ # Define Dictionary that holds resolved rules # (only in form A -> terminals sequence) self.resolved = {} # First update Resolved dictionary by adding rules # that contain only terminals (resolved rules) for nt in self.grammar.grammar_nonterminals_map: for i in self.grammar.grammar_nonterminals_map[nt]: if self.grammar.grammar_rules[i][0] not in self.resolved\ and not isinstance(self.grammar.grammar_rules[i][1], (set, tuple)): if self.grammar.grammar_rules[i][1] != '@empty_set' \ and self.grammar.grammar_rules[i][1] in self.grammar.grammar_terminals: if splitstring: self.resolved[ self.grammar.grammar_rules[i][0]] = self.grammar.grammar_rules[i][1] else: if self.grammar.grammar_rules[i][1] == '&': self.resolved[self.grammar.grammar_rules[i][0]] = ' ' else: self.resolved[ self.grammar.grammar_rules[i][0]] = self.grammar.grammar_rules[i][1] # print 'ResolvingA '+self.g.Rules[i][0]+": "+ # self.g.Rules[i][1] if self._checkfinal(self.grammar.grammar_rules[i][0]): return self.resolved[self.grammar.grammar_rules[i][0]] if self.grammar.grammar_rules[i][0] not in self.bfs_queue: self.bfs_queue.append(self.grammar.grammar_rules[i][0]) if self.grammar.grammar_rules[i][1] == '@empty_set': self.resolved[self.grammar.grammar_rules[i][0]] = '' # print 'ResolvingB '+self.g.Rules[i][0]+": " self.bfs_queue.append(self.grammar.grammar_rules[i][0]) if optimized and self._check_self_to_empty( self.grammar.grammar_rules[i][1]): self.resolved[self.grammar.grammar_rules[i][0]] = '' # print 'ResolvingC '+self.g.Rules[i][0]+": " if self.grammar.grammar_rules[i][0] not in self.bfs_queue: self.bfs_queue.append(self.grammar.grammar_rules[i][0]) # Then try to use the rules from Resolved dictionary and check # if there is another rule that can be resolved. # This should be done in a while loop change = 1 while change: change = 0 if not change: ret = self._check_self_nonterminals(optimized) if ret == 1: change = 1 elif ret != 0: return ret if not change: while not change and len(self.bfs_queue) > 0: myntr = self.bfs_queue.pop() ret = self._check_self_replicate(myntr) if ret == 1: change = 1 elif ret != 0: return ret if optimized and self._check_intemediate( myntr, self.maxstate): change = 1 break