Haskell/Chomsky.hs at master · Anabra/Haskell · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
module Chomsky where

import Data.List
import ChomskyTest
import Types


--------------------------------------------------------------------------------GRAMMAR REDUCTION-------------------------------------------------------------------------------------

--should be called by passing the start symbol
getReachables :: Grammar -> [Symbol] -> [Symbol] -> [Symbol]
getReachables _ checkedSet [] = checkedSet
getReachables grammar checkedSet currentSet@(x:xs) = getReachables grammar (x:checkedSet) (xs ++ newNts)
   where
   nts = case foundRule of
                 Just foundRule  -> concat $ snd foundRule
                 Nothing -> [] --if it did not find anything, it will return an empty list
   foundRule = find (\(lhs, _) -> lhs == x) rules   --it merges the right hand sides, but still keeps each Symbol separted (as individual elements in the list)
   newNts = [ nt | nt <- nts, not (nt `elem` currentSet) && not (nt `elem` checkedSet) && nt `elem` nonterminals]
   (start, eps, nonterminals, terminals, rules) = grammar

--Global helper function
isSubsetOf :: Eq a => [a] -> [a] -> Bool
[] `isSubsetOf` other = True
(x:xs) `isSubsetOf` other = x `elem` other && xs `isSubsetOf` other

--should be called with empty list
getProductives :: Grammar -> [Symbol] -> [Symbol]
getProductives grammar currentSet
    | currentSet == newSet = currentSet --this may produce errors in a case where the two lists contain the same items, but in different order (can be solved by mutual containment) /// it probably won't since the map function will always preserve the order of the left-hand sides
    | otherwise = getProductives grammar newSet
    where
    newSet = map fst (filter (\(lhs,rhs) -> any (\x -> x `isSubsetOf` (currentSet ++ terminals)) rhs) rules) --gets the lhs of each rule that has at least one rhs (a subrule) that is a subset of the currentSet and the terminals
    (start, eps, nonterminals, terminals, rules) = grammar

getUsefuls :: Grammar -> [Symbol]
getUsefuls grammar = intersect (getReachables grammar [] [start]) (getProductives grammar [])
    where
    (start, eps, nonterminals, terminals, rules) = grammar

reduceGrammar :: Grammar -> Grammar
reduceGrammar grammar = (start, eps, nonterminals, terminals, newRules)
    where
    newRules = filter (\r -> not $ null $ snd r) (map (filterRhs) filteredRules) --filters those rules that have a useful lhs (whether their filtered rhs is not null)
    filterRhs (lhs,rhs) = (lhs, filter (\x -> x `isSubsetOf` (usefuls ++ terminals)) rhs) --filters the rhs of the rule (whether the subrule only contains symbols that are either useful or terminal)
    filteredRules = filter (\r -> (fst r) `elem` usefuls) rules --filters the rules (whether their lhs is a useful symbol)
    usefuls = getUsefuls grammar
    (start, eps, nonterminals, terminals, rules) = grammar

--------------------------------------------------------------------------------LENGTH REDUCTION---------------------------------------------------------------------------------------

concatRule :: [Symbol] -> Symbol
concatRule [x] = x
concatRule (x:xs) = x ++ "," ++ concatRule xs

--falseNT ["a","A","b","B"] == "{a,A,b,B}" : flaseNT ["A","b","B"]
falseNT :: [Symbol] -> Symbol
falseNT xs = "{" ++ (concatRule xs) ++ "}"

--example call: sliceHelper "S" ["a","A","b","B"] /// this will start slicing the S -> "aAbB" rule
sliceHelper :: Symbol -> [Symbol] -> [Rule]
sliceHelper lhs rhs@(x:xs)
    | length rhs > 2 = (lhs, [[x, flsNT]]) : sliceHelper flsNT xs
    | otherwise = [(lhs, [rhs])]
    where
    flsNT = falseNT xs

--slices a rule that has exactly ONE subrule
subSlice :: Rule -> [Rule]
subSlice rule@(lhs, [rhs]) = sliceHelper lhs rhs


--slices a rule with ANY number of subrules
sliceRule :: Rule -> [Rule]
sliceRule rule@(lhs, rhs) = (initRule : newRuleSet) --this will slice each rule even if they have the same ending /// we have to get rid of duplicates (we will do so in sliceRules)
    where
    initRules = [ head $ subSlice (lhs, [singleSubRule]) | singleSubRule <- rhs ]  --the subSliced rules that have the same lhs of the rule being sliced (the param of this function)
    initRule = (lhs, concatMap (snd) initRules) --this rule will have multiple subrules, including the ones that we got from slicing (but all have the SAME lhs)
    newRuleSet = concat [ tail $ subSlice (lhs, [singleSubRule]) | singleSubRule <- rhs ]

sliceRules :: [Rule] -> [Rule]
sliceRules rules = nubBy (\x y -> fst x == fst y) $ concat [ sliceRule r | r <- rules ] --getting rid of duplicates

reduceLength :: Grammar -> Grammar
reduceLength grammar = (start, eps, newNonTerminals, terminals, newRules)
    where
    newNonTerminals = map fst newRules
    newRules = sliceRules rules
    (start, eps, nonterminals, terminals, rules) = grammar


--------------------------------------------------------------------------------EPSILON---------------------------------------------------------------------------------------

--TODO: generalize
broadenSet :: Grammar -> [Symbol] -> [Symbol]
broadenSet grammar currentSet
    | currentSet == newSet = currentSet --this may produce errors in a case where the two lists contain the same items, but in different order (can be solved by mutual containment) /// it probably won't since the map function will always preserve the order of the left-hand sides
    | otherwise = broadenSet grammar newSet
    where
    newSet = map fst (filter (\(lhs,rhs) -> any (\x -> x `isSubsetOf` (eps : currentSet)) rhs) rules) --gets the lhs of each rule that has at least one rhs (a subrule) that is a subset of the currentSet and the epsilon
    (start, eps, nonterminals, terminals, rules) = grammar

transformHelper :: [Symbol] -> WorD -> [WorD]
transformHelper set subRhs@[a] = [[a]]
transformHelper set subRhs@[a,b]
        | a `elem` set && b `elem` set = subRhs:[[a], [b]]
        | a `elem` set = subRhs:[[b]]
        | b `elem` set = subRhs:[[a]]
        | otherwise = [[a,b]]

transformRule :: [Symbol] -> Rule -> Rule
transformRule set rule@(lhs,rhs) = (lhs, nub $ concatMap (transformHelper set) rhs)


removeEpsilonRules :: Grammar -> Grammar
removeEpsilonRules grammar = (start, eps, nonterminals, terminals \\ [eps], newRules)
    where
    newRules = map (transformRule set) rules
    set = broadenSet grammar []
    (start, eps, nonterminals, terminals, rules) = grammar


--------------------------------------------------------------------------------DECHAINING---------------------------------------------------------------------------------------

isSingleton :: [a] -> Bool
isSingleton [x] = True
isSingleton _ = False;

--broadenChainSet grammar [] ["S"] will get all the nonterminals that you can reach from S on chain
broadenChainSet :: Grammar -> [Symbol] -> [Symbol] -> [Symbol]
broadenChainSet _ checkedSet [] = checkedSet
broadenChainSet grammar checkedSet currentSet@(x:xs) = broadenChainSet grammar (x:checkedSet) (xs ++ newNts)
   where
   chainNts = concat $ filter (\y -> isSingleton y) rhs
   currentRule@(lhs, rhs) = case query of
                            Just query -> query
                            --if it did not find anything, it will return this "default rule"
                            Nothing -> ("", [])
   query = find (\(a,b) -> a == x) rules
   newNts = [ nt | nt <- chainNts, not (nt `elem` currentSet) && not (nt `elem` checkedSet) && nt `elem` nonterminals]
   (start, eps, nonterminals, terminals, rules) = grammar


dechain :: Grammar -> Grammar
dechain grammar = (start, eps, nonterminals, terminals, newRules)
    where
    newRules = [copyChains rule | rule <- rules]
    copyChains r@(lhs, rhs) = (lhs, nub . removeSingles $ rhs ++ (concat [ snd $ chainRule s | s <- chainSet lhs])) --it gets all the NTs that can be reached on chain from the lhs of r, then it searches the corresponding rules, gets the rhs of those, then adds them to the rhs of r and finally removes all subRules that have a singleton && nonterminal rhs
    removeSingles l = filter (\y -> not (isSingleton y) || y `isSubsetOf` terminals) l --removes all elements that are one-length nonterminal lists
    --gets the rule correspondng to the symbol s
    chainRule s = case query of
                  Just query -> query
                  --if it did not find anything, it return this "default" rule
                  Nothing -> ("", [])
                  where
                  query = find (\(a,b) -> a == s) rules
    chainSet sym = broadenChainSet grammar [] [sym]
    (start, eps, nonterminals, terminals, rules) = grammar


--------------------------------------------------------------------------------FALSE NONTERMINALS---------------------------------------------------------------------------------------

--replaceTerminals terminals rhs
replaceTerminals :: [Symbol] -> [Symbol] -> [Symbol]
replaceTerminals terminals rhs
    | length rhs > 1 = [ falseNT_if sym  | sym <- rhs]
    | otherwise = rhs
    where
    falseNT_if s
        | s `elem` terminals = falseNT [s]
        | otherwise = s

--replaceRule terminals rule
replaceRule :: [Symbol] -> Rule -> Rule
replaceRule terminals rule@(lhs,rhs) = (lhs, newRhs)
    where
    newRhs = [ replaceTerminals terminals r | r <- rhs ]

introduceFalseNonTerminals :: Grammar -> Grammar
introduceFalseNonTerminals grammar = (start, eps, newNonTerminals, terminals, newRules)
    where
    newRules = [replaceRule terminals rule | rule <- rules] ++ [ (falseNT [t], [[t]]) | t <- terminals ]
    newNonTerminals = nonterminals ++ [ falseNT [t] | t <- terminals ]
    (start, eps, nonterminals, terminals, rules) = grammar


-------------------------------------------------------------------------------CHOMSKY NORMAL FORM-------------------------------------------------------------------------------------

chomsky :: Grammar -> Grammar
chomsky grammar = introduceFalseNonTerminals $ dechain $ removeEpsilonRules $ reduceLength $ reduceGrammar grammar

checkRule :: [Symbol] -> [Symbol] -> Rule -> Bool
checkRule nonterminals terminals rule@(lhs,rhs)
    | all (\sr -> checkSubRule sr) rhs = True
    | otherwise = False
    where
    checkSubRule subrule
        | length subrule == 2 && all (\x -> x `elem` nonterminals) subrule = True
        | length subrule == 1 && all (\x -> x `elem` terminals) subrule = True
        | otherwise = False

isChomsky :: Grammar -> Bool
isChomsky grammar = noEps && rulesOK
    where
    rulesOK = all (\r -> checkRule nonterminals terminals r) rules
    noEps = not (eps `elem` terminals)
    (start, eps, nonterminals, terminals, rules) = grammar