|
| 1 | +require 'pycall' |
| 2 | + |
| 3 | +class Interscript::Compiler::Python < Interscript::Compiler |
# Serializes a Ruby value as Python literal source text.
#
# Strings and Integers are rendered with Ruby's #inspect, Symbols as their
# quoted string form, Hashes as dict literals, Arrays as list literals,
# nil as None and true/false as True/False. Containers are handled
# recursively.
#
# NOTE(review): String#inspect can emit escapes that Python does not share
# (e.g. "\e" or "\#{"); strings containing such characters may not round-trip
# exactly - confirm against the maps that are actually compiled.
#
# @param val [String, Integer, Symbol, Hash, Array, nil, true, false]
# @return [String] Python source text for +val+
# @raise [Interscript::SystemConversionError] when +val+ has no Python
#   rendering (previously this printed the value and hard-exited the process)
def escape(val)
  case val
  when String, Integer
    val.inspect
  when Symbol
    val.to_s.inspect
  when true
    "True"
  when false
    "False"
  when Hash
    "{" + val.map { |k, v| "#{escape k}:#{escape v}" }.join(",") + "}"
  when Array
    "[" + val.map { |item| escape item }.join(",") + "]"
  when nil
    "None"
  else
    # Raise instead of exit!: a library must not terminate its host process,
    # and exit! would also skip ensure blocks and at_exit handlers.
    raise Interscript::SystemConversionError,
          "Can't escape #{val.inspect} (#{val.class}) as a Python literal"
  end
end
| 23 | + |
# Escapes +val+ for literal use in a regular expression by delegating to the
# `escape` function of the Python `regex` module, reached through PyCall.
# The imported module handle is memoized in @pycall_regex.
def re_escape(val)
  (@pycall_regex ||= PyCall.import_module("regex")).escape(val)
end
| 28 | + |
# Wraps an already-compiled pattern body in the Python source of a
# `re.compile` call with the MULTILINE flag. +str+ is inserted verbatim
# between double quotes; no additional quoting is applied here.
def new_regexp(str)
  %(re.compile("#{str}", re.MULTILINE))
end
| 32 | + |
# Runs the given block with the emission indent increased by four columns.
#
# The previous indent is restored in an `ensure` clause so that an exception
# raised inside the block cannot leave @indent permanently shifted.
#
# @yield code to emit one level deeper
# @return [Object] the block's value
def indent
  @indent += 4
  yield
ensure
  @indent -= 4
end
| 38 | + |
# Appends one line of generated code to @code, prefixed with @indent spaces
# and terminated with a newline. Returns the @code buffer.
def emit(code)
  @code.concat(" " * @indent, code, "\n")
end
| 42 | + |
# Generates the Python module source for +map+ into @code.
#
# Emission order: imports, one load_map call per dependency, the map
# definition, both flavours (string and regex) of every alias, every stage,
# and finally the parallel-replace tables (_PTREE_* / _PRE_*) that were
# collected while the stages were compiled.
#
# @param map   the parsed map document to compile
# @param debug [Boolean] stored in @debug; not consulted in this method
# @return [Hash] the @parallel_regexps table (value of the final #each)
def compile(map, debug: false)
  @map = map
  @debug = debug
  @indent = 0
  @code = ""
  @parallel_trees = {}
  @parallel_regexps = {}

  emit "import interscript"
  emit "import regex as re"
  map.dependencies.map(&:full_name).each do |dependency|
    emit "interscript.load_map(#{escape dependency})"
  end

  emit "interscript.stdlib.define_map(#{escape map.name})"

  map.aliases.each do |alias_name, alias_def|
    as_string = compile_item(alias_def.data, map, :str)
    emit "interscript.stdlib.add_map_alias(#{escape map.name}, #{escape alias_name}, #{as_string})"
    # The :re form is a bare pattern fragment, so it gets quoted here.
    as_regexp = "\"" + compile_item(alias_def.data, map, :re) + "\""
    emit "interscript.stdlib.add_map_alias_re(#{escape map.name}, #{escape alias_name}, #{as_regexp})"
  end

  map.stages.each { |_name, stage| compile_rule(stage, @map, true) }

  @parallel_trees.each do |key, tree|
    emit "_PTREE_#{key} = #{escape tree}"
  end
  @parallel_regexps.each do |key, entry|
    pattern, replacements = entry[0], entry[1]
    emit "_PRE_#{key} = [\"#{pattern}\", #{escape replacements}]"
  end
end
| 76 | + |
# Joins the patterns of +subs_hash+ into one alternation in which every
# alternative is a named group `(?P<_N>...)`, N being the zero-based position
# of the entry. Only the first element of each entry (the pattern) is used.
#
# @param subs_hash [Enumerable] entries of the form [pattern, replacement]
# @return [String] the combined pattern ("" for an empty collection)
def parallel_regexp_compile(subs_hash)
  subs_hash
    .each_with_index
    .map { |entry, position| "(?P<_#{position}>#{entry[0]})" }
    .join("|")
end
| 85 | + |
# Emits the Python code for a single rule node (or a whole stage).
#
# Nodes whose reverse_run is exactly true are skipped. Supported nodes are
# stages, parallel groups, substitutions, function calls and stage runs;
# anything else raises Interscript::SystemConversionError.
#
# Debug tracing is not emitted by this compiler.
#
# @param r       the node to compile
# @param map     document used to resolve aliases and imported stages
# @param wrapper accepted for interface compatibility; not used here
def compile_rule(r, map = @map, wrapper = false)
  return if r.reverse_run == true

  case r
  when Interscript::Node::Stage           then compile_stage_node(r, map)
  when Interscript::Node::Group::Parallel then compile_parallel_node(r, map)
  when Interscript::Node::Rule::Sub       then compile_sub_node(r, map)
  when Interscript::Node::Rule::Funcall
    emit "s = interscript.functions.#{r.name}(s, #{escape r.kwargs})"
  when Interscript::Node::Rule::Run       then compile_run_node(r, map)
  else
    raise Interscript::SystemConversionError, "Can't compile unhandled #{r.class}"
  end
end

# Emits a Python function `_stage_<name>` containing the stage's rules and
# registers it with the map being compiled.
def compile_stage_node(stage, map)
  emit "def _stage_#{stage.name}(s):"
  indent do
    stage.children.each { |child| compile_rule(child, map) }
    emit "return s\n"
  end
  emit "interscript.stdlib.add_map_stage(#{escape @map.name}, #{escape stage.name}, _stage_#{stage.name})"
end

# Compiles a parallel group: first as a replacement tree and, if anything in
# that attempt raises a StandardError, as one combined regular expression.
def compile_parallel_node(group, map)
  compile_parallel_as_tree(group, map)
rescue StandardError
  compile_parallel_as_regexp(group, map)
end

# Tree strategy: only plain substitutions without context conditions qualify.
def compile_parallel_as_tree(group, map)
  pairs = []
  group.children.each do |rule|
    raise Interscript::SystemConversionError, "Can't parallelize #{rule.class}" unless Interscript::Node::Rule::Sub === rule
    %i[before after not_before not_after].each do |condition|
      raise Interscript::SystemConversionError, "Can't parallelize rules with :#{condition}" if rule.public_send(condition)
    end

    next if rule.reverse_run == true
    pairs << [compile_item(rule.from, map, :par), compile_item(rule.to, map, :parstr)]
  end

  # Identical rule sets share one table, keyed by the array's hash.
  key = pairs.hash.abs
  unless @parallel_trees.include?(key)
    @parallel_trees[key] = Interscript::Stdlib.parallel_replace_compile_tree(pairs)
  end
  emit "s = interscript.stdlib.parallel_replace_tree(s, _PTREE_#{key})"
end

# Regexp strategy: every rule becomes one named alternative of a big pattern.
def compile_parallel_as_regexp(group, map)
  pairs = []
  Interscript::Stdlib.deterministic_sort_by_max_length(group.children).each do |rule|
    raise Interscript::SystemConversionError, "Can't parallelize #{rule.class}" unless Interscript::Node::Rule::Sub === rule

    next if rule.reverse_run == true
    pairs << [build_regexp(rule, map), compile_item(rule.to, map, :parstr)]
  end

  key = pairs.hash.abs
  unless @parallel_regexps.include?(key)
    @parallel_regexps[key] = [parallel_regexp_compile(pairs), pairs.map(&:last)]
  end
  emit "s = interscript.stdlib.parallel_regexp_gsub(s, *_PRE_#{key})"
end

# Emits a single regex substitution; :upcase / :downcase targets map to the
# stdlib helper functions instead of a replacement string.
def compile_sub_node(rule, map)
  pattern = new_regexp build_regexp(rule, map)
  replacement =
    if rule.to == :upcase
      'interscript.stdlib.upper'
    elsif rule.to == :downcase
      'interscript.stdlib.lower'
    else
      compile_item(rule.to, map, :str)
    end
  emit "s = #{pattern}.sub(#{replacement}, s)"
end

# Emits a call that runs another stage, looked up either in a dependency's
# document (when the rule names a map) or in the current document.
def compile_run_node(rule, map)
  source = rule.stage.map ? map.dep_aliases[rule.stage.map].document : map
  stage = source.imported_stages[rule.stage.name]
  emit "s = interscript.transliterate(#{escape stage.doc_name}, s, #{escape stage.name})"
end

private :compile_stage_node, :compile_parallel_node, :compile_parallel_as_tree,
        :compile_parallel_as_regexp, :compile_sub_node, :compile_run_node
| 160 | + |
# Builds the regex pattern body for a substitution rule: the compiled `from`
# item surrounded by look-around assertions for whichever of before,
# not_before, not_after and after the rule defines.
#
# Layout: (?<=before)(?<!not_before) from (?!not_after)(?=after)
#
# @param r   rule responding to from/before/after/not_before/not_after
# @param map document passed through to compile_item
# @return [String] pattern body (not yet wrapped in re.compile)
def build_regexp(r, map=@map)
  from       = compile_item(r.from, map, :re)
  before     = r.before     && compile_item(r.before, map, :re)
  after      = r.after      && compile_item(r.after, map, :re)
  not_before = r.not_before && compile_item(r.not_before, map, :re)
  not_after  = r.not_after  && compile_item(r.not_after, map, :re)

  lookbehinds = []
  lookbehinds << "(?<=#{before})" if before
  lookbehinds << "(?<!#{not_before})" if not_before

  lookaheads = []
  lookaheads << "(?!#{not_after})" if not_after
  lookaheads << "(?=#{after})" if after

  lookbehinds.join + from + lookaheads.join
end
| 176 | + |
# Compiles one item node for the requested target context.
#
# Targets:
#   :str    - Python expression text for a replacement string
#   :re     - pattern fragment that callers place inside a double-quoted
#             Python string (aliases therefore splice in via `"+expr+"`)
#   :par    - plain Ruby data (String or Array of Strings) for the parallel
#             replacement tables
#   :parstr - like :par, after reducing the item with first_string
#
# Combinations not handled by a branch below yield nil.
#
# @param i      item (or a value Item.try_convert accepts)
# @param doc    document used to resolve aliases
# @param target [Symbol, nil] one of the targets above
# @return [String, Array, nil]
# @raise [Interscript::SystemConversionError] for unknown aliases and for
#   items that cannot be used in the requested context
def compile_item(i, doc = @map, target = nil)
  i = i.first_string if %i[str parstr].include? target
  i = Interscript::Node::Item.try_convert(i)
  # Past this point :parstr is handled exactly like :par.
  target = :par if target == :parstr

  case i
  when Interscript::Node::Item::Alias
    astr = if i.map
      d = doc.dep_aliases[i.map].document
      a = d.imported_aliases[i.name]
      # The owning map is i.map (the alias item is queried for it just above).
      raise Interscript::SystemConversionError, "Alias #{i.name} of #{i.map} not found" unless a
      "interscript.stdlib.get_alias_ALIASTYPE(#{escape a.doc_name}, #{escape a.name})"
    elsif Interscript::Stdlib::ALIASES.include?(i.name)
      if target != :re && Interscript::Stdlib.re_only_alias?(i.name)
        raise Interscript::SystemConversionError, "Can't use #{i.name} in a #{target} context"
      end
      "interscript.stdlib.aliases[#{escape i.name}]"
    else
      a = doc.imported_aliases[i.name]
      raise Interscript::SystemConversionError, "Alias #{i.name} not found" unless a

      "interscript.stdlib.get_alias_ALIASTYPE(#{escape a.doc_name}, #{escape a.name})"
    end

    # ALIASTYPE is a placeholder resolved per target.
    if target == :str
      astr.sub("_ALIASTYPE(", "(")
    elsif target == :re
      "\"+#{astr.sub("_ALIASTYPE(", "_re(")}+\""
    elsif target == :par
      # Only stdlib aliases resolve here; any other alias yields nil.
      Interscript::Stdlib::ALIASES[i.name]
    end
  when Interscript::Node::Item::String
    if target == :str
      # Replace \1 with \\1, this is weird, but it works!
      i.data.gsub("\\", "\\\\\\\\").inspect
    elsif target == :par
      i.data
    elsif target == :re
      re_escape(i.data)
    end
  when Interscript::Node::Item::Group
    if target == :par
      # Cartesian product of the children's alternatives, joined pairwise.
      i.children.map do |j|
        compile_item(j, doc, target)
      end.reduce([""]) do |j, k|
        Array(j).product(Array(k)).map(&:join)
      end
    elsif target == :str
      i.children.map { |j| compile_item(j, doc, target) }.join("+")
    elsif target == :re
      i.children.map { |j| compile_item(j, doc, target) }.join
    end
  when Interscript::Node::Item::CaptureGroup
    if target != :re
      raise Interscript::SystemConversionError, "Can't use a CaptureGroup in a #{target} context"
    end
    "(" + compile_item(i.data, doc, target) + ")"
  when Interscript::Node::Item::Maybe,
       Interscript::Node::Item::MaybeSome,
       Interscript::Node::Item::Some

    resuffix = { Interscript::Node::Item::Maybe     => "?",
                 Interscript::Node::Item::Some      => "+",
                 Interscript::Node::Item::MaybeSome => "*" }[i.class]

    if target == :par
      raise Interscript::SystemConversionError, "Can't use a Maybe in a #{target} context"
    end
    # A multi-character literal must be grouped so the quantifier covers all of it.
    if Interscript::Node::Item::String === i.data && i.data.data.length != 1
      "(?:" + compile_item(i.data, doc, target) + ")" + resuffix
    else
      compile_item(i.data, doc, target) + resuffix
    end
  when Interscript::Node::Item::CaptureRef
    if target == :par
      raise Interscript::SystemConversionError, "Can't use CaptureRef in parallel mode"
    elsif target == :re
      "\\\\#{i.id}"
    elsif target == :str
      "\"\\\\#{i.id}\""
    end
  when Interscript::Node::Item::Any
    if target == :str
      raise Interscript::SystemConversionError, "Can't use Any in a string context" # A linter could find this!
    elsif target == :par
      i.data.map(&:data)
    elsif target == :re
      case i.value
      when Array
        data = i.data.map { |j| compile_item(j, doc, target) }
        "(?:" + data.join("|") + ")"
      when String
        "[#{re_escape(i.value)}]"
      when Range
        "[#{re_escape(i.value.first)}-#{re_escape(i.value.last)}]"
      end
    end
  end
end
| 283 | + |
# Class-level state shared by all instances of this compiler:
#   maps_loaded - Hash keyed by map name; #load stores true once a map's
#                 generated module has been loaded
#   ctx         - handle to the imported Python `interscript` module, nil
#                 until #load imports it
class << self
  attr_accessor :maps_loaded, :ctx
end
self.maps_loaded = {}
self.ctx = nil
| 290 | + |
# Loads the compiled map into the Python runtime, at most once per map name.
#
# Steps: load every dependency that is not yet loaded, import the Python
# `interscript` package on first use (after adding the bundled python/src
# directory to sys.path), write the generated source to
# interscript/maps/<name>.py under that directory and ask the Python side to
# load it.
#
# @return [true, nil] true after loading, nil when the map was already loaded
def load
  return if self.class.maps_loaded[@map.name]

  @map.dependencies.each do |dependency|
    dep_name = dependency.full_name
    next if self.class.maps_loaded[dep_name]

    Interscript.load(dep_name, compiler: self.class).load
  end

  python_src_path = File.join(__dir__, '..', '..', '..', '..', 'python', 'src')
  unless self.class.ctx
    PyCall.sys.path.append(python_src_path)
    self.class.ctx = PyCall.import_module("interscript")
  end

  File.write("#{python_src_path}/interscript/maps/#{@map.name}.py", @code)
  self.class.ctx.load_map(@map.name)

  self.class.maps_loaded[@map.name] = true
end
| 313 | + |
# Transliterates +str+ with the compiled map by calling the Python
# `interscript.transliterate` function, loading the map first if needed.
#
# @param str   text to transliterate
# @param stage stage to run; converted with #to_s before being passed on
def call(str, stage=:main)
  load
  python = self.class.ctx
  python.transliterate(@map.name, str, stage.to_s)
end
| 318 | + |
# Returns the global $map_debug value, or a fresh empty Array when that
# global is unset (nil or false).
def self.read_debug_data
  return $map_debug if $map_debug

  []
end
| 322 | + |
# Replaces the global $map_debug with a new empty Array and returns it.
def self.reset_debug_data
  $map_debug = Array.new
end
| 326 | +end |
0 commit comments