forked from kisom/ctrans
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathctrans.py
More file actions
executable file
·327 lines (257 loc) · 10.5 KB
/
ctrans.py
File metadata and controls
executable file
·327 lines (257 loc) · 10.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# using Python 2.7!
# translates comments in code
# google translate portions are cleaned up from
# http://www.halotis.com/2009/09/15/google-translate-api-python-script/
# everything else written by Kyle Isom <coder@kyleisom.net>
# usage:
# ./ctrans.py -s filename
# will translate a single file
# Updated 2020-03-02 by William Cable to use Google Cloud Translation
# because the old ajax api wasn't working anymore
# Setup Cloud Console project and download credentials
# https://cloud.google.com/translate/docs/basic/setup-basic
# set path to credentials:
# $env:GOOGLE_APPLICATION_CREDENTIALS="B:\AWI\Python\ctrans\Python Translate-50994b9b6934.json"
import chardet
import codecs
import getopt
import multiprocessing
import os
import re
import sys
import urllib
import simplejson
from google.cloud import translate_v2 as G_translate
# module-level Cloud Translation client, shared by translate()
translate_client = G_translate.Client()
### globals ###

# variables from halotis' code
lang = 'en'             # target language
lang_src = ''           # source language; empty string means auto-detect

# misc vars
trace = False           # enable debugging output
ext = '.en'             # extension appended to translated files
num_procs = 32          # number of concurrent processes

# coding vars
encodeas = 'utf-8'      # input file type
decodeas = 'utf-8'      # output file type
cerr = 'strict'         # what to do with codec errors
autodetect = True       # autodetect file encoding

# regexes for the different comment types
# BUG FIX: the flags were previously combined with '&' (bitwise AND);
# re.M & re.U == 0, so NO flags were actually applied.  Flags must be
# OR'd together.
scrub_bcomments = re.compile(r'/\*([\s\S]+?)\*/', re.M | re.U)
scrub_lcomments = re.compile(r'//(.+)', re.U | re.M)
scrub_scomments = re.compile(r'#\s*(.+)', re.U | re.M)

# extensions for valid source files
source_exts = { 'c-style': [ 'c', 'cpp', 'cc', 'h', 'hpp' ],
                'script':  [ 'py', 'pl', 'rb' ] }
def get_splits(text, splitLength = 4500):
    """
    Yield successive chunks of text, each at most splitLength characters.

    The Translate API limits the amount of text (about 4500 characters)
    that can be translated in one request, so long input is sent in
    pieces and reassembled by the caller.
    """
    # NOTE: a while loop replaces the old xrange() call so this helper
    # runs under both Python 2 and Python 3; the chunks produced are
    # identical.
    def _chunks():
        start = 0
        while start < len(text):
            yield text[start:start + splitLength]
            start += splitLength
    return _chunks()
def translate(text, target = lang, source = lang_src):
"""
Translate using Googles API
"""
retText = ''
for text in get_splits(text):
if trace: print '[+] translation requested...'
sys.stdout.flush()
resp = translate_client.translate(
text, target_language=target, source_language=source)
try:
retText += resp['translatedText']
except:
retText += text.decode('')
if trace: print '\treceived!'
return retText
### start kyle's code ###
## handle C-style comments
# handles /* \w+ */ comments
def trans_block_comment(comment):
    """
    re.sub callback for C block comments (/* ... */).

    Receives a re.Match object, translates the comment line by line
    (translating per line preserves the original formatting, which the
    translator otherwise eats), and repairs the '/*' and '*/' delimiters
    that come back padded with spaces.
    """
    lines = unicode(comment.group()).split('\n')
    repaired = []
    for line in lines:
        line = translate(line)
        line = line.replace('/ * ', '/* ').replace(' * /', ' */')
        repaired.append(line)
    # here's your stupid translation
    return u'\n'.join(repaired)
# handle // \w+ comments
def trans_line_comment(comment):
trans = unicode(comment.group())
if trace: print trans.encode('utf-8')
trans = trans.lstrip('//')
trans = translate(trans.strip())
comment = u'// %s' % trans
return comment
## handle non-C-style comments
# handle an initial '#', as in perl, python, ruby, shell, ...
def trans_scripting_comment(comment):
    """
    re.sub callback for '#' comments: strips the leading hash marks,
    translates the comment text, and re-attaches a '# ' prefix.
    Shebang lines ('#!...') are passed through untouched.
    """
    text = unicode(comment.group())
    if text.startswith('#!'):
        return text
    body = text.lstrip('#').strip()
    return '# %s' % translate(body)
### processing code ###
# the following functions handle regexes, file tree walking and file I/O
# guess the encoding on a file
# returns a string with the encoding if it is confident in its guess,
# False otherwise
# detection threshhold is confidence required to return an encoding
#
# design note: returns a string instead of globally modifying the encodeas var
# to support concurrency - the memory of duplicating a short string containing
# the encoding is low enough to not cause a performance hit and prevents the
# code from having to involve locking or shared memory.
def guess_encoding(filename, detection_threshold = 0.8, return_dict = False):
if trace: print '[+] attempting to autodetect coding for %s' % filename
try:
f = open(filename, 'rb')
guess = chardet.detect(f.read())
f.close()
except IOError, e:
if trace: print '[!] error on file %s, skipping...' % filename
print '\t(error returned was %s)' % str(e)
if not return_tuple: return False
confidence = '%0.1f' % guess['confidence']
confidence = float(confidence)
if confidence < detection_threshold:
print '[!] too low of a confidence (%f) to guess coding for %s' % (
guess['confidence'],
filename
)
return False
else:
if trace:
print '[+] detected coding %s for file %s (confidence: %0.2f)' % (
guess['encoding'],
filename,
guess['confidence']
)
return guess['encoding'] if not return_dict else {
'encoding': guess['encoding'],
'confidence': guess['confidence'] }
# attempt to guess the dominant encoding of the source files under dir:
# sum the detection confidence per encoding and return the winner
def guess_dir(dir):
    codes = { }
    for (dirp, dirs, files) in os.walk(dir):
        for file in files:
            path = os.path.join(dirp, file)
            if not (is_source(path) or is_script(path)):
                continue
            guess = guess_encoding(path, return_dict=True)
            # BUG FIX: guess_encoding returns False on failure or low
            # confidence; the old code indexed the result unconditionally
            # and crashed with a TypeError.
            if not guess:
                continue
            encoding = guess['encoding']
            codes[encoding] = codes.get(encoding, 0.0) + guess['confidence']
    if not codes:
        # nothing readable found; fall back to the configured default
        # (the old code raised IndexError here)
        return encodeas
    # encoding with the highest accumulated confidence wins
    return max(codes, key=lambda enc: codes[enc])
# translate an individual file
def scan_file(filename):
new_filename = filename + ext
# the reason we use a local variable for the encoding based on either
# the guess_encoding() function or a copy of the encodeas global is
# detailed more in the design note in the comments for guess_encoding -
# the tl;dr is it solves some concurrency issues without incurring any
# major penalties.
if autodetect:
encoding = guess_encoding(filename)
if not encoding:
print '[!] could not reliably determine encoding for %s' % filename
print '\taborting!'
return
else:
encoding = encodeas
try:
reader = codecs.open(filename, 'r', # read old source file
encoding=encoding, errors = 'replace')
ucode = reader.read() # untranslated code
writer = codecs.open(new_filename, 'w', # write translated
encoding=decodeas)
reader.close()
except IOError, e: # abort on IO error
print '[!] error on file %s, skipping...' % filename
print '\t(error returned was %s)' % str(e)
return None
if not ucode: return None
if is_source(filename):
tcode = scrub_bcomments.sub(trans_block_comment, ucode)
tcode = scrub_lcomments.sub(trans_line_comment, tcode)
elif is_script(filename):
tcode = scrub_scomments.sub(trans_scripting_comment, ucode)
writer.write(tcode)
print '[+] translated %s to %s...' % (filename, new_filename)
# walk a directory tree and translate every source/script file in a
# process pool
def scan_dir(dirname):
    global autodetect # used to tweak better file encoding
    global encodeas   # scans
    if autodetect:
        encodeas = guess_dir(dirname)
        autodetect = False
    # collect every file under dirname as a full (joined) path
    file_list = []
    for (dirp, dirs, files) in os.walk(dirname, topdown=True):
        for f in files:
            file_list.append(os.path.join(dirp, f))
    # BUG FIX: the old code re-joined the already-complete paths in
    # file_list with the directory of the *last* os.walk() tuple,
    # producing broken paths (and a NameError when the walk was empty);
    # the entries are already full paths, so just filter them.
    scan_list = [ path for path in file_list
                  if is_source(path) or is_script(path) ]
    pool = multiprocessing.Pool(processes = num_procs)
    pool.map(scan_file, scan_list)
    pool.close()
    pool.join()
# detect c-style comments by file extension
def is_source(filename):
    """Return True if filename has a C-family source extension."""
    # BUG FIX: the old regex left extensionless filenames unchanged, so a
    # file literally named 'c' or 'cpp' was misclassified as source;
    # os.path.splitext yields '' for such names.
    extension = os.path.splitext(filename)[1].lstrip('.')
    return extension in source_exts['c-style']
# detect script-style comments by file extension
def is_script(filename):
    """Return True if filename has a scripting-language extension."""
    # BUG FIX: the old regex left extensionless filenames unchanged, so a
    # file literally named 'py' or 'rb' was misclassified as a script;
    # os.path.splitext yields '' for such names.
    extension = os.path.splitext(filename)[1].lstrip('.')
    return extension in source_exts['script']
##### start main code #####
if __name__ == '__main__':
(opts, args) = getopt.getopt(sys.argv[1:], 's:d:e:o:t')
dir_mode = False
target = None
for (opt, arg) in opts:
if opt == '-s':
dir_mode = False
target = arg
if opt == '-d':
dir_mode = True
target = arg
if opt == '-e':
if not arg == 'auto':
encodeas = arg
else:
autodetect = True
if opt == '-o':
decodeas = arg
if dir_mode:
scan_dir(target)
else:
scan_file(target)