-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathparse.py
More file actions
executable file
·79 lines (66 loc) · 2.2 KB
/
parse.py
File metadata and controls
executable file
·79 lines (66 loc) · 2.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
#! /usr/bin/python
import sys
import re
import urllib2
import string
class entry(object):
    """One search query taken from a log, with its timestamp and origin tag.

    Instance attributes (stored exactly as passed in):
      query -- the query text
      time  -- the timestamp text captured from the log line
      owner -- origin tag; this file uses "U" (default) and "T"
    """

    def __init__(self, q="", t="", o="U"):
        self.query = q
        self.time = t
        self.owner = o

    def __repr__(self):
        # %-formatting (instead of str.join) keeps repr() working even when a
        # field holds a non-string value such as None or an int.
        return '%s, %s, %s' % (self.query, self.owner, self.time)
def cleanup(text):
    """Return *text* with every character listed in string.punctuation removed."""
    punctuation = frozenset(string.punctuation)
    kept = [char for char in text if char not in punctuation]
    return ''.join(kept)
def extra_format(text):
    """Decode a URL-encoded query string into plain text.

    In a URL query component '+' stands for a space while '%2B' stands for a
    literal plus sign.  The '+' -> ' ' translation therefore has to happen
    before percent-decoding (which is what unquote_plus does); doing it
    afterwards would also turn every decoded '%2B' into a space.

    text -- the raw, still-encoded query string
    Returns the decoded string.
    """
    # Imported locally with a fallback so the function runs on Python 2 and 3.
    try:
        from urllib import unquote_plus  # Python 2
    except ImportError:
        from urllib.parse import unquote_plus  # Python 3
    text = unquote_plus(text)
    # NOTE(review): as written this replaces a space with a space (a no-op);
    # presumably it was meant to collapse double spaces -- TODO confirm.
    text = text.replace(" ", " ")
    return text
def remove_redundancy1(queries):
    """Collapse per-line match lists into a dict, one value per key.

    queries -- iterable whose elements are non-empty sequences of pairs
               (e.g. the lists returned by re.findall with two groups).
               Only the first pair of each element is used: pair[0] becomes
               the key and pair[1] the value.

    Returns a dict.  When several elements share the same key, the element
    that appears last in *queries* supplies the value.
    """
    collection = {}
    # Walk backwards and store only unseen keys, so the value kept for a key
    # is the one from its last occurrence in the input.
    for matches in reversed(list(queries)):
        key = matches[0][0]
        if key not in collection:
            collection[key] = matches[0][1]
    return collection
# NOTE: queries with the same string issued at different times are collapsed
# into a single entry. This is a known limitation, accepted for simplicity.
def remove_redundancy2(collection):
    """Invert *collection* (key -> value) into a value -> key dict.

    collection -- dict whose values are hashable

    Returns a new dict mapping each distinct value to one of its keys.  When
    several keys share a value, the key met first while iterating
    *collection* is kept and the others are dropped.
    """
    collection2 = {}
    for key in collection:
        # setdefault only stores when the value has not been seen yet,
        # which gives the "first key wins" behaviour.
        collection2.setdefault(collection[key], key)
    return collection2
def parse_proxy_log(fname):
    """Extract the distinct search queries recorded in a proxy log file.

    Every line is searched for "<digits:digits:digits>-...&q=<query>&";
    lines without a match are ignored.  The matches are de-duplicated first
    by timestamp (remove_redundancy1) and then by query string
    (remove_redundancy2).

    fname -- path of the proxy log file

    Returns a list of entry objects with owner "U", whose query has been
    decoded by extra_format and whose time is the captured timestamp text.
    Raises IOError/OSError if the file cannot be opened or read.
    """
    pattern = re.compile(r"(\d+:\d+:\d+)-.*&q=(.*?)&")
    # 'with' guarantees the file is closed even if reading fails part-way.
    with open(fname, 'r') as filp:
        per_line = [pattern.findall(line) for line in filp]
    # Built as a real list (not a lazy filter object) because
    # remove_redundancy1 needs to walk it in reverse.
    matched = [found for found in per_line if found]
    time_by_query = remove_redundancy2(remove_redundancy1(matched))
    return [entry(extra_format(query), time_by_query[query], "U")
            for query in time_by_query]
def parse_tmn_log(fname):
    """Extract the queries recorded in a TMN log file.

    Every line is searched for "query='<query>' ... <dd:dd:dd>"; lines
    without a match are ignored.  Only the first match of a line is used.

    fname -- path of the TMN log file

    Returns a list of entry objects with owner "T", in file order, holding
    the captured query text and timestamp text unchanged.
    Raises IOError/OSError if the file cannot be opened or read.
    """
    pattern = re.compile(r"query='(.*?)'.*(\d\d:\d\d:\d\d)")
    # 'with' guarantees the file is closed even if reading fails part-way.
    with open(fname, 'r') as filp:
        per_line = [pattern.findall(line) for line in filp]
    return [entry(found[0][0], found[0][1], "T")
            for found in per_line if found]
def _main(argv):
    """Tag proxy-log queries that match a TMN-log query, then print them all.

    argv[1] -- path of the proxy log
    argv[2] -- path of the TMN log
    argv[3] -- optional '-d' to also print the intermediate results

    Prints one "query, owner, time" line per proxy-log entry; the query is
    passed through cleanup() first.  Exits with status 2 and a usage message
    when a log path is missing.
    """
    if len(argv) < 3:
        sys.stderr.write("usage: %s PROXY_LOG TMN_LOG [-d]\n" % argv[0])
        sys.exit(2)
    debug = len(argv) > 3 and argv[3] == '-d'

    # Single-argument print(...) behaves the same under the Python 2 print
    # statement and the Python 3 print function.
    proxy_qs = parse_proxy_log(argv[1])
    if debug:
        print("extracted from the proxy log ----------------")
        print(proxy_qs)

    tmn_qs = parse_tmn_log(argv[2])
    if debug:
        print("extracted from TMN log ----------------")
        print(tmn_qs)

    for tmn_entry in tmn_qs:
        for proxy_entry in proxy_qs:
            # time[3:] drops the first three characters of the timestamp, so
            # for "HH:MM:SS" only minutes and seconds are compared.
            # NOTE(review): presumably this tolerates an hour offset between
            # the two logs -- confirm.
            if (proxy_entry.query == tmn_entry.query
                    or proxy_entry.time[3:] == tmn_entry.time[3:]):
                proxy_entry.owner = "T"

    if debug:
        print("After merging ............................")
    for item in proxy_qs:
        print(cleanup(item.query) + ", " + item.owner + ", " + item.time)


if __name__ == "__main__":
    _main(sys.argv)