-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathsenti_client.py
More file actions
291 lines (243 loc) · 9.91 KB
/
senti_client.py
File metadata and controls
291 lines (243 loc) · 9.91 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
#!/bin/python
'''
Made for python 3.X
assumes sentistrength is available in the current folder,
i.e.: the folder contains 'SentiStrengthCom.jar'.
The language files should be in folders with 2-letter ISO
language codes.
AR = Arabic
CY = Welsh
DE = German
EL = Greek
EN = English
FA = Persian
FR = French
IL = Italian
NL = Dutch
PL = Polish
PT = Portuguese
RU = Russian
SP = Spanish
SW = Swedish
TU = Turkish
Current language sets (except EN) drawn from https://github.com/felipebravom/StaticTwitterSent/tree/master/extra/SentiStrength
Sentistrength java file can be obtained from sentistrength.wlv.ac.uk/ by emailing Professor Thelwall (address on website)
Example use (single-core client):
>>> senti = sentistrength('EN')
>>> res = senti.get_sentiment('I love using sentistrength!')
>>> print(res)
... {'negative': '-1', 'neutral': '1', 'positive': '4'}
Example use (multi-core client):
>>> ms = multisent('EN')
>>> texts = ['This is great!!'] * 10000
>>> res = list(ms.run_stream(texts))
>>> print(res[0])
... {'negative': '-1', 'neutral': '1', 'positive': '4'}
'''
import logging
import os
import socket
import subprocess
import time
import urllib
import urllib.parse

from joblib import Parallel, delayed
# Configure logging at import time and create the logger shared by this module.
logging.basicConfig(level='INFO')
logger = logging.getLogger(__file__)

# The jar is looked up in the current working directory; its absence only
# produces warnings here, nothing is raised.
if 'SentiStrengthCom.jar' not in os.listdir(os.curdir):
    logger.warning("You need 'SentiStrengthCom.jar' to use this wrapper!")
    logger.warning("because this version is not freely available, it was not packaged with this wrapper :-( ")
    logger.warning("get it from http://sentistrength.wlv.ac.uk/ by emailing Professor Thelwall")
class sentistrength():
    '''
    Single-server client for SentiStrength.

    Description
    ---
    Queries a SentiStrength server in 'listen' mode on a local port. When
    nothing accepts connections on that port, 'SentiStrengthCom.jar' is
    launched from the current folder with the language data in
    './<language>/'. A server that is already listening on the port is used
    as-is; the language it was started with is not checked.

    Parameters
    ---
    language: str
        Name of the folder holding the language data, such as 'EN'.
    port: int[default=8181]
        Port the server listens on (and is launched on when needed).

    Examples
    ---
    >>> senti = sentistrength('EN')
    >>> res = senti.get_sentiment('I love using sentistrength!')
    >>> print(res)
    {'negative': '-1', 'neutral': '1', 'positive': '4'}
    '''

    def __init__(self,language, port=8181):
        self.language = language
        # Popen handle of the server launched by this object; stays "" until
        # run_server() actually starts one.
        self.sentistrength = ""
        self.port = port

    def __del__(self):
        self._kill_server()

    def _kill_server(self):
        '''Send SIGTERM (15) to the process group of the server we launched, if any.'''
        # getattr: __del__ also runs when __init__ failed before assigning.
        process = getattr(self, 'sentistrength', None)
        if not process:
            return
        try:
            os.killpg(process.pid, 15)
        except ProcessLookupError:
            # The server already exited; nothing left to signal.
            pass
        # Forget the handle so the same pid is never signalled twice.
        self.sentistrength = ""

    def _connect(self):
        '''Return a new socket connected to the local server on self.port.'''
        # A fresh socket per attempt: reusing a socket after a failed
        # connect() is not portable.
        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        try:
            sock.connect(('127.0.0.1', self.port))
        except Exception:
            sock.close()
            raise
        return sock

    def run_server(self, language):
        '''
        Return a socket connected to the server, launching the server if needed.

        If this object launched a server for a different language, that server
        is terminated first. Raises Exception when the server cannot be
        launched or reached after launching. The caller owns the returned
        socket and must close it.
        '''
        if language != self.language and self.sentistrength:
            logger.warning("wrong language running, trying to switch")
            self._kill_server()
            time.sleep(1)  # give the old server a moment to release the port
        try:
            return self._connect()
        except ConnectionRefusedError:
            logger.info("server not found, trying to launch server")
        try:
            # Argument list without a shell, so the language string is never
            # interpreted by /bin/sh. start_new_session puts the server in
            # its own process group, which is what os.killpg() targets.
            self.sentistrength = subprocess.Popen(
                ["java", "-jar", "SentiStrengthCom.jar", "sentidata",
                 "./%s/" % language, "listen", str(self.port), "trinary"],
                start_new_session=True)
            time.sleep(1)  # wait for the server to start listening
            sock = self._connect()
        except Exception as err:
            raise Exception("unable to start server, is there a process already running? ") from err
        self.language = language
        return sock

    def get_sentiment(self, string_to_code, language="EN"):
        '''
        Return the sentiment scores of `string_to_code` as a dict of strings.

        The text is URL-quoted and sent as an HTTP/1.0 GET request; the
        response is split on whitespace into exactly three fields, returned
        under the keys 'positive', 'negative' and 'neutral' (third field).
        Raises ValueError when the response does not have three fields.

        NOTE(review): `language` defaults to "EN" regardless of the language
        given to the constructor -- pass it explicitly for other languages.
        '''
        url_encoded = urllib.parse.quote(string_to_code)
        request_string = "GET /%s HTTP/1.0 \r\n\r\n" %url_encoded
        sock = self.run_server(language)
        try:
            sock.sendall(request_string.encode('UTF-8'))
            response = sock.recv(4096)
        finally:
            # One connection per request; always release the descriptor.
            sock.close()
        positive, negative, score = response.decode().split()
        return {'positive':positive,'negative':negative,'neutral':score}
class multisent():
    '''
    This is the multicore implementation of the sentistrength wrapper.

    Description
    ---
    The multisent object has a specified language. On the first query to
    'run_batch' or 'run_stream', the object will create a number of
    sentistrengths equal to the specified number of cores. Incoming texts
    are divided round-robin over these instances. Calls to these systems
    are threaded.

    Parameters
    ---
    language: str
        Should be an ISO two-letter designation for the language, such as
        'EN', 'NL' or 'PT'
    startport: int[default=8222]
        This is the first port used for the sentistrength instances. Ports
        are assigned upwards from startport, one per instance; ports on which
        a server already answers are skipped (!)
    cores: int[default=-2]
        Cores as per joblib notation. -1 = equal to available CPUs, lower is
        CPU - cores, i.e. -3 on a 4 CPU system is 2.
    batchsize: int[default=1000]
        When using the run_stream method, the stream is actually cut into
        batches of this size. This is to reduce overhead of the joblib call
        while still enabling bigger-than-memory data streams.

    Examples
    ---
    >>> ms = multisent('EN')
    >>> texts = ['This is great!!'] * 10000
    >>> res = list(ms.run_stream(texts))
    >>> print(res[0])
    {'negative': '-1', 'neutral': '1', 'positive': '4'}
    '''

    def __init__(self, language, startport=8222, cores=-2, batchsize=1000):
        self.language = language
        self.cores = cores
        # One dict per running server: 'instance' (Popen), 'pid', 'language', 'port'.
        self.instances = []
        self.status = "initialized"
        self.startport = startport
        self.batchsize = batchsize

    def __del__(self):
        # __del__ also runs when __init__ failed before 'instances' existed.
        if hasattr(self, 'instances'):
            self.stop_all()

    def _top_port(self):
        '''Return the highest port in use by our instances, or startport-1 if none.'''
        return max([self.startport - 1] + [instance['port'] for instance in self.instances])

    def _server_count(self):
        '''Translate self.cores (joblib notation) into a number of servers.'''
        if self.cores < 0:
            # os.cpu_count() may return None; never go below one server.
            return max(1, (os.cpu_count() or 1) + self.cores + 1)
        return self.cores

    def _kill_group(self, pid):
        '''SIGTERM (15) the process group `pid`; return False if it no longer exists.'''
        try:
            os.killpg(pid, 15)
        except ProcessLookupError:
            return False
        return True

    def get_status(self):
        '''Refresh and return the status string of this object.'''
        running = len(self.instances)
        if running:
            self.status = 'Running {} instances'.format(running)
        elif self.status != 'initialized':
            self.status = 'stopped'
        return self.status

    def check_instances(self):
        '''Print one line per known instance saying whether it answers a test query.'''
        if not self.instances:
            print('No instances to check')
        for instance in self.instances:
            port = instance.get('port','UNKNOWN')
            pid = instance.get('pid','UNKNOWN')
            works = "WORKS" if check_exists(instance['port']) else "FAILED"
            print("Instance {pid:5} at port {port:5} status {works:8}".format(pid=pid, port=port, works=works))

    def start_server(self, port=None, attempts=5):
        '''
        Launch one server and register it in self.instances.

        port defaults to one above the highest port currently in use. Ports
        on which a server already answers are skipped. After launching, the
        port is polled once per second, at most `attempts` times. Returns
        True when the server answers, False otherwise (the launched process
        is then terminated again).
        '''
        if not port:
            port = self._top_port()+1
        while check_exists(port):
            logger.info("server at %s already exists!", port)
            port += 1
        # Argument list without a shell, so the language string is never
        # interpreted by /bin/sh. start_new_session gives the server its own
        # process group, which is what os.killpg() targets.
        process = subprocess.Popen(
            ["java", "-jar", "SentiStrengthCom.jar", "sentidata",
             "./%s/" % self.language, "listen", str(port), "trinary"],
            start_new_session=True)
        while not check_exists(port):
            time.sleep(1)
            attempts -= 1
            if attempts <= 0:
                logger.warning('failed to start %s server at port %s', self.language, port)
                # Do not leave an untracked process behind.
                self._kill_group(process.pid)
                return False
        instance = {'instance':process, 'pid':process.pid, 'language':self.language,'port':port}
        logger.info("started instance %s at port %s", instance['pid'], instance['port'])
        self.instances.append(instance)
        return True

    def stop_server(self, port=None,pid=None):
        '''
        Stop one instance, selected by port or by pid (port wins if both are
        given; the first known instance if neither is given).

        Returns True when the instance was stopped and removed from
        self.instances, False when it was not found or still answers on its
        port one second after being signalled.
        '''
        if port and pid:
            logger.warning("this function requires EITHER a port OR a pid, ignores pid if both")
        if port:
            matches = [instance for instance in self.instances if instance['port']==port]
        elif pid:
            matches = [instance for instance in self.instances if instance['pid']==pid]
        else:
            matches = self.instances
        if not matches:
            logger.warning("Instance not found!")
            return False
        instance = matches[0]
        if not self._kill_group(instance['instance'].pid):
            # The process is already gone, so the entry is stale: drop it.
            logger.info('Instance %s at port %s had already exited', instance['pid'], instance['port'])
            self.instances.remove(instance)
            return True
        time.sleep(1)  # give the server time to shut down
        if check_exists(instance['port']):
            logger.warning('Unable to stop %s instance running at %s!!', instance['pid'], instance['port'])
            return False
        logger.info('Stopped %s instance at port %s', instance['pid'], instance['port'])
        self.instances.remove(instance)
        return True

    def _loop_over(self, looped_iterable, fixed_iterable):
        '''Yield (looped_item, item) pairs, cycling through looped_iterable as often as needed.'''
        position = 0
        for item in fixed_iterable:
            yield looped_iterable[position], item
            position = (position + 1) % len(looped_iterable)

    def _batch_up(self, iterable):
        '''Yield lists of at most self.batchsize consecutive items from iterable.'''
        batch = []
        for item in iterable:
            batch.append(item)
            if len(batch) >= self.batchsize:
                yield batch
                batch = []
        if batch:
            yield batch

    def start_all(self):
        '''Start as many servers as self.cores asks for and refresh the status.'''
        no_servers = self._server_count()
        logger.info('Starting %s servers in %s', no_servers, self.language)
        for _ in range(no_servers):
            self.start_server()
        self.get_status()

    def stop_all(self):
        '''Try once to stop every known instance.'''
        # Iterate over a snapshot: stop_server() removes entries, and an
        # instance that refuses to stop must not keep us looping forever.
        for instance in list(self.instances):
            self.stop_server(pid=instance['pid'])

    def run_batch(self, texts):
        '''
        Query all texts and return the list of result dicts, in input order.

        Servers are started first when none are registered. Raises
        RuntimeError when no server is available afterwards.
        '''
        if not self.instances:
            logger.info('No servers found, starting servers')
            self.start_all()
        ports = [instance['port'] for instance in self.instances]
        if not ports:
            raise RuntimeError("no sentistrength servers available")
        # Never more threads than servers, and always at least one.
        workers = max(1, min(self._server_count(), len(ports)))
        jobs = (delayed(query_instance)(port, text) for port, text in self._loop_over(ports, texts))
        return Parallel(n_jobs=workers, backend='threading')(jobs)

    def run_stream(self, texts):
        '''Lazily yield one result dict per text, processing batchsize texts at a time.'''
        for batch in self._batch_up(texts):
            yield from self.run_batch(batch)
def query_instance(port, string_to_code):
    '''
    Send one text to the sentistrength server on local `port` and return its scores.

    The text is URL-quoted and sent as an HTTP/1.0 GET request. The response
    is split on whitespace into exactly three fields, returned as strings
    under the keys 'positive', 'negative' and 'neutral' (third field).

    Raises Exception("unable to reach server") when the connection fails and
    ValueError when the response does not consist of three fields.
    '''
    url_encoded = urllib.parse.quote(string_to_code)
    request_string = "GET /%s HTTP/1.0 \r\n\r\n" %url_encoded
    # The with-block closes the socket on every path, including errors.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
        try:
            sock.connect(('127.0.0.1',port))
        except Exception as err:
            raise Exception("unable to reach server") from err
        sock.sendall(request_string.encode('UTF-8'))
        response = sock.recv(4096)
    fields = response.decode().split()
    if len(fields) != 3:
        raise ValueError("unexpected response from server: %r" % (response,))
    positive, negative, score = fields
    return {'positive':positive,'negative':negative,'neutral':score}
def check_exists(port):
    '''
    Return True when a test query to the server on `port` succeeds, else False.

    Any ordinary error raised by the query counts as "no server here".
    KeyboardInterrupt and SystemExit are deliberately not swallowed, so a
    polling loop around this function can still be interrupted.
    '''
    try:
        query_instance(port,'test string')
    except Exception:
        return False
    return True