Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
129 changes: 129 additions & 0 deletions connecteurs_logiques.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
import re
import string

# --- Connector category word lists -----------------------------------------
# Entries containing '@' are two-part ("double") connectors stored as
# 'first-half@second-half'; they are matched via doubles_starters below.
addition = ['et', 'de plus', 'puis', 'en outre', 'non seulement@mais encore']

alternative = ['ou', 'soit@soit', 'soit@ou', 'tantôt@tantôt', 'ou@ou', 'ou bien', 'seulement@mais encore',
               "l'un@l'autre", "d'un côté@de l'autre"]

but = ['afin que', 'pour que', 'de peur que', 'en vue de', 'de façon à ce que']

cause = ['car', 'en effet', 'effectivement', 'comme', 'par', 'parce que', 'puisque', 'attendu que', 'vu que',
         'étant donné que', 'grâce à', 'par suite de', 'eu égard à', 'en raison de', 'du fait que', 'dans la mesure où',
         'sous prétexte que']

comparaison = ['comme', 'de même que', 'ainsi que', 'autant que', 'aussi@que', 'si@que', 'de la même façon que',
               'semblablement', 'pareillement', 'plus que', 'moins que', 'non moins que', 'selon que', 'suivant que',
               'comme si']

concession = ['malgré', 'en dépit de', 'quoique', 'bien que', 'alors que', 'quelque soit', 'même si',
              "ce n'est pas que", 'certes', 'bien sûr', 'évidemment', 'il est vrai que', 'toutefois']

conclusion = ['en conclusion', 'pour conclure', 'en guise de conclusion', 'en somme', 'bref', 'ainsi', 'donc',
              'en résumé', 'en un mot', 'par conséquent', 'finalement', 'enfin', 'en définitive']

condition = ['si', 'au cas où', 'à condition que', 'pourvu que', 'à moins que', 'en admettant que', 'pour peu que',
             'à supposer que', 'en supposant que', "dans l'hypothèse où", 'dans le cas où', 'probablement',
             'sans doute', 'apparemment']

# NOTE(review): 'si bien que' is listed twice, and 'tant et' looks truncated
# (perhaps 'tant et si bien que'); harmless for membership tests, but confirm.
consequence = ['donc', 'aussi', 'partant', 'alors', 'ainsi', 'par conséquent', 'si bien que', "d'où",
               'en conséquence', 'conséquemment', 'par suite', "c'est pourquoi", 'de sorte que', 'en sorte que',
               'de façon que', 'de manière que', 'si bien que', 'tant et']

classification = ["d'abord", "tout d'abord", 'en premier lieu', 'premièrement', 'en deuxième lieu', 'deuxièmement',
                  'après', 'ensuite', 'de plus', 'quant à', 'en troisième lieu', 'puis', 'en dernier lieu',
                  'pour conclure', 'enfin']

explication = ['savoir', 'à savoir', "c'est-à-dire", 'soit']

liaison = ['alors', 'ainsi', 'aussi', "d'ailleurs", 'en fait', 'en effet', 'de surcroît', 'de même', 'également',
           'puis', 'ensuite']

opposition = ['mais', 'cependant', 'or', 'en revanche', 'alors que', 'pourtant', 'par contre', 'tandis que',
              'néanmoins', 'au contraire', 'pour sa part', "d'un autre côté", 'en dépit de', 'malgré', 'au lieu de']

restriction = ['cependant', 'toutefois', 'néanmoins', 'pourtant', 'mis à part', 'ne@que', 'en dehors de', 'hormis',
               'à défaut de', 'excepté', 'sauf', 'uniquement', 'simplement']

temps = ['quand', 'lorsque', 'comme', 'avant que', 'après que', 'alors que', 'dès lors que', 'tandis que',
         'depuis que', 'en même temps que', 'pendant que', 'au moment où']

# Parallel lists: connecteurs[i] is the word list for category connecteurs_names[i].
connecteurs = [addition, alternative, but, cause, comparaison, concession, consequence, conclusion, condition,
               classification,
               explication, liaison, opposition, restriction, temps]
connecteurs_names = ['addition', 'alternative', 'but', 'cause', 'comparaison', 'concession', 'consequence',
                     'conclusion', 'condition', 'classification', 'explication', 'liaison', 'opposition',
                     'restriction', 'temps']

# First half -> second half for each two-part connector; detection() reports
# the 'first@second' key only when both halves occur in the sentence.
doubles_starters = {'non seulement': 'mais encore', 'soit': 'soit', 'tantôt': 'tantôt', 'ou': 'ou',
                    'seulement': 'mais encore', "l'un": "l'autre", "d'un côté": "de l'autre", 'aussi': 'que',
                    'si': 'que',
                    'ne': 'que'}

# Sample sentence used when this module is executed as a script.
phrase = "La prolifération de la culture sur brûlis a largement dégradé la forêt ivoirienne alors que le Gabon a plus à craindre de l'ouverture de son couvert forestier à l'exploitation industrielle du bois."


def sentence_to_list(sentence):
    """Split *sentence* into lower-cased tokens with surrounding punctuation removed.

    Returns a two-element list: [sentence.lower(), list of cleaned tokens].

    The previous implementation interpolated string.punctuation directly into
    a regex character class; punctuation contains regex metacharacters
    (']', '\\', '^', '-') so that pattern only worked by accident.
    str.strip with a character set removes the same leading/trailing
    punctuation runs, robustly and without regex.
    """
    words = [word.strip(string.punctuation).lower() for word in sentence.split()]
    return [sentence.lower(), words]


# Lower-cased form of the sample sentence, fed to detection() at the bottom.
sentence = sentence_to_list(phrase)[0]


def test_connecteur(word):
    """Return the names of every connector category that contains *word*."""
    return [name
            for name, category in zip(connecteurs_names, connecteurs)
            if word in category]


# Build the lookup table: connector string -> list of its category names.
# A connector such as 'comme' belongs to several categories at once.
dictionary_connecteurs = {}
for name, category in zip(connecteurs_names, connecteurs):
    for connecteur in category:
        dictionary_connecteurs.setdefault(connecteur, []).append(name)


def detection(text):
    """Return {connector: [category names]} for every connector found in *text*.

    *text* is expected to be a lower-cased sentence. Single-word connectors
    must match a whole token of the sentence; multi-word connectors are
    matched as raw substrings. Two-part connectors are reported under their
    'first@second' key when both halves are present.

    (The parameter was renamed from ``str``, which shadowed the builtin;
    all in-file calls are positional.)
    """
    found_connectors = {}
    # Hoisted: the tokens of the sentence, lower-cased and punctuation-
    # stripped (the original recomputed this on every membership test).
    words = sentence_to_list(text)[1]

    for connecteur, categories in dictionary_connecteurs.items():
        if connecteur in text:
            # A single word must be a whole token to avoid matching inside a
            # longer word; phrases (containing a space) match as substrings.
            if connecteur in words or ' ' in connecteur:
                found_connectors[connecteur] = categories

    for starter, closer in doubles_starters.items():
        # When both halves are identical (e.g. 'soit@soit'), look for the
        # closing half only after removing the first occurrence.
        if starter == closer:
            remainder = text.replace(starter, '', 1)
        else:
            remainder = text
        if starter in text and closer in remainder:
            full_connector = starter + '@' + closer
            found_connectors[full_connector] = dictionary_connecteurs[full_connector]

    # 'ou', 'si' and 'ne' are short enough to be found inside other words by
    # the substring checks above; keep their paired connectors only when the
    # starter is an actual token of the sentence.
    for starter, pair in (('ou', 'ou@ou'), ('si', 'si@que'), ('ne', 'ne@que')):
        if starter not in words and pair in found_connectors:
            del found_connectors[pair]

    return found_connectors


# Script demo: show every connector detected in the sample sentence.
print(detection(sentence))
# print(dictionary_connecteurs)
105 changes: 86 additions & 19 deletions main.py
Original file line number Diff line number Diff line change
@@ -1,27 +1,101 @@
import os

import numpy as np
import matplotlib.pyplot as plt
from nltk.corpus import brown
from linalgo.client import LinalgoClient

# NOTE(review): never commit a live API token to version control — it is a
# security hazard (flagged in review). Read it from the environment instead;
# set LINALGO_TOKEN before running this script.
token = os.environ.get('LINALGO_TOKEN', '')
Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Usually, it's dangerous to commit a valid authentication token on GitHub and is considered a security hazard. You could use an ENV variable here.


# Linalgo annotation-hub client and the annotation task whose documents and
# labels this script turns into a regression dataset.
api_url = 'https://api.linalgo.com/hub'
linalgo_client = LinalgoClient(token=token, api_url=api_url)
task = linalgo_client.get_task('4a2c20e3-64af-4a9f-9fc8-5e703dd7a835')

if __name__ == '__main__':

n = 1000
theta = np.array([0, 0])
x = np.random.uniform(0, 1, (n, 2))
# n = 1000
theta = np.array([1, -0.5])


# x = np.random.uniform(0, 1, (n, 2))
# x[:, 1] = 1
# y = x.dot(theta) + np.random.normal(0, 0.2, n)

# To create a dictionary with the words as keys and the frequencies as definitions
def create_dico():
    """Return {word: frequency} over the Brown corpus, lower-cased.

    Frequencies are expressed as percentages of the total token count.
    (Naming mixes French and English — kept for interface compatibility,
    as flagged in review.)
    """
    wordlist = brown.words()
    # wordlist = opinion_lexicon.words()
    counts = {}
    for word in wordlist:
        word = word.lower()
        # dict.get replaces the original membership-test-then-increment.
        counts[word] = counts.get(word, 0) + 1
    total = len(wordlist)
    return {word: count * 100 / total for word, count in counts.items()}


# To gather documents and annotations from LinHub
dataset = []
n = len(task.documents)

# doc_idx replaces the original loop variable 'word' — naming an integer
# index 'word' was confusing (flagged in review).
for doc_idx in range(n):
    annotations_list = task.documents[doc_idx].annotations
    if len(annotations_list) != 0:
        x = task.documents[doc_idx].content
        # NOTE(review): taking annotations[0] assumes the first annotation is
        # always from the same annotator — confirm this holds for the task.
        y_raw = annotations_list[0]
        y = task.get_name(str(y_raw))
        dataset.append((x, y))
# print(dataset)

# Replace each X by its Brown-corpus frequency and each Y by -1 / +1.
x_formatted = []
y_formatted = []
brown_dict = create_dico()

IK = 'I know this word'
IDK = "I don't know this word"

for word_text, label in dataset:
    # Words absent from the corpus have no frequency and are dropped.
    if word_text in brown_dict:
        x_formatted.append(brown_dict[word_text])
        y_formatted.append(1 if label == IK else -1)

n_effectif = len(x_formatted)
print(n_effectif)
x = np.zeros((n_effectif, 2))
print(x_formatted)
x[:, 0] = x_formatted
x[:, 1] = 1  # intercept column
# The stale synthetic-data line 'y = x.dot(theta) + np.random.normal(0, 0.2, n)'
# was removed: it was dead code (immediately overwritten below) and shape-broken
# (n is the document count while x has n_effectif rows).
y = y_formatted


def f(x, theta):
    """Linear-model prediction: matrix product of the design matrix x with
    the parameter vector theta."""
    return np.matmul(x, theta)


# computes the MRS (mean squared residual) of y and x
def least_square_error(theta):
    """Mean squared residual of the module-level data (x, y) under *theta*.

    The merged diff left two return statements; the first, un-normalised one
    made the mean version unreachable. Keep the latest-commit '/ n' form.
    NOTE(review): n is the document count, not len(x) — confirm they match.
    """
    return sum((y - x.dot(theta)) ** 2) / n


# computes the derivative of the MRS
def least_square_error_derivative(theta):
    """Gradient of least_square_error with respect to *theta*.

    The merged diff left two return statements; keep the latest-commit form
    with the same 1/n scaling as least_square_error so the two stay consistent.
    """
    return -2 * sum((y - f(x, theta))[:, np.newaxis] * x) / n

def gradient_descent(theta0, n_iterations=100, step_size=0.01,
precision=0.1, verbose=False):

def gradient_descent(theta0, n_iterations=100, step_size=0.000001,
precision=0, verbose=False):
current_theta = theta0
for i in range(n_iterations):
derivative = least_square_error_derivative(current_theta)
Expand All @@ -33,11 +107,12 @@ def gradient_descent(theta0, n_iterations=100, step_size=0.01,
break
current_theta = next_theta
return current_theta

# Fit theta by gradient descent from a fixed starting point and report it.
# The diff residue kept both the old call (start [0, 1]) and the new one;
# only the latest-commit version is retained.
theta_estimate = gradient_descent(np.array([0.5, 1]), verbose=True)
# print(f'{theta} VS {theta_estimate}')
print(theta_estimate)

# n = 1000
# alpha = 3
# mu = 0
Expand All @@ -51,7 +126,6 @@ def gradient_descent(theta0, n_iterations=100, step_size=0.01,
# if abs(step) <= precision:
# break


# def depsilon(alpha,beta,data_base,data_size):
# sum = 0
# for j in range(data_size) :
Expand All @@ -60,16 +134,13 @@ def gradient_descent(theta0, n_iterations=100, step_size=0.01,
# sum = 2 * sum
# return sum / data_size



# def err_fun(alpha, beta, data_base, data_size):
# sum = 0
# for i in range(data_size):
# iteration = (data_base[i][1]-alpha*data_base[i][0]-beta)**2
# sum += iteration
# return sum


# error = np.random.normal(mu,sigma)
# y = alpha * x + beta + error
# # y_float = alpha * x_values + beta + error
Expand All @@ -87,15 +158,13 @@ def gradient_descent(theta0, n_iterations=100, step_size=0.01,
# precision = 0.000001 # Desired precision of result
# max_iters = 10000 # Maximum number of iterations


# def err_fun(alpha, beta, data_base, data_size):
# sum = 0
# for i in range(data_size):
# iteration = (data_base[i][1]-alpha*data_base[i][0]-beta)**2
# sum += iteration
# return sum


# # Derivative function
# def depsilon(alpha,beta,data_base,data_size):
# sum = 0
Expand All @@ -105,7 +174,6 @@ def gradient_descent(theta0, n_iterations=100, step_size=0.01,
# sum = 2 * sum
# return sum / data_size


# for _ in range(max_iters):
# current_alpha = next_alpha
# next_alpha = current_alpha - gamma * depsilon(current_alpha, beta, data_base, n)
Expand All @@ -119,7 +187,6 @@ def gradient_descent(theta0, n_iterations=100, step_size=0.01,
# for k in alpha_list:
# error_list.append(err_fun(k, beta, data_base, n))


# print("Minimum at ", next_alpha)

# plt.plot(alpha_list,error_list)
Expand Down
Loading