-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathnotebook
More file actions
82 lines (64 loc) · 2.8 KB
/
notebook
File metadata and controls
82 lines (64 loc) · 2.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import re
import pymorphy3
import numpy as np
import matplotlib.pyplot as plt
# Module-level morphological analyzer, created once and reused by every
# `morph.parse(...)` call in the analysis below.
morph = pymorphy3.MorphAnalyzer()
def read_poem(path):
    """Return the non-blank lines of a UTF-8 text file.

    Args:
        path: Location of the text file; passed straight to ``open``.

    Returns:
        A list of lines in file order. Whitespace-only lines are dropped,
        trailing whitespace (including the newline) is stripped, and
        leading whitespace is kept.
    """
    # 'utf-8-sig' reads plain UTF-8 unchanged but also drops a leading
    # byte-order mark, which would otherwise stick to the first line.
    with open(path, 'r', encoding='utf-8-sig') as f:
        return [line.rstrip() for line in f if line.strip()]
# Load the poem's non-blank lines from a hard-coded absolute path.
# NOTE(review): '/content/' looks like a Google Colab working directory —
# confirm the location before running this anywhere else.
lines = read_poem('/content/ЖДИ_МЕНЯ.txt')
# Compiled once: a "word" is a maximal run of Russian Cyrillic letters
# (including ё/Ё). Digits, Latin letters, hyphens and punctuation all act
# as separators.
_CYRILLIC_WORD_RE = re.compile(r'[а-яА-ЯёЁ]+')


def get_last_word(line):
    """Return the last Cyrillic word of *line* in lower case.

    Args:
        line: A single line of text.

    Returns:
        The final run of Cyrillic letters, lower-cased, or ``None`` when the
        line contains no Cyrillic letters at all. Because the hyphen is not
        part of a word, a hyphenated form such as ``кто-то`` yields only its
        last segment (``то``).
    """
    words = _CYRILLIC_WORD_RE.findall(line)
    return words[-1].lower() if words else None
# ---------------------------------------------------------------------------
# 1. Morphological profile of the last word of every line
# ---------------------------------------------------------------------------
# Pair each last word with its 1-based position in `lines`, so the numbers
# shown in the table match the ones printed in the contrast section even
# when a line without any Cyrillic word is skipped. Each line is scanned
# only once.
numbered_last_words = [
    (line_no, last_word)
    for line_no, last_word in enumerate(map(get_last_word, lines), start=1)
    if last_word
]
last_words = [word for _, word in numbered_last_words]

results = []
for line_no, word in numbered_last_words:
    parses = morph.parse(word)
    best = parses[0]  # the first parse is treated as the preferred one
    # Original author's note: the score is a number from ~1 down to ~1e-10.
    score = best.score
    # The 1e-12 offset keeps log10 finite should the score be exactly 0.
    surprisal = -np.log10(score + 1e-12)
    n_parses = len(parses)
    # Distinct parts of speech over all parses; parses without a POS are ignored.
    pos_set = {p.tag.POS for p in parses if p.tag.POS}
    # "Polysemic" here means only that the analyzer returned several parses.
    is_polysemic = n_parses > 1
    # "Homonymic" here means that the parses span more than one part of speech.
    is_homonymic = len(pos_set) > 1
    results.append({
        'line_idx': line_no,
        'word': word,
        'score': score,
        'surprisal': surprisal,
        'n_parses': n_parses,
        'is_polysemic': is_polysemic,
        'is_homonymic': is_homonymic,
        'pos_set': pos_set,
    })

# ---------------------------------------------------------------------------
# 2. Summary table
# ---------------------------------------------------------------------------
print(f"{'Строка':<6} {'Слово':<12} {'Score':<10} {'Surprisal':<10} {'Полисемия':<10} {'Омонимия':<10}")
print("-" * 75)
for r in results:
    print(f"{r['line_idx']:<6} {r['word']:<12} {r['score']:<10.2e} {r['surprisal']:<10.2f} "
          f"{'Да' if r['is_polysemic'] else 'Нет':<10} {'Да' if r['is_homonymic'] else 'Нет':<10}")

# ---------------------------------------------------------------------------
# 3. Bar chart of surprisal, one bar per line
# ---------------------------------------------------------------------------
words_plot = [r['word'] for r in results]
surprisal_plot = [r['surprisal'] for r in results]

# Plot against numeric positions and attach the words as tick labels.
# Passing the words themselves as x values would make matplotlib treat them
# as categories, so a word that ends several lines would get a single slot
# and its bars would be drawn on top of each other.
bar_positions = np.arange(len(results))
plt.figure(figsize=(14, 5))
plt.bar(bar_positions, surprisal_plot, color='darkred')
plt.xticks(bar_positions, words_plot, rotation=45)
plt.ylabel('Surprisal (–log10(score))')
plt.title('Оценка "неожиданности" последних слов строк (по pymorphy3.score)')
plt.tight_layout()
plt.show()

# ---------------------------------------------------------------------------
# 4. Part-of-speech contrast between the last two words of each line
# ---------------------------------------------------------------------------
print("\n Контраст с предыдущим словом (изменение части речи):")
cyrillic_word = re.compile(r'[а-яА-ЯёЁ]+')  # compiled once, outside the loop
for line_no, line in enumerate(lines, start=1):
    words = cyrillic_word.findall(line)
    if len(words) < 2:
        # A single-word (or wordless) line has no preceding word to compare.
        continue
    w1, w2 = words[-2].lower(), words[-1].lower()
    p1 = morph.parse(w1)[0]
    p2 = morph.parse(w2)[0]
    if p1.tag.POS != p2.tag.POS:
        print(f"Стр. {line_no}: «{w1} ({p1.tag.POS}) → {w2} ({p2.tag.POS})»")

# ---------------------------------------------------------------------------
# 5. Summary
# ---------------------------------------------------------------------------
# NOTE(review): surprisal > 8 requires a top-parse score below 1e-8; if the
# score is a per-word parse probability this filter may never fire —
# confirm the threshold against real data.
high_surprisal = [r['word'] for r in results if r['surprisal'] > 8]
poly_words = [r['word'] for r in results if r['is_polysemic']]
print("\n Выводы:")
print(f"• Высокая 'неожиданность' (surprisal > 8): {high_surprisal}")
print(f"• Полисемичные окончания: {poly_words}")
# The closing statement is printed unconditionally, whatever the lists contain.
print("→ Конец строки действительно служит позицией для афористических и семантически насыщенных лексем.")