# wait_simonov/notebook at main · ecocity-coder/wait_simonov · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import re
import pymorphy3
import numpy as np
import matplotlib.pyplot as plt


# Module-level morphological analyzer, created once and reused by the
# `morph.parse(...)` calls further down in this script.
morph = pymorphy3.MorphAnalyzer()

def read_poem(path):
    """Return the non-blank lines of the UTF-8 text file at *path*.

    Empty and whitespace-only lines are skipped. Every kept line has its
    trailing whitespace (including the newline) removed; leading whitespace
    is left intact.
    """
    kept_lines = []
    with open(path, 'r', encoding='utf-8') as poem_file:
        for raw_line in poem_file:
            if not raw_line.strip():
                continue
            kept_lines.append(raw_line.rstrip())
    return kept_lines

# Load the poem as a list of its non-blank lines.
# NOTE(review): the absolute path is hard-coded, so the script only runs
# where this exact file exists — confirm the intended environment.
lines = read_poem('/content/ЖДИ_МЕНЯ.txt')

def get_last_word(line):
    """Return the last Cyrillic word of *line* in lower case.

    A word is a maximal run of Russian letters (including ё/Ё); digits,
    Latin letters and punctuation are ignored. Returns ``None`` when the
    line contains no such word.
    """
    cyrillic_tokens = re.findall(r'[а-яА-ЯёЁ]+', line)
    if not cyrillic_tokens:
        return None
    final_token = cyrillic_tokens[-1]
    return final_token.lower()

# Pair every line-final word with the 1-based number of the line it came
# from. The number is taken from `lines` itself, so a line without any
# Cyrillic word is skipped without shifting the numbering of the lines
# after it. get_last_word() is called once per line.
numbered_last_words = []
for line_no, line in enumerate(lines, start=1):
    word = get_last_word(line)
    if word:
        numbered_last_words.append((line_no, word))

# Kept as a plain list of words for any code that uses `last_words`.
last_words = [word for _, word in numbered_last_words]

# One record per line-final word: the score of its first parse, a
# surprisal value derived from that score, and two ambiguity flags.
results = []
for line_no, word in numbered_last_words:
    parses = morph.parse(word)
    best = parses[0]  # first parse returned by the analyzer

    score = best.score  # a number from ~1 down to ~1e-10

    # The small epsilon keeps log10 finite if the score is ever 0.
    surprisal = -np.log10(score + 1e-12)

    n_parses = len(parses)
    # Distinct parts of speech across all parses; parses without a POS
    # are ignored.
    pos_set = {p.tag.POS for p in parses if p.tag.POS}
    # NOTE: "polysemic" here means the word form has more than one parse;
    # "homonymic" means the parses disagree on the part of speech.
    is_polysemic = n_parses > 1
    is_homonymic = len(pos_set) > 1

    results.append({
        'line_idx': line_no,
        'word': word,
        'score': score,
        'surprisal': surprisal,
        'n_parses': n_parses,
        'is_polysemic': is_polysemic,
        'is_homonymic': is_homonymic,
        'pos_set': pos_set
    })

# Print the per-word results as a fixed-width, left-aligned text table.
header_cells = (
    f"{'Строка':<6}",
    f"{'Слово':<12}",
    f"{'Score':<10}",
    f"{'Surprisal':<10}",
    f"{'Полисемия':<10}",
    f"{'Омонимия':<10}",
)
print(" ".join(header_cells))
print("-" * 75)
for row in results:
    polysemy_flag = 'Да' if row['is_polysemic'] else 'Нет'
    homonymy_flag = 'Да' if row['is_homonymic'] else 'Нет'
    row_cells = (
        f"{row['line_idx']:<6}",
        f"{row['word']:<12}",
        f"{row['score']:<10.2e}",   # scientific notation, 2 decimals
        f"{row['surprisal']:<10.2f}",
        f"{polysemy_flag:<10}",
        f"{homonymy_flag:<10}",
    )
    print(" ".join(row_cells))


# Bar chart of the surprisal value of every line-final word, in line order.
words_plot = [r['word'] for r in results]
surprisal_plot = [r['surprisal'] for r in results]

# Bars are placed at numeric positions and the words are attached as tick
# labels. Passing the words themselves as x would make matplotlib treat them
# as categories, so a word that ends several lines would get a single
# position and its bars would be drawn on top of each other.
bar_positions = np.arange(len(words_plot))

plt.figure(figsize=(14, 5))
plt.bar(bar_positions, surprisal_plot, color='darkred')
plt.xticks(bar_positions, words_plot, rotation=45)
plt.ylabel('Surprisal (–log10(score))')
plt.title('Оценка "неожиданности" последних слов строк (по pymorphy3.score)')
plt.tight_layout()
plt.show()

# For every line with at least two Cyrillic words, report the line when the
# first parse of its last word has a different part of speech than the first
# parse of the word before it. Line numbers are 1-based positions in `lines`.
print("\n Контраст с предыдущим словом (изменение части речи):")
for line_no, line in enumerate(lines, start=1):
    tokens = re.findall(r'[а-яА-ЯёЁ]+', line)
    if len(tokens) >= 2:
        prev_word = tokens[-2].lower()
        final_word = tokens[-1].lower()
        prev_parse = morph.parse(prev_word)[0]
        final_parse = morph.parse(final_word)[0]
        prev_pos = prev_parse.tag.POS
        final_pos = final_parse.tag.POS
        if prev_pos != final_pos:
            print(f"Стр. {line_no}: «{prev_word} ({prev_pos}) → {final_word} ({final_pos})»")

# Summary: collect the words whose surprisal exceeds 8 and the words that
# have more than one parse, then print both lists with a closing remark.
high_surprisal = []
poly_words = []
for entry in results:
    if entry['surprisal'] > 8:
        high_surprisal.append(entry['word'])
    if entry['is_polysemic']:
        poly_words.append(entry['word'])

print(f"\n Выводы:")
print(f"• Высокая 'неожиданность' (surprisal > 8): {high_surprisal}")
print(f"• Полисемичные окончания: {poly_words}")
print("→ Конец строки действительно служит позицией для афористических и семантически насыщенных лексем.")