# split-speech/test.py at master · jpawz/split-speech · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
"""
sample.mp3 - speech generated by the ttsmp3.com site from the first
four sentences of the Zen of Python.
The length of sample.mp3 is 7628 ms.
"""

import os
import pathlib
import unittest
from unittest.mock import MagicMock, patch

from pydub import AudioSegment
from pydub.silence import detect_silence

from split_speech import SoundFile


class TestSplitSpeech(unittest.TestCase):
    """Exercise the basic SoundFile workflow against ./test_data/sample.mp3.

    The expected chunk lengths are derived from the silence boundaries
    listed below; every value is in milliseconds.
    """

    # Detected silences: [[1407, 1912], [3397, 3945], [5426, 5876]]
    # all values are in milliseconds
    begining_of_sample = 0
    end_of_sample = 7628
    first_silence_start = 1407
    first_silence_end = 1912
    second_silence_start = 3397
    second_silence_end = 3945
    third_silence_start = 5426
    third_silence_end = 5876
    # A speech chunk spans from the end of one silence (or the beginning of
    # the sample) to the start of the next silence (or the end of the sample).
    first_chunk_length = first_silence_start - begining_of_sample
    second_chunk_length = second_silence_start - first_silence_end
    third_chunk_length = third_silence_start - second_silence_end
    fourth_chunk_length = end_of_sample - third_silence_end

    def setUp(self):
        """Load the sample and record where the export test writes its file."""
        self.sample = SoundFile("./test_data/sample.mp3")
        self.output_file_name = "sample_ext.mp3"
        # Relative to the current working directory, same as "./<name>".
        self.path_to_output_file = pathlib.Path(self.output_file_name)

    def tearDown(self):
        """Remove the exported file, if a test created one."""
        self.path_to_output_file.unlink(missing_ok=True)

    def test_should_detect_three_silences(self):
        """
        Test if finds three silences.
        """
        number_of_silences = 3
        silences = self.sample.detect_silences_manually()

        self.assertEqual(
            len(silences), number_of_silences,
            f"The sample contains four sentences, so there should be {number_of_silences} silences between them."
        )

    def test_get_correct_chunk_length(self):
        """
        Test if gets the correct length of speech (not silence). After the chunk
        will be added some length of silence.
        """
        # The return value is not needed here; the call is kept because it
        # precedes generate_speech_chunks() in every test of this workflow
        # (presumably it prepares state on the sample - verify in SoundFile).
        self.sample.detect_silences_manually()
        chunks = self.sample.generate_speech_chunks()

        expected_lengths = (
            (self.first_chunk_length, "Incorrect first chunk length"),
            (self.second_chunk_length, "Incorrect second chunk length"),
            (self.third_chunk_length, "Incorrect third chunk length"),
            (self.fourth_chunk_length, "Incorrect fourth chunk length"),
        )

        # subTest keeps checking the remaining chunks after a mismatch, so a
        # single run reports every wrong length instead of only the first.
        for index, (expected_length, message) in enumerate(expected_lengths):
            with self.subTest(chunk=index):
                chunk = chunks[index]
                self.assertEqual(expected_length, chunk[1] - chunk[0],
                                 message)

    def test_resulting_audio_have_proper_length(self):
        """
        Test if resulting audio have proper length. It means test if it's
        extended by a specified percentage of speech length.
        """
        all_chunks_length = (self.first_chunk_length +
                             self.second_chunk_length +
                             self.third_chunk_length +
                             self.fourth_chunk_length)
        cases = (
            (100, "Resulting audio extended by 100% have incorrect length"),
            (200, "Resulting audio extended by 200% have incorrect length"),
        )

        self.sample.detect_silences_manually()
        self.sample.generate_speech_chunks()

        # The cases run in order on the same sample, exactly as two
        # consecutive extend_silences() calls would.
        for percentage, message in cases:
            expected_length = (all_chunks_length * percentage /
                               100) + all_chunks_length
            self.sample.extend_silences(percentage)
            self.assertEqual(len(self.sample.resulting_sound),
                             expected_length, message)

    def test_exports_resulting_sound(self):
        """
        Test if resulting sound is exported to file.
        """
        self.sample.detect_silences_manually()
        self.sample.generate_speech_chunks()
        self.sample.extend_silences()

        self.sample.write_resulting_file(self.output_file_name)

        # Comparing (path, exists) pairs rather than a bare boolean makes the
        # failure diff show which path was looked up.
        output_path = str(self.path_to_output_file)
        actual = (output_path, self.path_to_output_file.is_file())
        expected = (output_path, True)
        self.assertEqual(actual, expected, "Can't find exported file.")


class TestSpecialCases(unittest.TestCase):
    """Edge cases: very short sentences, leading/trailing silence, long sentences."""

    def test_ignore_too_short_speech(self):
        """
        A sentence shorter than the minimum length must not become a piece
        of its own. The sample holds four sentences, one of them only
        1053ms long, so three pieces are expected.
        """
        min_length_ms = 1200
        expected_pieces = 3
        sound = SoundFile("./test_data/sample_with_short_sentence.mp3")

        sound.detect_silences_manually()
        pieces = sound.generate_speech_chunks(
            minimum_sentence_length=min_length_ms)

        self.assertEqual(
            len(pieces), expected_pieces,
            f"There should be {expected_pieces} sentences with at least {min_length_ms}ms length detected."
        )

    def test_ignore_leading_silence(self):
        """
        Silence at the beginning and at the end of the audio should be
        ignored. The test data holds three sentences, which leaves two
        silences between them.
        """
        sound = SoundFile("./test_data/sample_lead_trail_sil.mp3")

        inner_silences = sound.detect_silences_manually()

        self.assertEqual(
            len(inner_silences), 2,
            "Without leading and trailing, there are two silences in the sample."
        )

    def test_ignore_too_long_speech_pieces(self):
        """
        Sentences above the maximum length should not be extended. The
        sample holds two sentences: the first 5641ms long, the second
        2997ms long.
        """
        sentence_lengths = (5641, 2997)
        # Length the result would have if both sentences had been extended.
        length_if_both_extended = sum(sentence_lengths) * 2
        sound = SoundFile("./test_data/sample_with_too_long_sent.mp3")

        sound.detect_silences_manually()
        sound.generate_speech_chunks()
        sound.extend_silences(maximum_sentence_length=5000)

        self.assertLess(len(sound.resulting_sound), length_if_both_extended)


class TestAutomode(unittest.TestCase):
    """Tests for the automatic-mode helpers, with the pydub entry points mocked."""

    @patch("split_speech.AudioSegment")
    def test_get_20_seconds(self, AudioSegment):
        """
        Check that a 20s sample is taken: a 50 000-element stand-in for the
        decoded audio should yield 20 000 elements.
        """
        fake_audio = [*range(50_000)]
        AudioSegment.from_mp3.return_value = fake_audio

        middle_part = SoundFile("mp3_file").get_20s_from_the_middle()

        self.assertEqual(len(middle_part), 20_000)

    @patch("split_speech.detect_silence")
    def test_find_threshold_45_in_sample(self, detect_silence):
        """
        find_threshold_in_sample should settle on a threshold of -45 when it
        starts from -41, and it should get there in three detection steps.
        """
        # Number of silences reported by each successive detect_silence call:
        # too few, too many, the accepted count, and one spare result that
        # must stay unconsumed.
        silences_per_call = (1, 20, 10, 12)
        detect_silence.side_effect = tuple(
            list(range(count)) for count in silences_per_call)
        start_threshold = -41
        dummy_sample = list(range(20))

        found = SoundFile.find_threshold_in_sample(start_threshold,
                                                   dummy_sample)

        self.assertEqual(3, detect_silence.call_count,
                         "It should be called 3 times")
        self.assertEqual(-45, found,
                         "Detected threshold should equal to -45")


# Run the whole suite when this file is executed directly.
if __name__ == "__main__":
    unittest.main()