python/speech to text.py at main · Sodam-Capstone/python · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
from __future__ import print_function
import time
import json
import boto3
import os.path
import requests
from pydub import AudioSegment
from pydub.utils import mediainfo

# Combine audio files (disabled example; kept only as a reference in the string literal below)
'''
sound1 = AudioSegment.from_wav("filename01.wav")
sound2 = AudioSegment.from_wav("filename02.wav")
combined_sounds = sound1 + sound2
combined_sounds.export("joinedFile.wav", format = "wav")
'''

# Pre-process the local audio file so that what gets uploaded is a WAV file
# with a 16000 Hz frame rate.
local_file = "main.wav"
# splitext strips only the final extension, so dotted names and paths with
# dots in a directory component keep their full stem.
local_file_name, local_file_ext = os.path.splitext(local_file)
change_file = local_file_name + ".wav"

sound = AudioSegment.from_file(local_file)

# Export is needed when the container is not WAV *or* when the audio had to
# be resampled; otherwise the resampled audio would only exist in memory and
# the untouched original would be uploaded.
needs_export = local_file_ext.lower() != ".wav"

if sound.frame_rate != 16000:
    sound = sound.set_frame_rate(16000)
    needs_export = True

if needs_export:
    # NOTE: when the input is already a .wav file, change_file is the same
    # path as local_file, so the resampled audio overwrites the input file.
    sound.export(change_file, format="wav")
    local_file = change_file

# Upload the local file to the S3 bucket, using the local file name as the
# object key.
bucket_name = 'speech.to.text'
bucket_file = local_file
s3 = boto3.client('s3')

s3.upload_file(Filename=local_file, Bucket=bucket_name, Key=bucket_file)

# Download the bucket object back to the same local path.
s3.download_file(Bucket=bucket_name, Key=bucket_file, Filename=local_file)

### Transcribe the bucket file to text.
transcribe = boto3.client('transcribe')
# The object key doubles as the transcription job name.
# NOTE(review): Transcribe job names must be unique, so re-running the script
# with the same file name is expected to be rejected -- confirm and, if
# needed, add a unique suffix.
job_name = bucket_file
# Build the URI from bucket_name so it always points at the bucket the file
# was actually uploaded to, instead of repeating the bucket name literally.
job_uri = 's3://' + bucket_name + '/' + job_name

transcribe.start_transcription_job(
    TranscriptionJobName=job_name,
    Media={'MediaFileUri': job_uri},
    MediaFormat='wav',
    LanguageCode='ko-KR',
    Settings={'ShowSpeakerLabels': True, 'MaxSpeakerLabels': 4},
)

# Poll every 10 seconds until the job reports COMPLETED or FAILED.
while True:
    status = transcribe.get_transcription_job(TranscriptionJobName=job_name)
    job_status = status['TranscriptionJob']['TranscriptionJobStatus']
    if job_status in ('COMPLETED', 'FAILED'):
        break
    print("Not ready yet...")
    time.sleep(10)

# Stop here with a clear message when the job failed, rather than letting the
# script continue and fail later on a missing transcript entry.
if job_status == 'FAILED':
    raise RuntimeError(
        "Transcription job '{}' failed: {}".format(
            job_name,
            status['TranscriptionJob'].get('FailureReason', 'unknown reason'),
        )
    )

### Save the transcript to a local file, then load it as JSON.
url = status['TranscriptionJob']['Transcript']['TranscriptFileUri']
r = requests.get(url, allow_redirects=True)
# Fail on an HTTP error status instead of writing an error body to
# origin.json and failing later while parsing it.
r.raise_for_status()

# Use a context manager so the file is flushed and closed before it is
# reopened for reading below.
with open('origin.json', 'wb') as transcript_file:
    transcript_file.write(r.content)

with open("origin.json", "r", encoding="utf-8") as json_file:
    json_data = json.load(json_file)

# Collect the speaker segments. Each segment's 'items' entry is removed in
# place, leaving the remaining fields (start_time, speaker_label, end_time).
result = []

for segment in json_data['results']['speaker_labels']['segments']:
    del segment['items']
    result.append(segment)

# Regroup word-level items into one text per speaker segment. The words are
# stored, space-separated, under each segment's 'result' key.
count = 0       # index of the segment currently being filled
content = ""    # text accumulated for that segment
items = json_data['results']['items']
last_segment = len(result) - 1

for item in items:
    # Items without an 'end_time' (the non-word entries, e.g. punctuation)
    # cannot be placed on the timeline and are skipped. This explicit check
    # replaces a bare `except:` that also hid unrelated errors.
    if 'end_time' not in item or last_segment < 0:
        continue

    word = item['alternatives'][0]['content'] + " "
    fits_segment = float(result[count]['end_time']) >= float(item['end_time'])

    if fits_segment or count == last_segment:
        # Words that end after the final segment stay attached to it, so
        # `count` never runs past the end of `result`.
        content += word
    else:
        # The word belongs to the next segment: store the finished text and
        # start a new one with this word.
        result[count]['result'] = content
        count += 1
        content = word

# Store the text of the segment still open when the items run out. Doing this
# after the loop also covers the case where the final item is a word that
# has just opened a new segment.
if items and result:
    result[count]['result'] = content

# Wrap the segment list and the transcript's 'speakers' value in a single
# dict and write it to data.json.
final_result = {
    "result": result,
    "speakers": json_data['results']['speaker_labels']['speakers'],
}

with open('data.json', 'w', encoding="utf-8") as out_file:
    json.dump(final_result, out_file, indent=4)