-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathmake_big_corpus.sh
More file actions
executable file
·101 lines (87 loc) · 4.21 KB
/
make_big_corpus.sh
File metadata and controls
executable file
·101 lines (87 loc) · 4.21 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
#!/bin/bash
# this script takes as input, for $corpus, the folder containing the daylong recordings
# and the transcriptions in eaf format in a folder raw_$corpus. It reads the transcriptions
# in the eaf files and cuts the wav to only keep the transcribed part, and convert the transcribed
# part into rttm with the same names as the cut wavs.
# take the name of the corpus as input and define the hard coded paths - they shall not change!
corpus=$1 # should be chosen between BER, WAR, SOD and CAS
working_dir=/DATA2T/aclew_data_update
daylong_dir=/DATA2T/aclew_daylong_data
temp_dir=$working_dir/temp # create temp dir to store wav. will be removed at end of script
# Define function to check if all small wavs have an rttm. If not, create an empty one.
create_empty_rttm() {
# check the wav files and the talker_roles to see if the rttm exists
# if it doesn't, create an empty rttm file.
wav=$1
rttms=$2
for fin in `ls $wav/*.wav`; do
basename=$(basename $fin .wav)
if [[ ! -f $rttms/${basename}.rttm ]]; then
touch $rttms/${basename}.rttm
fi
done
}
## Treat BER WAR the "simple way", add parameters to take into account the format of CAS and SOD/SOD2
#if [[ $corpus == 'BER' ]] || [[ $corpus == 'WAR' ]] || [[ $corpus == 'CAS' ]]; then
if [[ $corpus != 'SOD2' ]] ; then
mkdir -p $temp_dir/$corpus/SAD
for wav in `ls $daylong_dir/$corpus/*.wav`; do
base=$(basename $wav .wav)
eaf=raw_$corpus/${base}.eaf
if [[ $corpus == 'CAS' ]]; then
python $working_dir/adjust_timestamps.py $working_dir/github_data/$eaf $wav $temp_dir --CAS
else
python $working_dir/adjust_timestamps.py $working_dir/github_data/$eaf $wav $temp_dir 2>&1
fi
# now remove old wav and transcriptions:
done
rm $working_dir/ACLEW_data/databrary_ACLEW/wavs/${corpus}_*wav $working_dir/ACLEW_data/databrary_ACLEW/talker_role/${corpus}_*rttm
mv $temp_dir/*.rttm $working_dir/ACLEW_data/databrary_ACLEW/talker_role/
mv $temp_dir/*.wav $working_dir/ACLEW_data/databrary_ACLEW/wavs/
# create empty rttms when the wav has no transcription (nobody's talking)'
#create_empty_rttm $corpus/treated $corpus/treated/talker_role
# Now create the SAD
rm $working_dir/ACLEW_data/databrary_ACLEW/SAD/${corpus}_*rttm
python remove_overlap_rttm.py $working_dir/ACLEW_data/databrary_ACLEW/talker_role/ $working_dir/ACLEW_data/databrary_ACLEW/SAD
fi
# SOD : the eaf have the same name as the wav, but they added '-$name' where $name is the initials of the annotator
if [[ $corpus == 'SOD2' ]]; then
echo "treating SOD" 2>&1
corpus=SOD
mkdir -p $temp_dir/$corpus/SAD
for wav in `ls $daylong_dir/$corpus/*.wav`; do
base=$(basename $wav .wav)
eaf=raw_$corpus/${base}-TS.eaf
if [ ! -f $working_dir/github_data/$eaf ]; then
eaf=raw_$corpus/${base}-JK.eaf
fi
python $working_dir/adjust_timestamps.py $working_dir/github_data/$eaf $wav $temp_dir 2>&1
done
mv $temp_dir/*.rttm $working_dir/ACLEW_data/databrary_ACLEW/talker_role/
mv $temp_dir/*.wav $working_dir/ACLEW_data/databrary_ACLEW/wavs/
# Now create the SAD
python remove_overlap_rttm.py $working_dir/ACLEW_data/databrary_ACLEW/talker_role/ $working_dir/ACLEW_data/databrary_ACLEW/SAD
fi
# create empty rttms when the wav has no transcription (nobody's talking)'
#create_empty_rttm $corpus/treated $corpus/treated/talker_role
# Now create the SAD
#python remove_overlap_rttm.py $corpus/treated/talker_role $corpus/treated/SAD
## CAS : the eaf don't have the 'on_off' tier to encode the segments, instead it's called "code" - this is taken
## into account in adjust_timestamps.py with the --CAS parameter
#if [[ $corpus == 'CAS' ]]; then
# #convert2wav $corpus
# mkdir -p $corpus/treated/SAD
# for wav in `ls $corpus/*.wav`; do
# base=$(basename $wav .wav)
# eaf=$corpus/raw_$corpus/${base}.eaf
# python $working_dir/adjust_timestamps.py $eaf $wav --CAS
# done
#fi
## create empty rttms when the wav has no transcription (nobody's talking)'
#
#create_empty_rttm $corpus/treated $corpus/treated/talker_role
#
## Now create the SAD
#python remove_overlap_rttm.py $corpus/treated/talker_role $corpus/treated/SAD
##done
#