-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathCreateSource.scala
More file actions
115 lines (96 loc) · 3.33 KB
/
CreateSource.scala
File metadata and controls
115 lines (96 loc) · 3.33 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
package example.large
import example.Settings
import scala.collection.mutable._
import scala.io.Source
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.io._
import com.readr.model.annotation.TextAnn
import com.readr.model.annotation.TextFragment
import com.readr.model.annotation.TextFragmentAnn
import com.readr.model.Offsets
import com.readr.model.annotation.Annotations
import com.readr.client.util.AnnotationSequenceFileWriter
/**
 * Command-line tool that turns a tab-separated sentence listing into one
 * document per section and writes each document through three
 * `AnnotationSequenceFileWriter`s (text, text fragments, source).
 */
object CreateSource extends Settings {

  //val headersFile = Settings.srcDir + "/campbell-biology/book-headers.txt"
  //val sentencesFile = Settings.srcDir + "/campbell-biology/book-sentences.txt"
  //val outDir = Settings.outDir + "/campbell-biology"

  /**
   * Entry point.
   *
   * Expected arguments, in order:
   *  - `args(0)` headers file: one `key<TAB>header` pair per line (a missing
   *    second column is treated as an empty header)
   *  - `args(1)` charset name used to read the headers file
   *  - `args(2)` sentences file: one `label<TAB>sentence` pair per line
   *  - `args(3)` charset name used to read the sentences file
   *  - `args(4)` output directory; the files `data.col0.TextAnn`,
   *    `data.col1.TextFragmentAnn` and `data.col2.Source` are written below it
   *
   * A label must contain at least two '.' characters. Everything before the
   * second-to-last dot is the section key, the part between the last two dots
   * is the paragraph key, and the part after the last dot is ignored
   * (presumably a sentence index -- confirm against the input data).
   *
   * Consecutive lines with the same section key form one document: the header
   * looked up for the section, a blank line, then the paragraphs separated by
   * blank lines, each sentence followed by a single space.
   *
   * @throws IllegalArgumentException if fewer than five arguments are given or
   *                                  a label has fewer than two '.' separators
   */
  def main(args: Array[String]): Unit = {
    require(args.length >= 5,
      "expected 5 arguments: <headersFile> <headersCharset> <sentencesFile> <sentencesCharset> <outDir>")

    val headersFile = args(0)
    val headersCharset = args(1)
    val sentencesFile = args(2)
    val sentencesCharset = args(3)
    val outDir = args(4)

    val conf = new Configuration()

    val sfText = new AnnotationSequenceFileWriter(conf, outDir + "/data.col0.TextAnn")
    val sfTextFragment = new AnnotationSequenceFileWriter(conf, outDir + "/data.col1.TextFragmentAnn")
    val sfSource = new AnnotationSequenceFileWriter(conf, outDir + "/data.col2.Source")

    // try/finally so the writers are closed even when reading or parsing fails.
    try {
      for (clazz <- Annotations.annWithDependentClazzes) {
        sfText.register(clazz)
        sfTextFragment.register(clazz)
        sfSource.register(clazz)
      }

      // Section key -> header text. Later duplicates overwrite earlier ones.
      val headers = new HashMap[String, String]()
      val headersSource = Source.fromFile(headersFile, headersCharset)
      try {
        for (line <- headersSource.getLines()) {
          val t = line.split("\t")
          headers += t(0) -> (if (t.length < 2) "" else t(1))
        }
      } finally {
        headersSource.close()
      }

      // State of the document currently being assembled.
      val fragments = ArrayBuffer[TextFragment]()
      val sb = new StringBuilder
      var fraStart = 0            // offset in sb where the open paragraph starts
      var curSec: String = null   // null until the first sentence line is seen
      var curPar: String = null   // null right after a new section was started
      var id = 0                  // running document id, one per written section

      // Closes the open paragraph and writes the assembled section as one
      // document to all three writers.
      def writeSection(): Unit = {
        fragments += TextFragment("par", Offsets(fraStart, sb.length), true)
        val text = sb.toString + "\n"
        val ta = TextAnn(text)
        val tfa = TextFragmentAnn(fragments.toArray)
        val so = com.readr.model.annotation.Source("barrons", "", "")
        sfText.write(id, ta)
        sfTextFragment.write(id, tfa)
        sfSource.write(id, so)
        id += 1
      }

      val sentencesSource = Source.fromFile(sentencesFile, sentencesCharset)
      try {
        for (line <- sentencesSource.getLines()) {
          val t = line.split("\t")
          val label = t(0)
          val sentence = if (t.length < 2) "" else t(1)

          // Positions of the last and the second-to-last '.' in the label.
          val li = label.lastIndexOf('.')
          val lj = if (li < 0) -1 else label.lastIndexOf('.', li - 1)
          require(lj >= 0,
            "malformed sentence label (need at least two '.' separators): '" + label + "'")
          val sec = label.substring(0, lj)
          val par = label.substring(lj + 1, li)

          if (curSec == null || !curSec.equals(sec)) {
            // Section changed: emit the previous one (if any), then start anew.
            if (curSec != null) writeSection()

            sb.setLength(0)
            fragments.clear()
            sb.append(headers.getOrElse(sec, ""))
            fragments += TextFragment("title", Offsets(0, sb.length), true)
            sb.append("\n\n")
            fraStart = sb.length
            curPar = null
            curSec = sec
          }

          if (curPar == null || !curPar.equals(par)) {
            // Paragraph changed within the section: close the previous one.
            if (curPar != null) {
              fragments += TextFragment("par", Offsets(fraStart, sb.length), true)
              sb.append("\n\n")
              fraStart = sb.length
            }
            curPar = par
          }

          sb.append(sentence)
          sb.append(" ")
        }
      } finally {
        sentencesSource.close()
      }

      // The loop only writes a section when the next one begins, so the final
      // section is still pending here and must be flushed explicitly.
      if (curSec != null) writeSection()
    } finally {
      sfText.close
      sfTextFragment.close
      sfSource.close
    }
  }
}