-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathprocessGovdocsDataset.R
More file actions
50 lines (37 loc) · 1.56 KB
/
processGovdocsDataset.R
File metadata and controls
50 lines (37 loc) · 1.56 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
# Script for processing govdocs metadata archives and saving that metadata
# into a single data frame (written out as a TSV file) for easier future
# processing.
#
# Per-file results are collected in a preallocated list and combined with a
# single rbind() call after the loop, so the accumulated data frame is not
# re-copied on every iteration.
source('utils.R')

# Input folder holding one zip archive per document (note the trailing slash:
# file names are appended to it directly).
folderIN <- "/Users/kresimir/Dropbox/Work/Projects/BenchmarkDP/publications/INFSOF/experiments/real world dataset/vbsproperties2/"
listFiles <- list.files(folderIN)

# Empty template with the expected columns; it is the first argument of the
# final rbind(), so the output is still a data frame when there are no input
# files.
fileMetadata <- data.frame(
  fileName = character(), numPage = character(), numCh = character(),
  numWords = character(), numLines = character(), numParag = character(),
  numTables = character(), numWordTable = character(),
  numParagTable = character()
)

# Scratch directory for unzipped archives. A fresh directory under the R
# session's temp dir is used so that an unrelated, pre-existing "temp" folder
# in the working directory can never be clobbered by the cleanup below.
tUZFolder <- tempfile("govdocs_unzip_")
dir.create(tUZFolder)

numFiles <- length(listFiles)
results <- vector("list", numFiles)

for (i in seq_along(listFiles)) {
  f <- listFiles[[i]]

  # Progress report every 10 files (counts files finished so far).
  if ((i - 1) %% 10 == 0) {
    print(paste0("Processed ", i - 1, "/", numFiles))
  }

  # The part of the file name before the first "." is used as the name of
  # the folder the archive is expected to extract to.
  name <- strsplit(f, ".", fixed = TRUE)[[1]][1]

  unzip(paste0(folderIN, f), exdir = tUZFolder)
  tempFolder <- file.path(tUZFolder, name)

  # readVBSMetadata is not defined in this script (presumably it comes from
  # utils.R). Assigning through list() keeps the slot even if it returns NULL.
  results[i] <- list(readVBSMetadata(tempFolder))

  # Drop the extracted copy right away so disk usage stays bounded on big
  # datasets.
  unlink(tempFolder, recursive = TRUE)
}
print(paste0("Processed ", numFiles, " files"))

# Combine everything in one call instead of growing the data frame per file.
fileMetadata <- do.call(rbind, c(list(fileMetadata), results))

# Keep only rows without any missing values.
fileMetadata <- fileMetadata[complete.cases(fileMetadata), ]

fileOut <- "/Users/kresimir/Dropbox/Work/Projects/BenchmarkDP/publications/INFSOF/experiments/real world dataset/metadata.tsv"
write.table(fileMetadata, fileOut, sep = "\t", col.names = TRUE, row.names = FALSE)

# Remove the scratch directory and anything left inside it.
unlink(tUZFolder, recursive = TRUE)