UBWebPlotHandler/bp.py at master · marcodeltutto/UBWebPlotHandler · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
import config                   #/// User Configuration
import helpers                  #/// Custom helper functions
import log                      #/// Basic console logging

import json                     #/// write/read JSON
import os, subprocess, shutil   #/// Interact with filesystem/os
import sys                      #/// System commands
import re                       #/// RegEx capabilities
from libs import requests       #/// HTTP CRUD operations
import tarfile                  #/// Extract tarballs
import time                     #/// Used for sleep functions
import uuid                     #/// Unique uid generator
import zipfile                  #/// Extract zip files
import collections


#///////////////////////////////////////////////////////////////////////////////
# Class Definitions
class DocDBEntry():
    """Plain record holding the metadata gathered for one DocDB document.

    Every instance starts with the defaults below and owns its own
    ``files``, ``authors`` and ``topics`` lists.
    """

    def __init__(self):
        # Numeric identifiers: document id and revision number
        self.id, self.revision = 0, 1

        # Text metadata, empty until filled in
        self.title = self.modified = self.url = ""

        # Three distinct list objects, created fresh for each instance
        self.files, self.authors, self.topics = [], [], []


#///////////////////////////////////////////////////////////////////////////////
# Basic Initialization
def Init():
    """Print the start-up banner and check that the required files exist.

    If the JSON index (``config.WEB_PATH + config.JSON_FILENAME``) is missing,
    ``config.REGENERATE`` is switched on. If the blessed-plots configuration
    (``config.BLESSED_PLOTS``) is missing, the process exits with status 200.

    Returns:
        0 once both checks have been performed.
    """
    # Show a splash screen. Each print() receives exactly one parenthesised
    # string, which prints the same text whether ``print`` is the Python 2
    # statement or the Python 3 function.
    print("")
    print("/////////////////////////////////////////////////////////////")
    print("//  BLESSED PLOTS BACKEND                                  //")
    print("/////////////////////////////////////////////////////////////")
    print("")

    time.sleep(1)

    # Check for the JSON index written by a previous run
    jsonPath = config.WEB_PATH + config.JSON_FILENAME
    if not os.path.isfile(jsonPath):
        log.error('File not found: ' + jsonPath + '. Let\'s start fresh.')
        config.REGENERATE = True
        time.sleep(1)

    # Without the blessed-plots configuration there is nothing to work from
    if not os.path.isfile(config.BLESSED_PLOTS):
        log.error('No blessed plots configuration found. I don\'t know which plots go where. Aborting...')
        sys.exit(200)

    return 0


#///////////////////////////////////////////////////////////////////////////////
# Poll DocDB for Document Information
def GetDocumentInfoFromDocDB():
    """Query DocDB and collect metadata for every blessed-plot document.

    Returns:
        A list with one dict per document found in DocDB; each dict is the
        ``__dict__`` of a populated ``DocDBEntry``. Documents that DocDB does
        not return are logged and left out.
    """
    log.info("Grabbing plots list from DocDB")

    searchResult = helpers.CallDocDB('Search', 'topics', '239')['docdb']['document']
    plotsByTopic = helpers.BlessedPlotsList()

    # Flatten the per-topic lists into unique document IDs, first-seen order
    docIds = []
    for topic in plotsByTopic:
        for doc in topic['docs']:
            if doc not in docIds:
                docIds.append(doc)

    # Check to see if there are any docs in DocDB that are not listed in file,
    # add them as uncategorized.
    # NOTE(review): the return value is ignored, so this presumably extends
    # docIds in place -- confirm against helpers.CombineDocLists.
    helpers.CombineDocLists(searchResult, docIds)

    log.success('Found %i officially blessed plots on DocDB' % len(docIds))

    time.sleep(1)

    entries = []

    for idx, doc in enumerate(docIds, 1):
        # Get document from DocDB and check that it actually exists
        res = helpers.GetDoc(doc)['docdb']
        if 'document' not in res:
            log.error('[%i/%i] ERROR: doc %i not found in DocDB' % (idx, len(docIds), doc))
            continue

        item = res['document']

        # Construct the entry
        entry = DocDBEntry()
        entry.id       = int(item['@id'])
        revision       = item['docrevision']
        entry.revision = int(revision['@version'])
        entry.title    = revision['title']
        entry.modified = revision['@modified']
        entry.url      = revision['@href']

        log.info('[%i/%i] Fetching metadata for doc %i v%i -- %s' % (idx, len(docIds), entry.id, entry.revision, entry.title))

        # A single author arrives as one mapping, several authors as a list;
        # wrap the single case so one loop handles both.
        authors = revision['author']
        if type(authors) is not list:
            authors = [authors]

        for author in authors:
            entry.authors.append(
                {
                    'firstname' : author['firstname'],
                    'lastname'  : author['lastname'],
                    'id'        : int(author['@id'])
                }
            )

        # Get list of info for this entry
        # NOTE(review): docID and rev are never used afterwards. The call and
        # lookups are kept so the same requests are made as before.
        docXml        = helpers.CallDocDB('ShowDocument', 'docid', item['@id'])
        shownRevision = docXml['docdb']['document']['docrevision']
        docID         = shownRevision['@docid']
        rev           = shownRevision['@version']

        # Get topics for this doc
        entry.topics = helpers.GetTopicsByDocID(doc)

        entries.append(entry.__dict__)

    return entries


#///////////////////////////////////////////////////////////////////////////////
# Get Document Information from JSON File
def GetDocumentInfoFromDisk():
    """Load the document list stored in the JSON index file.

    Returns:
        An empty list when ``config.REGENERATE`` is set; otherwise the parsed
        contents of ``config.WEB_PATH + config.JSON_FILENAME``.
    """
    if config.REGENERATE:
        return []

    # The context manager closes the handle as soon as the file has been
    # read, instead of leaving it open until garbage collection.
    with open(config.WEB_PATH + config.JSON_FILENAME) as jsonFile:
        return json.loads(jsonFile.read())


#///////////////////////////////////////////////////////////////////////////////
# Return differences between current list of documents and previous
def FindChanges(new, old):
    """Compare the current document list with the previous one.

    Args:
        new: document dicts (each with 'id' and 'revision') for the current run.
        old: document dicts (same keys) from the previous run; not consulted
            when ``config.REGENERATE`` is set.

    Returns:
        The set of document IDs that were added or whose revision changed.
        Deletions are counted for the debug message only and are not returned.
    """
    if config.REGENERATE:
        # Starting fresh: every document from DocDB counts as an addition
        revisions = {doc['id'] : doc['revision'] for doc in new}
        additions, modifications, deletions = list(revisions), [], []

    else:
        # Reduce both lists to id -> revision lookups
        newRevs = {entry['id'] : entry['revision'] for entry in new}
        oldRevs = {entry['id'] : entry['revision'] for entry in old}

        # IDs that only the new list has are new documents
        additions = [docID for docID in newRevs if docID not in oldRevs]

        # IDs in both lists need reprocessing when the revision differs
        modifications = [docID for docID in newRevs
                         if docID in oldRevs and newRevs[docID] != oldRevs[docID]]

        # IDs that only the old list has have been removed
        deletions = [docID for docID in oldRevs if docID not in newRevs]

    log.debug('Detected %i additions, %i modifications, and %i deletions' % (len(additions), len(modifications), len(deletions)))

    return set(additions) | set(modifications)

#///////////////////////////////////////////////////////////////////////////////
# Fetch all the documents for a list of docdb IDs to local disk
def DownloadFiles(docs):
    """Download and unpack the DocDB archive of every document in ``docs``.

    A uniquely named temporary directory is created under
    ``config.WEB_PATH + config.PLOT_SUBDIR``; each document gets a
    sub-directory named after its ID. The document archive is extracted
    there, and any ``.tar``, ``.tar.gz`` or ``.zip`` file found while walking
    that directory is extracted into the same place.

    Args:
        docs: iterable of integer DocDB document IDs (must support ``len``).

    Returns:
        The path of the temporary directory, ending in '/'.
    """
    # NOTE(review): extractall() trusts the member paths stored inside the
    # archives; this assumes everything served by DocDB is trusted -- confirm.
    tempDir = config.WEB_PATH + config.PLOT_SUBDIR + str(uuid.uuid4()) + '/'
    # 0o755 is the octal spelling accepted by both Python 2.6+ and Python 3
    os.mkdir(tempDir, 0o755)

    for idx, doc in enumerate(docs, 1):
        log.info('[%i/%i] Downloading files from docdb %i' % (idx, len(docs), doc))

        tempDocDir = tempDir + str(doc) + '/'
        os.mkdir(tempDocDir, 0o755)

        archivePath = tempDocDir + 'doc_archive.tar.gz'
        helpers.Download(config.DOCDB_URL + 'RetrieveArchive?docid=%d&type=tar.gz' % doc,
                         archivePath)

        # Passing the destination to extractall() unpacks into tempDocDir
        # without changing the working directory of the whole process.
        with tarfile.open(archivePath, 'r') as tar:
            tar.extractall(tempDocDir)

        for root, directories, filenames in os.walk(tempDocDir):
            for filename in filenames:
                # The top-level archive has already been unpacked above
                if filename == 'doc_archive.tar.gz':
                    continue

                path = os.path.join(root, filename)

                if filename.endswith(('.tar.gz', '.tar')):
                    log.debug('Extracting '+filename)
                    with tarfile.open(path, 'r') as tar:
                        tar.extractall(tempDocDir)

                if filename.endswith('.zip'):
                    log.debug('Extracting '+filename)
                    with zipfile.ZipFile(path, 'r') as z:
                        z.extractall(tempDocDir)

    return tempDir


#///////////////////////////////////////////////////////////////////////////////
# Look through all the downloaded files for captions and images, update documents_curr
def FindFiles(tempdir, documents_curr, documents_to_process, documents_prev):
    """Fill in the 'files' entry of every document in ``documents_curr``.

    The tree under ``tempdir`` is expected to look like
    ``<tempdir><docid>/<path>``. A ``*_caption.txt`` or ``*.txt`` file
    defines a plot (its path without that suffix is the plot's base name),
    and image files with the same base name and an extension listed in
    ``config.EXTS`` are attached to it. Hidden files (leading '.') are ignored.

    Args:
        tempdir: root of the downloaded files; documents live in
            sub-directories named after their integer ID.
        documents_curr: list of document dicts, updated in place.
        documents_to_process: collection of document IDs whose files are
            rebuilt from ``tempdir``.
        documents_prev: document dicts from the previous run; documents that
            are not reprocessed take their 'files' from the entry with the
            same 'id', if there is one.

    Returns:
        None; ``documents_curr`` is modified in place.
    """

    # dict from id to dict from base to {'base', 'caption', 'exts'}
    files = collections.defaultdict(dict)

    def splitDocPath(root, filename):
        # Turn an absolute walk result into (docid, path below the doc dir).
        # Splitting on the first '/' keeps the remainder correct even when
        # the directory name and str(int(name)) differ in length.
        relative = os.path.join(root, filename)[len(tempdir):].lstrip('/')
        docdir, _, rest = relative.partition('/')
        return int(docdir), rest

    # Find all the .txt files
    for root, directories, filenames in os.walk(tempdir):
        for filename in filenames:
            docid, path = splitDocPath(root, filename)
            if filename.startswith('.') or not filename.endswith('.txt'):
                continue

            # '_caption.txt' also ends with '.txt', so test the longer
            # suffix first to strip the right number of characters.
            if filename.endswith('_caption.txt'):
                base = path[:-len('_caption.txt')]
            else:
                base = path[:-len('.txt')]

            # Read raw bytes and drop anything that is not ASCII. This is
            # what Python 2's unicode(data, errors='ignore') does with its
            # normal default codec, and it also runs on Python 3. The 'with'
            # block closes the file straight away.
            with open(os.path.join(root, filename), 'rb') as captionFile:
                cap = captionFile.read().decode('ascii', 'ignore')

            files[docid][base] = {'base': base, 'caption': cap, 'exts': []}

    # Now find matching image files
    for root, directories, filenames in os.walk(tempdir):
        for filename in filenames:
            docid, path = splitDocPath(root, filename)
            if filename.startswith('.') or filename.endswith('.txt'):
                continue
            if not any(filename.endswith(e) for e in config.EXTS):
                continue

            dot = path.rfind('.')
            stem, ext = path[:dot], path[dot+1:]
            # Check 'docid in files' first so the defaultdict does not grow
            # an entry for documents that have no caption at all.
            if docid not in files or stem not in files[docid]:
                log.debug('File with no .txt caption: '+path)
            else:
                files[docid][stem]['exts'].append(ext)

    # Index the previous documents once instead of rescanning the list for
    # every current document; a later duplicate id wins, as before.
    prevById = {}
    for doc_prev in documents_prev:
        prevById[doc_prev['id']] = doc_prev

    # Save this information into the actual documents
    for doc in documents_curr:
        docid = doc['id']
        # Do it only if we are supposed to update or add this document
        if docid in documents_to_process:
            # Keep a captioned plot only if at least one image was found
            doc['files'] = [plot for plot in files[docid].values() if plot['exts']]
        # Else fetch info from previous json
        elif docid in prevById:
            doc['files'] = prevById[docid]['files']

#///////////////////////////////////////////////////////////////////////////////
# Make thumbnails from downloaded images
def _RunCommand(args):
    # Run an external program from an argument list (no shell involved) and
    # return its exit status; a program that cannot be started counts as a
    # failure instead of raising.
    try:
        return subprocess.call(args)
    except OSError:
        return -1


def ProcessImages(documents_to_process, documents, tempDir):
    """Create a PNG thumbnail for every plot of the documents to process.

    For each plot the first available format in the preference list is
    converted with ImageMagick's ``convert`` into
    ``<tempDir><docid>//thumbs/<base>_thumb.png``. Afterwards the visible
    contents of ``tempDir`` are copied into
    ``config.WEB_PATH + config.PLOT_SUBDIR`` and ``tempDir`` is removed.

    Args:
        documents_to_process: collection of document IDs to handle.
        documents: document dicts; each needs 'id' and 'files', where every
            file has a 'base' name and a list of 'exts'.
        tempDir: directory holding the downloaded files, ending in '/'.

    Returns:
        0 in all cases. When ``documents_to_process`` is empty the function
        returns immediately and ``tempDir`` is left in place.
    """

    if not documents_to_process:
        return 0

    # Favoured versions to make a thumbnail version from, best first
    srcs = ['png', 'jpg', 'jpeg', 'eps', 'pdf', 'ps']

    for idx, docID in enumerate(documents_to_process, 1):
        log.info('[%i/%i] Processing images for docdb %i' % (idx, len(documents_to_process), docID))

        document = next((doc for doc in documents if doc['id'] == docID), None)
        if document is None:
            # Nothing is known about this ID, so there are no files to convert
            log.error('No metadata found for docdb %i, skipping its images' % docID)
            continue

        tempDocDir = tempDir + str(document['id']) + '/'
        thumDir = tempDocDir + '/thumbs/'
        os.mkdir(thumDir)

        for aFile in document['files']:
            base = aFile['base']
            exts = aFile['exts']

            src = next((candidate for candidate in srcs if candidate in exts), None)
            if src is None:
                continue

            source = tempDocDir + base + '.' + src
            thumb = thumDir + base + '_thumb.png'

            # In case of tarballs etc there can be subdirs required
            # in the thumbs directory.
            thumbParent = os.path.dirname(thumb)
            try:
                os.makedirs(thumbParent)
            except OSError:
                # An already existing directory is fine; anything else is not
                if not os.path.isdir(thumbParent):
                    raise

            # File names come from uploaded archives and may contain spaces
            # or shell metacharacters, so every path is passed as its own
            # argument rather than being pasted into a shell string.
            tail = [source, '-resize', '400', '-quiet', thumb]
            if src == 'pdf':
                status = _RunCommand(['convert', '-define', 'pdf:use-cropbox=true',
                                      '-transparent-color', 'white'] + tail)
                if status != 0:
                    # sometimes the cropbox is trouble
                    status = _RunCommand(['convert', '-transparent-color', 'white'] + tail)
            else:
                status = _RunCommand(['convert'] + tail)

            if status == 0:
                log.success('Created thumbnail from '+base+'.'+src+': '+thumDir + base + '_thumb.png')
            else:
                log.error('Failed to create thumbnail from '+base+'.'+src+': '+thumDir + base + '_thumb.png')

    # Same selection as the shell pattern '<tempDir>*': every entry that is
    # not hidden, in sorted order.
    items = sorted(os.path.join(tempDir, name)
                   for name in os.listdir(tempDir) if not name.startswith('.'))
    if items:
        destination = config.WEB_PATH + config.PLOT_SUBDIR
        if _RunCommand(['cp', '-rpf'] + items + [destination]) != 0:
            log.error('Failed to copy ' + tempDir + '* to ' + destination)
    shutil.rmtree(tempDir)

    return 0


#///////////////////////////////////////////////////////////////////////////////
# Write JSON file to disk
def WriteJSON(documents):
    """Publish the blessed-plots configuration and the JSON document index.

    ``config.BLESSED_PLOTS`` is copied to ``config.WEB_PATH`` with ``cp -pf``,
    then ``documents`` is written as JSON (sorted keys, two-space indent) to
    ``config.WEB_PATH + config.JSON_FILENAME``.

    Args:
        documents: JSON-serialisable document list.
    """
    # Pass the paths as separate arguments so spaces or shell metacharacters
    # in them cannot change the command, and report what actually happened.
    try:
        copied = subprocess.call(['cp', '-pf', config.BLESSED_PLOTS, config.WEB_PATH]) == 0
    except OSError:
        copied = False

    if copied:
        log.success('Copied ' + config.BLESSED_PLOTS + ' to ' + config.WEB_PATH)
    else:
        log.error('Failed to copy ' + config.BLESSED_PLOTS + ' to ' + config.WEB_PATH)

    # Serialise before opening the file: if the data cannot be serialised,
    # the existing index is not truncated.
    jsonSerialized = json.dumps(documents, sort_keys = True, indent = 2)

    with open(config.WEB_PATH + config.JSON_FILENAME, 'w') as jsonFile:
        jsonFile.write(jsonSerialized)

    log.success('Wrote ' + config.WEB_PATH + config.JSON_FILENAME)