@@ -210,30 +210,19 @@ def perform_ocr(
210210 model_type = None ,
211211 enable_tikz = False ,
212212):
213- ocr_colnames = get_ocr_colnames (method )
214-
215- results = None
216- language = None
213+ text = None
217214
218215 if method == 'tesseract' :
219- res = perform_tesseract_ocr (file_path , language = 'enfr' )
216+ text = perform_tesseract_ocr (file_path , language = 'enfr' )
220217
221- if res :
222- language = detect_text_language (res )
223- results = [{'method' : ocr_colnames [0 ], 'text' : res }]
224218 elif method == 'google' and google_api_token :
225219 ocr_model = GoogleOCRModel (google_api_token )
226220 ocr_model .establish_connection ()
227- res1 , res2 = ocr_model .perform_ocr (file_path )
221+ text1 , text2 = ocr_model .perform_ocr (file_path )
222+
223+ # Since DTD usually performs better, method #1 is our point of reference for langdetect
224+ text = text1
228225
229- if res1 :
230- # Since DTD usually performs better, method #1 is our point of reference for langdetect
231- language = detect_text_language (res1 )
232- res_list = [res1 ]
233- results = [
234- {'method' : ocr_colnames [i ], 'text' : res_list [i ]}
235- for i in range (len (res_list ))
236- ]
237226 else :
238227 ocr_model = None
239228 if method == 'openai' and openai_api_token :
@@ -245,17 +234,14 @@ def perform_ocr(
245234
246235 if ocr_model :
247236 ocr_model .establish_connection ()
248- res = ocr_model .perform_ocr (
249- file_path , model_type = model_type , enable_tikz = enable_tikz
250- )
237+ text = ocr_model .perform_ocr (file_path , model_type = model_type , enable_tikz = enable_tikz )
251238
252- if res :
253- language = detect_text_language (res )
254- results = [{'method' : ocr_colnames [0 ], 'text' : res }]
239+ if not text :
240+ text = ''
255241
256242 return {
257- 'results' : results ,
258- 'language' : language ,
243+ 'results' : [{ 'method' : get_ocr_colnames ( method )[ 0 ], 'text' : text }] ,
244+ 'language' : detect_text_language ( text ) ,
259245 }
260246
261247
@@ -296,9 +282,7 @@ def extract_slide_text(
296282
297283
298284def extract_multi_image_text (
299- page_and_filename_list ,
300- i ,
301- n ,
285+ page_and_filename ,
302286 method = "google" ,
303287 google_api_token = None ,
304288 openai_api_token = None ,
@@ -307,44 +291,33 @@ def extract_multi_image_text(
307291 model_type = None ,
308292 enable_tikz = False ,
309293):
310- # Extract subset of pages to process
311- n_pages = len (page_and_filename_list )
312- start_index = int (i / n * n_pages )
313- end_index = int ((i + 1 ) / n * n_pages )
314- pages_to_handle = page_and_filename_list [start_index : end_index ]
315-
316- # Perform OCR on subset of pages
317- results = list ()
318- for page in pages_to_handle :
319- results .append (
320- perform_ocr (
321- page ["filename" ],
322- method ,
323- google_api_token ,
324- openai_api_token ,
325- gemini_api_token ,
326- rcp_api_token ,
327- model_type ,
328- enable_tikz ,
329- )
330- )
294+ # Perform OCR on page
295+ result = perform_ocr (
296+ page_and_filename ["filename" ],
297+ method ,
298+ google_api_token ,
299+ openai_api_token ,
300+ gemini_api_token ,
301+ rcp_api_token ,
302+ model_type ,
303+ enable_tikz ,
304+ )
305+
306+ print (f"Performed OCR on page { page_and_filename ['page' ]} . Result: { result } " )
331307
332308 # Build result and return it
333309 return {
334- 'results' : [
335- {
336- 'page' : pages_to_handle [i ]['page' ],
337- 'content' : results [i ]['results' ][0 ]['text' ]
338- }
339- for i in range (len (results ))
340- ],
341- 'language' : get_most_common_element ([result ['language' ] for result in results ]),
342- 'method' : get_most_common_element ([result ['results' ][0 ]['method' ] for result in results ])
310+ 'result' : {
311+ 'page' : page_and_filename ['page' ],
312+ 'content' : result ['results' ][0 ]['text' ]
313+ },
314+ 'language' : result ['language' ],
315+ 'method' : result ['results' ][0 ]['method' ],
343316 }
344317
345318
346319def collect_multi_image_ocr (results ):
347- all_results = list ( chain . from_iterable ( result ['results ' ] for result in results ))
320+ all_results = [ result ['result ' ] for result in results ]
348321 language = get_most_common_element ([result ['language' ] for result in results ])
349322 method = get_most_common_element ([result ['method' ] for result in results ])
350323 return {
0 commit comments