Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions dspace-api/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
<properties>
<!-- This is the path to the root [dspace-src] directory. -->
<root.basedir>${basedir}/..</root.basedir>
<!-- Version used for the jackson-dataformat-xml dependency declared in this POM. -->
<jackson-dataformat.version>2.12.7</jackson-dataformat.version>
</properties>

<!--
Expand Down Expand Up @@ -623,6 +624,22 @@
<artifactId>tika-parsers-standard-package</artifactId>
</dependency>

<!--
  Jackson XML data format module (provides XmlMapper and the JacksonXml* annotations).
  The woodstox-core and jakarta.activation-api transitive dependencies are excluded;
  NOTE(review): presumably because other versions of them are already supplied elsewhere
  in the build - confirm before changing these exclusions.
-->
<dependency>
<groupId>com.fasterxml.jackson.dataformat</groupId>
<artifactId>jackson-dataformat-xml</artifactId>
<version>${jackson-dataformat.version}</version>
<exclusions>
<exclusion>
<groupId>com.fasterxml.woodstox</groupId>
<artifactId>woodstox-core</artifactId>
</exclusion>
<exclusion>
<groupId>jakarta.activation</groupId>
<artifactId>jakarta.activation-api</artifactId>
</exclusion>
</exclusions>
</dependency>

<dependency>
<groupId>com.maxmind.geoip2</groupId>
<artifactId>geoip2</artifactId>
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
/**
* The contents of this file are subject to the license and copyright
* detailed in the LICENSE and NOTICE files at the root of the source
* tree and available online at
*
* http://www.dspace.org/license/
*/
package org.dspace.app.mediafilter;

import com.fasterxml.jackson.databind.SerializationFeature;
import com.fasterxml.jackson.dataformat.xml.XmlMapper;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.sql.SQLException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import org.apache.commons.lang.StringUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.pdfbox.multipdf.Splitter;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.ContentHandlerDecorator;
import org.dspace.app.mediafilter.model.Page;
import org.dspace.app.mediafilter.model.Pages;
import org.dspace.authorize.AuthorizeException;
import org.dspace.content.Bitstream;
import org.dspace.content.Item;
import org.dspace.content.service.BitstreamService;
import org.dspace.core.Context;
import org.dspace.services.ConfigurationService;
import org.dspace.services.factory.DSpaceServicesFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.xml.sax.SAXException;

/**
 * Media filter that extracts the text of a PDF one page at a time and emits it as an
 * XML document: a {@code <pages>} root containing one {@code <page>} element (page number
 * plus extracted text) per page of the source PDF.
 *
 * <p>The derived bitstream is named {@code <original name>.xml} and is placed in the
 * {@code STRUCTURED_TEXT} bundle.
 */
public class StructuredPdfTextExtractionFilter extends MediaFilter {

    private static final Logger log = LogManager.getLogger(StructuredPdfTextExtractionFilter.class);

    /** MIME type of the only input this filter accepts. */
    private static final String PDF_MIME_TYPE = "application/pdf";

    /** Mapper used to serialise the extracted pages; configured once and then only read. */
    private final XmlMapper xmlMapper = createXmlMapper();

    /**
     * Builds the XML mapper with indented output enabled. Configuration happens here,
     * before first use, rather than on every call to {@link #getDestinationStream}.
     */
    private static XmlMapper createXmlMapper() {
        final XmlMapper mapper = new XmlMapper();
        mapper.enable(SerializationFeature.INDENT_OUTPUT);
        return mapper;
    }

    /**
     * @param oldFileName name of the source bitstream
     * @return the source name with {@code .xml} appended
     */
    @Override
    public String getFilteredName(String oldFileName) {
        return oldFileName + ".xml";
    }

    /**
     * @return name of the bundle the derived bitstream is stored in
     */
    @Override
    public String getBundleName() {
        return "STRUCTURED_TEXT";
    }

    /**
     * @return short name of the format of the derived bitstream
     */
    @Override
    public String getFormatString() {
        return "XML";
    }

    /**
     * @return human-readable description of the derived bitstream
     */
    @Override
    public String getDescription() {
        return "Extracted Structured Text";
    }

    /**
     * Splits the PDF read from {@code source} into single-page documents, extracts the text
     * of each and serialises the result as XML.
     *
     * <p>The XML is produced in memory, so no temporary file is left behind, and every
     * PDF document opened along the way is closed before this method returns.
     *
     * @param item    the item the bitstream belongs to (not used by this filter)
     * @param source  stream positioned at the start of the PDF
     * @param verbose verbosity flag (not used by this filter)
     * @return a stream over the generated XML document
     * @throws Exception if the PDF cannot be parsed or the XML cannot be generated
     */
    @Override
    public InputStream getDestinationStream(final Item item, final InputStream source, final boolean verbose)
        throws Exception {

        final List<Page> pageTexts = new ArrayList<>();

        try (PDDocument document = PDDocument.load(source)) {
            // A fresh Splitter per call: the instance keeps per-split state.
            final List<PDDocument> splitPages = new Splitter().split(document);
            try {
                final PDFTextStripper stripper = new PDFTextStripper();
                for (int i = 0; i < splitPages.size(); i++) {
                    // Page numbers in the output are 1-based.
                    pageTexts.add(new Page(i + 1, stripper.getText(splitPages.get(i))));
                }
            } finally {
                // Split documents are closed before the source document they were derived from.
                closeQuietly(splitPages);
            }
        }

        return new ByteArrayInputStream(xmlMapper.writeValueAsBytes(new Pages(pageTexts)));
    }

    /**
     * Accepts only bitstreams whose format has the MIME type {@code application/pdf}.
     *
     * @param c       current DSpace context
     * @param item    the item the bitstream belongs to (not used)
     * @param source  the candidate bitstream
     * @param verbose verbosity flag (not used)
     * @return {@code true} if the bitstream should be processed by this filter
     * @throws SQLException if the bitstream format cannot be looked up
     */
    @Override
    public boolean preProcessBitstream(Context c, Item item, Bitstream source, boolean verbose) throws SQLException {
        return PDF_MIME_TYPE.equals(source.getFormat(c).getMIMEType());
    }

    /**
     * Closes every document in the list. A failure to close is logged instead of thrown so
     * that it cannot mask an exception raised during text extraction.
     */
    private static void closeQuietly(final List<PDDocument> documents) {
        for (PDDocument splitPage : documents) {
            try {
                splitPage.close();
            } catch (IOException e) {
                log.warn("Could not close a split PDF page document", e);
            }
        }
    }

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
/**
* The contents of this file are subject to the license and copyright
* detailed in the LICENSE and NOTICE files at the root of the source
* tree and available online at
*
* http://www.dspace.org/license/
*/
package org.dspace.app.mediafilter.model;

import java.util.Objects;

public class Page {

private int pageNumber;
private String text;

public Page(){}
public Page(int pageNumber, String text) {
this.pageNumber = pageNumber;
this.text = text;
}

public int getPageNumber() {
Comment thread
kanasznagyzoltan marked this conversation as resolved.
return pageNumber;
}

public String getText() {
return text;
}

@Override
public boolean equals(final Object o) {
if (this == o)
return true;
if (o == null || getClass() != o.getClass())
return false;
final Page page = (Page) o;
return pageNumber == page.pageNumber && Objects.equals(text, page.text);
}

@Override
public int hashCode() {
return Objects.hash(pageNumber, text);
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
/**
* The contents of this file are subject to the license and copyright
* detailed in the LICENSE and NOTICE files at the root of the source
* tree and available online at
*
* http://www.dspace.org/license/
*/
package org.dspace.app.mediafilter.model;

import com.fasterxml.jackson.dataformat.xml.annotation.JacksonXmlElementWrapper;
import com.fasterxml.jackson.dataformat.xml.annotation.JacksonXmlProperty;
import com.fasterxml.jackson.dataformat.xml.annotation.JacksonXmlRootElement;
import java.util.List;
import java.util.Objects;

/**
 * Root element ({@code <pages>}) of the structured text document: an ordered list of
 * {@link Page} entries, each written as an unwrapped {@code <page>} child element.
 */
@JacksonXmlRootElement(localName = "pages")
public class Pages {

    @JacksonXmlProperty(localName = "page")
    @JacksonXmlElementWrapper(useWrapping = false)
    private List<Page> pageList;

    /**
     * Creates an instance without a page list.
     */
    public Pages() {
    }

    /**
     * @param pageList the pages, in document order
     */
    public Pages(final List<Page> pageList) {
        this.pageList = pageList;
    }

    /**
     * @return the pages held by this instance, or {@code null} if none were set
     */
    public List<Page> getPageList() {
        return this.pageList;
    }

    @Override
    public boolean equals(final Object o) {
        if (o == this) {
            return true;
        }
        if (o == null) {
            return false;
        }
        if (o.getClass() != getClass()) {
            return false;
        }
        final Pages other = (Pages) o;
        return Objects.equals(this.pageList, other.pageList);
    }

    @Override
    public int hashCode() {
        // Same value as Objects.hash(pageList): 31 * 1 + hash of the single element.
        return 31 + Objects.hashCode(this.pageList);
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
/**
* The contents of this file are subject to the license and copyright
* detailed in the LICENSE and NOTICE files at the root of the source
* tree and available online at
*
* http://www.dspace.org/license/
*/
package org.dspace.app.mediafilter;

import static org.junit.Assert.*;
import static org.mockito.Mockito.*;

import com.fasterxml.jackson.dataformat.xml.XmlMapper;
import java.io.ByteArrayInputStream;
import java.io.InputStream;

import java.sql.SQLException;
import org.dspace.app.mediafilter.model.Page;
import org.dspace.app.mediafilter.model.Pages;
import org.dspace.content.Bitstream;
import org.dspace.content.BitstreamFormat;
import org.dspace.content.Item;
import org.dspace.core.Context;
import org.dspace.services.ConfigurationService;
import org.dspace.services.factory.DSpaceServicesFactory;
import org.junit.Test;

/**
 * Unit tests for {@link StructuredPdfTextExtractionFilter}.
 */
public class StructuredPdfTextExtractionFilterTest {

    private static final StructuredPdfTextExtractionFilter filter = new StructuredPdfTextExtractionFilter();
    private static final XmlMapper xmlMapper = new XmlMapper();

    @Test
    public void testGetFilteredName() {
        assertEquals("multipage_test.pdf.xml", filter.getFilteredName("multipage_test.pdf"));
    }

    @Test
    public void testGetBundleName() {
        assertEquals("STRUCTURED_TEXT", filter.getBundleName());
    }

    @Test
    public void testGetFormatString() {
        assertEquals("XML", filter.getFormatString());
    }

    @Test
    public void testGetDescription() {
        assertEquals("Extracted Structured Text", filter.getDescription());
    }

    /**
     * Runs the filter on a two-page PDF and compares the parsed result with the parsed
     * expected XML. All streams are opened in try-with-resources so they are closed even
     * when an assertion fails.
     */
    @Test
    public void testGetDestinationStream() throws Exception {
        Item item = mock(Item.class);

        try (InputStream pdfStream = getMultiPagePDF();
             InputStream expectedInputStream = getExpectedXml()) {

            // Fail with a clear message if a fixture is missing from the classpath.
            assertNotNull("multipage_test.pdf not found on the classpath", pdfStream);
            assertNotNull("multipage_expected_result.xml not found on the classpath", expectedInputStream);

            try (InputStream resultStream = filter.getDestinationStream(item, pdfStream, true)) {
                assertNotNull(resultStream);

                Pages expectedPages = xmlMapper.readValue(expectedInputStream, Pages.class);
                Pages resultPages = xmlMapper.readValue(resultStream, Pages.class);

                assertEquals(expectedPages, resultPages);
            }
        }
    }

    /**
     * The filter must accept PDF bitstreams and reject bitstreams of any other MIME type.
     */
    @Test
    public void testPreProcessBitstream() throws SQLException {
        Context context = mock(Context.class);
        Item item = mock(Item.class);

        Bitstream source = mock(Bitstream.class);
        BitstreamFormat bsFormat = mock(BitstreamFormat.class);
        when(source.getFormat(context)).thenReturn(bsFormat);
        when(bsFormat.getMIMEType()).thenReturn("application/pdf");

        assertTrue(filter.preProcessBitstream(context, item, source, true));

        when(bsFormat.getMIMEType()).thenReturn("image/png");
        assertFalse(filter.preProcessBitstream(context, item, source, true));
    }

    /** Opens the two-page PDF fixture located next to this class on the classpath. */
    private InputStream getMultiPagePDF() {
        return getClass().getResourceAsStream("multipage_test.pdf");
    }

    /** Opens the XML fixture holding the expected extraction result. */
    private InputStream getExpectedXml() {
        return getClass().getResourceAsStream("multipage_expected_result.xml");
    }

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
<pages>
<page>
<pageNumber>1</pageNumber>
<text>A Text Extraction Test Document&#xd;
for&#xd;
DSpace&#xd;
This is a text. For the next sixty seconds this software will conduct a test of the DSpace text&#xd;
extraction facility.&#xd;
This is only a text. This is a paragraph that followed the first that lived in the document that&#xd;
Jack built.&#xd;
Lorem ipsum dolor sit amet. The quick brown fox jumped over the lazy dog. Yow! Are we&#xd;
having fun yet?&#xd;
This has been a test of the DSpace text extraction system. In the event of actual content you&#xd;
would care what is written here&#xd;
</text>
</page>
<page>
<pageNumber>2</pageNumber>
<text>This is still a text.&#xd;
This is only a text, but on a separate page. This is a paragraph that followed the first that&#xd;
lived in the document that Jack built.&#xd;
Lorem ipsum dolor sit amet. The quick brown fox jumped over the lazy dog.&#xd;
This has been a test of the DSpace structured text extraction system. In the event of actual&#xd;
content you would care what is written here&#xd;
</text>
</page>
</pages>
Binary file not shown.
8 changes: 6 additions & 2 deletions dspace/config/dspace.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -27,14 +27,15 @@ csvexport.dir = ${dspace.dir}/exports
# NOTE: This URL must be accessible to all DSpace users (should not use 'localhost' in Production)
# and is usually "synced" with the "rest" section in the DSpace User Interface's config.*.yml.
# It corresponds to the URL that you would type into your browser to access the REST API.
dspace.server.url = http://localhost:8080/server
# 192.168.1.7
dspace.server.url = http://10.33.0.5:8080/server

# Public URL of DSpace frontend (Angular UI). May require a port number if not using standard ports (80 or 443)
# DO NOT end it with '/'.
# This is used by the backend to provide links in emails, RSS feeds, Sitemaps, etc.
# NOTE: this URL must be accessible to all DSpace users (should not use 'localhost' in Production).
# It corresponds to the URL that you would type into your browser to access the User Interface.
dspace.ui.url = http://localhost:4000
dspace.ui.url = http://10.33.0.5:4000

# Name of the site
dspace.name = DSpace at My University
Expand Down Expand Up @@ -445,6 +446,7 @@ useProxies = true

#Names of the enabled MediaFilter or FormatFilter plugins
filter.plugins = Text Extractor
#filter.plugins = Structured Text Extractor
filter.plugins = JPEG Thumbnail
filter.plugins = PDFBox JPEG Thumbnail

Expand All @@ -465,6 +467,7 @@ filter.plugins = PDFBox JPEG Thumbnail

#Assign 'human-understandable' names to each filter
plugin.named.org.dspace.app.mediafilter.FormatFilter = org.dspace.app.mediafilter.TikaTextExtractionFilter = Text Extractor
#plugin.named.org.dspace.app.mediafilter.FormatFilter = org.dspace.app.mediafilter.StructuredPdfTextExtractionFilter = Structured Text Extractor
plugin.named.org.dspace.app.mediafilter.FormatFilter = org.dspace.app.mediafilter.JPEGFilter = JPEG Thumbnail
plugin.named.org.dspace.app.mediafilter.FormatFilter = org.dspace.app.mediafilter.BrandedPreviewJPEGFilter = Branded Preview JPEG
plugin.named.org.dspace.app.mediafilter.FormatFilter = org.dspace.app.mediafilter.PDFBoxThumbnail = PDFBox JPEG Thumbnail
Expand All @@ -490,6 +493,7 @@ filter.org.dspace.app.mediafilter.TikaTextExtractionFilter.inputFormats = OpenDo
filter.org.dspace.app.mediafilter.TikaTextExtractionFilter.inputFormats = OpenDocument Text
filter.org.dspace.app.mediafilter.TikaTextExtractionFilter.inputFormats = RTF
filter.org.dspace.app.mediafilter.TikaTextExtractionFilter.inputFormats = Text
#filter.org.dspace.app.mediafilter.StructuredPdfTextExtractionFilter.inputFormats = Adobe PDF
filter.org.dspace.app.mediafilter.JPEGFilter.inputFormats = BMP, GIF, JPEG, PNG
filter.org.dspace.app.mediafilter.BrandedPreviewJPEGFilter.inputFormats = BMP, GIF, JPEG, PNG
filter.org.dspace.app.mediafilter.ImageMagickImageThumbnailFilter.inputFormats = BMP, GIF, PNG, JPG, TIFF, JPEG, JPEG 2000
Expand Down