qultoltd · msutya · Mar 20, 2024 · Mar 23, 2024 · Apr 11, 2024 · Apr 11, 2024
diff --git a/dspace-api/pom.xml b/dspace-api/pom.xml
@@ -19,6 +19,7 @@
     <properties>
         <!-- This is the path to the root [dspace-src] directory. -->
         <root.basedir>${basedir}/..</root.basedir>
+        <jackson-dataformat.version>2.12.7</jackson-dataformat.version>
     </properties>
 
     <!--
@@ -623,6 +624,22 @@
             <artifactId>tika-parsers-standard-package</artifactId>
         </dependency>
 
+        <dependency>
+            <groupId>com.fasterxml.jackson.dataformat</groupId>
+            <artifactId>jackson-dataformat-xml</artifactId>
+            <version>${jackson-dataformat.version}</version>
+            <exclusions>
+                <exclusion>
+                    <groupId>com.fasterxml.woodstox</groupId>
+                    <artifactId>woodstox-core</artifactId>
+                </exclusion>
+                <exclusion>
+                    <groupId>jakarta.activation</groupId>
+                    <artifactId>jakarta.activation-api</artifactId>
+                </exclusion>
+            </exclusions>
+        </dependency>
+
         <dependency>
             <groupId>com.maxmind.geoip2</groupId>
             <artifactId>geoip2</artifactId>

diff --git a/dspace-api/src/main/java/org/dspace/app/mediafilter/StructuredPdfTextExtractionFilter.java b/dspace-api/src/main/java/org/dspace/app/mediafilter/StructuredPdfTextExtractionFilter.java
@@ -0,0 +1,171 @@
+/**
+ * The contents of this file are subject to the license and copyright
+ * detailed in the LICENSE and NOTICE files at the root of the source
+ * tree and available online at
+ *
+ * http://www.dspace.org/license/
+ */
+package org.dspace.app.mediafilter;
+
+import com.fasterxml.jackson.databind.SerializationFeature;
+import com.fasterxml.jackson.dataformat.xml.XmlMapper;
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.sql.SQLException;
+import java.text.SimpleDateFormat;
+import java.util.ArrayList;
+import java.util.Date;
+import java.util.List;
+import org.apache.commons.lang.StringUtils;
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+import org.apache.pdfbox.multipdf.Splitter;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.text.PDFTextStripper;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.ContentHandlerDecorator;
+import org.dspace.app.mediafilter.model.Page;
+import org.dspace.app.mediafilter.model.Pages;
+import org.dspace.authorize.AuthorizeException;
+import org.dspace.content.Bitstream;
+import org.dspace.content.Item;
+import org.dspace.content.service.BitstreamService;
+import org.dspace.core.Context;
+import org.dspace.services.ConfigurationService;
+import org.dspace.services.factory.DSpaceServicesFactory;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.xml.sax.SAXException;
+
+/**
+ * Media filter that extracts the text of a PDF bitstream page by page and
+ * returns it as an XML document produced from a {@link Pages} object.
+ */
+public class StructuredPdfTextExtractionFilter extends MediaFilter {
+
+  private static final Logger log = LogManager.getLogger(StructuredPdfTextExtractionFilter.class);
+
+  /** MIME type a bitstream format must have for this filter to process it. */
+  private static final String PDF_MIME_TYPE = "application/pdf";
+
+  /** Shared mapper, fully configured at construction so calls never mutate it. */
+  private final XmlMapper xmlMapper = createXmlMapper();
+
+  /**
+   * @param oldFileName name of the source file
+   * @return {@code oldFileName} with {@code .xml} appended
+   */
+  @Override
+  public String getFilteredName(String oldFileName) {
+    return oldFileName + ".xml";
+  }
+
+  /**
+   * @return the bundle name {@code STRUCTURED_TEXT}
+   */
+  @Override
+  public String getBundleName() {
+    return "STRUCTURED_TEXT";
+  }
+
+  /**
+   * @return the format string {@code XML}
+   */
+  @Override
+  public String getFormatString() {
+    return "XML";
+  }
+
+  /**
+   * @return the description {@code Extracted Structured Text}
+   */
+  @Override
+  public String getDescription() {
+    return "Extracted Structured Text";
+  }
+
+  /**
+   * Extracts the text of every page of the PDF read from {@code source} and
+   * serializes the result as indented XML.
+   * <p>
+   * The XML is built in memory, so no temporary file is left behind. The
+   * loaded document and the per-page documents produced by the splitter are
+   * closed before this method returns.
+   *
+   * @param item    not used
+   * @param source  stream holding the PDF
+   * @param verbose not used
+   * @return stream over the generated XML
+   * @throws Exception if the PDF cannot be loaded, split or read, or if the
+   *                   XML cannot be written
+   */
+  @Override
+  public InputStream getDestinationStream(final Item item, final InputStream source, final boolean verbose)
+    throws Exception {
+
+    List<Page> pageTexts = new ArrayList<>();
+
+    try (PDDocument document = PDDocument.load(source)) {
+      // A new Splitter per call, so no splitter state is shared between invocations.
+      List<PDDocument> splitPages = new Splitter().split(document);
+      try {
+        PDFTextStripper stripper = new PDFTextStripper();
+        for (int i = 0; i < splitPages.size(); i++) {
+          // Page numbers in the output are 1-based.
+          pageTexts.add(new Page(i + 1, stripper.getText(splitPages.get(i))));
+        }
+      } finally {
+        // Split pages are closed first; the source document is closed by try-with-resources.
+        closeAll(splitPages);
+      }
+    }
+
+    ByteArrayOutputStream xml = new ByteArrayOutputStream();
+    xmlMapper.writeValue(xml, new Pages(pageTexts));
+    return new ByteArrayInputStream(xml.toByteArray());
+  }
+
+  /**
+   * Accepts a bitstream only when the MIME type of its format is {@code application/pdf}.
+   *
+   * @param c       context used to look up the bitstream format
+   * @param item    not used
+   * @param source  bitstream about to be filtered
+   * @param verbose not used
+   * @return {@code true} when the MIME type of the format is {@code application/pdf}
+   * @throws SQLException if reading the bitstream format fails
+   */
+  @Override
+  public boolean preProcessBitstream(Context c, Item item, Bitstream source, boolean verbose) throws SQLException {
+    return PDF_MIME_TYPE.equals(source.getFormat(c).getMIMEType());
+  }
+
+  /** Builds the mapper used for all serialization, with indented output enabled. */
+  private static XmlMapper createXmlMapper() {
+    XmlMapper mapper = new XmlMapper();
+    mapper.enable(SerializationFeature.INDENT_OUTPUT);
+    return mapper;
+  }
+
+  /** Closes every document, logging close failures instead of propagating them. */
+  private static void closeAll(List<PDDocument> documents) {
+    for (PDDocument document : documents) {
+      try {
+        document.close();
+      } catch (IOException e) {
+        log.warn("Could not close a split PDF page document", e);
+      }
+    }
+  }
+
+}
diff --git a/dspace-api/src/main/java/org/dspace/app/mediafilter/model/Page.java b/dspace-api/src/main/java/org/dspace/app/mediafilter/model/Page.java
@@ -0,0 +1,74 @@
+/**
+ * The contents of this file are subject to the license and copyright
+ * detailed in the LICENSE and NOTICE files at the root of the source
+ * tree and available online at
+ *
+ * http://www.dspace.org/license/
+ */
+package org.dspace.app.mediafilter.model;
+
+import java.util.Objects;
+
+/**
+ * Text of a single page together with its page number.
+ */
+public class Page {
+
+  private int pageNumber;
+  private String text;
+
+  /** Creates a page whose fields keep their default values. */
+  public Page() {
+  }
+
+  /**
+   * @param pageNumber number of the page
+   * @param text       text of the page
+   */
+  public Page(int pageNumber, String text) {
+    this.pageNumber = pageNumber;
+    this.text = text;
+  }
+
+  /**
+   * @return number of the page
+   */
+  public int getPageNumber() {
+    return pageNumber;
+  }
+
+  /**
+   * @return text of the page
+   */
+  public String getText() {
+    return text;
+  }
+
+  /**
+   * Two pages are equal when their page numbers and their texts are equal.
+   */
+  @Override
+  public boolean equals(final Object o) {
+    if (this == o) {
+      return true;
+    }
+    if (o == null || getClass() != o.getClass()) {
+      return false;
+    }
+    final Page page = (Page) o;
+    return pageNumber == page.pageNumber && Objects.equals(text, page.text);
+  }
+
+  @Override
+  public int hashCode() {
+    return Objects.hash(pageNumber, text);
+  }
+
+  /**
+   * @return readable representation, so a failed comparison shows the content
+   */
+  @Override
+  public String toString() {
+    return "Page{pageNumber=" + pageNumber + ", text='" + text + "'}";
+  }
+}
diff --git a/dspace-api/src/main/java/org/dspace/app/mediafilter/model/Pages.java b/dspace-api/src/main/java/org/dspace/app/mediafilter/model/Pages.java
@@ -0,0 +1,72 @@
+/**
+ * The contents of this file are subject to the license and copyright
+ * detailed in the LICENSE and NOTICE files at the root of the source
+ * tree and available online at
+ *
+ * http://www.dspace.org/license/
+ */
+package org.dspace.app.mediafilter.model;
+
+import com.fasterxml.jackson.dataformat.xml.annotation.JacksonXmlElementWrapper;
+import com.fasterxml.jackson.dataformat.xml.annotation.JacksonXmlProperty;
+import com.fasterxml.jackson.dataformat.xml.annotation.JacksonXmlRootElement;
+import java.util.List;
+import java.util.Objects;
+
+/**
+ * Root element {@code pages}: a list of {@link Page} objects, each written as
+ * an unwrapped {@code page} element.
+ */
+@JacksonXmlRootElement(localName = "pages")
+public class Pages {
+
+  @JacksonXmlProperty(localName = "page")
+  @JacksonXmlElementWrapper(useWrapping = false)
+  private List<Page> pageList;
+
+  /** Creates an instance whose page list is {@code null}. */
+  public Pages() {
+  }
+
+  /**
+   * @param pageList pages to hold; the list is stored as is, not copied
+   */
+  public Pages(final List<Page> pageList) {
+    this.pageList = pageList;
+  }
+
+  /**
+   * @return the stored page list (not a copy)
+   */
+  public List<Page> getPageList() {
+    return pageList;
+  }
+
+  /**
+   * Two instances are equal when their page lists are equal.
+   */
+  @Override
+  public boolean equals(final Object o) {
+    if (this == o) {
+      return true;
+    }
+    if (o == null || getClass() != o.getClass()) {
+      return false;
+    }
+    final Pages pages = (Pages) o;
+    return Objects.equals(pageList, pages.pageList);
+  }
+
+  @Override
+  public int hashCode() {
+    return Objects.hash(pageList);
+  }
+
+  /**
+   * @return readable representation, so a failed comparison shows the content
+   */
+  @Override
+  public String toString() {
+    return "Pages{pageList=" + pageList + "}";
+  }
+}
diff --git a/...e-api/src/test/java/org/dspace/app/mediafilter/StructuredPdfTextExtractionFilterTest.java b/...e-api/src/test/java/org/dspace/app/mediafilter/StructuredPdfTextExtractionFilterTest.java
@@ -0,0 +1,120 @@
+/**
+ * The contents of this file are subject to the license and copyright
+ * detailed in the LICENSE and NOTICE files at the root of the source
+ * tree and available online at
+ *
+ * http://www.dspace.org/license/
+ */
+package org.dspace.app.mediafilter;
+
+import static org.junit.Assert.*;
+import static org.mockito.Mockito.*;
+
+import com.fasterxml.jackson.dataformat.xml.XmlMapper;
+import java.io.ByteArrayInputStream;
+import java.io.InputStream;
+
+import java.sql.SQLException;
+import java.util.ArrayList;
+import java.util.List;
+import org.dspace.app.mediafilter.model.Page;
+import org.dspace.app.mediafilter.model.Pages;
+import org.dspace.content.Bitstream;
+import org.dspace.content.BitstreamFormat;
+import org.dspace.content.Item;
+import org.dspace.core.Context;
+import org.dspace.services.ConfigurationService;
+import org.dspace.services.factory.DSpaceServicesFactory;
+import org.junit.Test;
+
+/**
+ * Unit tests for {@link StructuredPdfTextExtractionFilter}.
+ */
+public class StructuredPdfTextExtractionFilterTest {
+
+  private static final StructuredPdfTextExtractionFilter filter = new StructuredPdfTextExtractionFilter();
+  private static final XmlMapper xmlMapper = new XmlMapper();
+
+  @Test
+  public void testGetFilteredName() {
+    assertEquals("multipage_test.pdf.xml", filter.getFilteredName("multipage_test.pdf"));
+  }
+
+  @Test
+  public void testGetBundleName() {
+    assertEquals("STRUCTURED_TEXT", filter.getBundleName());
+  }
+
+  @Test
+  public void testGetFormatString() {
+    assertEquals("XML", filter.getFormatString());
+  }
+
+  @Test
+  public void testGetDescription() {
+    assertEquals("Extracted Structured Text", filter.getDescription());
+  }
+
+  /**
+   * Runs the filter on the bundled PDF and compares the parsed output with the
+   * parsed expected XML.
+   * <p>
+   * The expected file encodes CR+LF line breaks ({@code &#xd;}), while the
+   * extracted text presumably uses the line separator of the platform running
+   * the test, so both sides are normalized to LF before they are compared.
+   */
+  @Test
+  public void testGetDestinationStream() throws Exception {
+    Item item = mock(Item.class);
+
+    try (InputStream pdf = getMultiPagePDF();
+         InputStream expectedStream = getExpectedXml()) {
+      assertNotNull("multipage_test.pdf not found on the test classpath", pdf);
+      assertNotNull("multipage_expected_result.xml not found on the test classpath", expectedStream);
+
+      try (InputStream resultStream = filter.getDestinationStream(item, pdf, true)) {
+        assertNotNull(resultStream);
+
+        Pages expectedPages = xmlMapper.readValue(expectedStream, Pages.class);
+        Pages resultPages = xmlMapper.readValue(resultStream, Pages.class);
+
+        assertEquals(withUnixLineEndings(expectedPages), withUnixLineEndings(resultPages));
+      }
+    }
+  }
+
+  @Test
+  public void testPreProcessBitstream() throws SQLException {
+    Context context = mock(Context.class);
+    Item item = mock(Item.class);
+
+    Bitstream source = mock(Bitstream.class);
+    BitstreamFormat bsFormat = mock(BitstreamFormat.class);
+    when(source.getFormat(context)).thenReturn(bsFormat);
+    when(bsFormat.getMIMEType()).thenReturn("application/pdf");
+
+    assertTrue(filter.preProcessBitstream(context, item, source, true));
+
+    when(bsFormat.getMIMEType()).thenReturn("image/png");
+    assertFalse(filter.preProcessBitstream(context, item, source, true));
+  }
+
+  /** Returns a copy of {@code pages} with every CR+LF in the page texts replaced by LF. */
+  private static Pages withUnixLineEndings(Pages pages) {
+    List<Page> normalized = new ArrayList<>();
+    for (Page page : pages.getPageList()) {
+      String text = page.getText();
+      normalized.add(new Page(page.getPageNumber(), text == null ? null : text.replace("\r\n", "\n")));
+    }
+    return new Pages(normalized);
+  }
+
+  private InputStream getMultiPagePDF() {
+    return getClass().getResourceAsStream("multipage_test.pdf");
+  }
+
+  private InputStream getExpectedXml() {
+    return getClass().getResourceAsStream("multipage_expected_result.xml");
+  }
+
+}
diff --git a/dspace-api/src/test/resources/org/dspace/app/mediafilter/multipage_expected_result.xml b/dspace-api/src/test/resources/org/dspace/app/mediafilter/multipage_expected_result.xml
@@ -0,0 +1,27 @@
+<pages>
+  <page>
+    <pageNumber>1</pageNumber>
+    <text>A Text Extraction Test Document&#xd;
+for&#xd;
+DSpace&#xd;
+This is a text. For the next sixty seconds this software will conduct a test of the DSpace text&#xd;
+extraction facility.&#xd;
+This is only a text. This is a paragraph that followed the first that lived in the document that&#xd;
+Jack built.&#xd;
+Lorem ipsum dolor sit amet. The quick brown fox jumped over the lazy dog. Yow! Are we&#xd;
+having fun yet?&#xd;
+This has been a test of the DSpace text extraction system. In the event of actual content you&#xd;
+would care what is written here&#xd;
+</text>
+  </page>
+  <page>
+    <pageNumber>2</pageNumber>
+    <text>This is still a text.&#xd;
+This is only a text, but on a separate page. This is a paragraph that followed the first that&#xd;
+lived in the document that Jack built.&#xd;
+Lorem ipsum dolor sit amet. The quick brown fox jumped over the lazy dog.&#xd;
+This has been a test of the DSpace structured text extraction system. In the event of actual&#xd;
+content you would care what is written here&#xd;
+</text>
+  </page>
+</pages>
diff --git a/dspace-api/src/test/resources/org/dspace/app/mediafilter/multipage_test.pdf b/dspace-api/src/test/resources/org/dspace/app/mediafilter/multipage_test.pdf
diff --git a/dspace/config/dspace.cfg b/dspace/config/dspace.cfg
@@ -27,14 +27,15 @@ csvexport.dir = ${dspace.dir}/exports
 # NOTE: This URL must be accessible to all DSpace users (should not use 'localhost' in Production)
 # and is usually "synced" with the "rest" section in the DSpace User Interface's config.*.yml.
 # It corresponds to the URL that you would type into your browser to access the REST API.
-dspace.server.url = http://localhost:8080/server
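+# NOTE(review): 10.33.0.5 (like 192.168.1.7 before it) is a private-network address, presumably from a local test setup; the default for dspace.server.url and dspace.ui.url should stay 'localhost' - confirm before merging.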
+dspace.server.url = http://10.33.0.5:8080/server
 
 # Public URL of DSpace frontend (Angular UI). May require a port number if not using standard ports (80 or 443)
 # DO NOT end it with '/'.
 # This is used by the backend to provide links in emails, RSS feeds, Sitemaps, etc.
 # NOTE: this URL must be accessible to all DSpace users (should not use 'localhost' in Production).
 # It corresponds to the URL that you would type into your browser to access the User Interface.
-dspace.ui.url = http://localhost:4000
+dspace.ui.url = http://10.33.0.5:4000
 
 # Name of the site
 dspace.name = DSpace at My University
@@ -445,6 +446,7 @@ useProxies = true
 
 #Names of the enabled MediaFilter or FormatFilter plugins
 filter.plugins = Text Extractor
+#filter.plugins = Structured Text Extractor
 filter.plugins = JPEG Thumbnail
 filter.plugins = PDFBox JPEG Thumbnail
 
@@ -465,6 +467,7 @@ filter.plugins = PDFBox JPEG Thumbnail
 
 #Assign 'human-understandable' names to each filter
 plugin.named.org.dspace.app.mediafilter.FormatFilter = org.dspace.app.mediafilter.TikaTextExtractionFilter = Text Extractor
+#plugin.named.org.dspace.app.mediafilter.FormatFilter = org.dspace.app.mediafilter.StructuredPdfTextExtractionFilter = Structured Text Extractor
 plugin.named.org.dspace.app.mediafilter.FormatFilter = org.dspace.app.mediafilter.JPEGFilter = JPEG Thumbnail
 plugin.named.org.dspace.app.mediafilter.FormatFilter = org.dspace.app.mediafilter.BrandedPreviewJPEGFilter = Branded Preview JPEG
 plugin.named.org.dspace.app.mediafilter.FormatFilter = org.dspace.app.mediafilter.PDFBoxThumbnail = PDFBox JPEG Thumbnail
@@ -490,6 +493,7 @@ filter.org.dspace.app.mediafilter.TikaTextExtractionFilter.inputFormats = OpenDo
 filter.org.dspace.app.mediafilter.TikaTextExtractionFilter.inputFormats = OpenDocument Text
 filter.org.dspace.app.mediafilter.TikaTextExtractionFilter.inputFormats = RTF
 filter.org.dspace.app.mediafilter.TikaTextExtractionFilter.inputFormats = Text
+#filter.org.dspace.app.mediafilter.StructuredPdfTextExtractionFilter.inputFormats = Adobe PDF
 filter.org.dspace.app.mediafilter.JPEGFilter.inputFormats = BMP, GIF, JPEG, PNG
 filter.org.dspace.app.mediafilter.BrandedPreviewJPEGFilter.inputFormats = BMP, GIF, JPEG, PNG
 filter.org.dspace.app.mediafilter.ImageMagickImageThumbnailFilter.inputFormats = BMP, GIF, PNG, JPG, TIFF, JPEG, JPEG 2000