@@ -1,16 +1,16 @@
package edu.harvard.iq.dataverse.datafile;

import edu.harvard.iq.dataverse.common.files.mime.MimeTypes;
import edu.harvard.iq.dataverse.ingest.IngestableDataChecker;
import edu.harvard.iq.dataverse.settings.SettingsServiceBean;
import edu.harvard.iq.dataverse.util.JhoveFileType;
import edu.harvard.iq.dataverse.util.ShapefileHandler;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.tika.Tika;
import org.apache.tika.mime.MediaType;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import static java.util.Collections.emptyMap;

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.zip.GZIPInputStream;

import javax.activation.MimetypesFileTypeMap;
import javax.ejb.EJBException;
@@ -21,14 +21,20 @@
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.zip.GZIPInputStream;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.tika.detect.Detector;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import edu.harvard.iq.dataverse.common.files.mime.MimeTypes;
import edu.harvard.iq.dataverse.ingest.IngestableDataChecker;
import edu.harvard.iq.dataverse.settings.SettingsServiceBean;
import edu.harvard.iq.dataverse.util.JhoveFileType;
import edu.harvard.iq.dataverse.util.ShapefileHandler;

/**
* our check is fairly weak (it appears to be hard to really
@@ -83,7 +89,7 @@ public String determineFileType(File f, String fileName) throws IOException {

// step 3: check the mime type of this file with Tika
if (!isContentTypeSpecificEnough(fileType)) {
fileType = new Tika().detect(f);
fileType = detectWithTika(f);
}


@@ -154,6 +160,17 @@ public String detectTabularFileType(File file, String fallbackContentType) {
}

// -------------------- PRIVATE --------------------

private String detectWithTika(final File file) throws IOException {
final AutoDetectParser parser = new AutoDetectParser();
parser.setParsers(emptyMap()); // disable all parsing
final Detector detector = parser.getDetector();
final Metadata metadata = new Metadata();

try (final InputStream is = new BufferedInputStream(new FileInputStream(file))) {
return detector.detect(is, metadata).toString();
}
}
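
For context, the new helper above runs Tika in detection-only mode: clearing the parser map guarantees that `detect()` never hands the stream to a content parser. A minimal standalone sketch of the same pattern, assuming the same old Tika line used elsewhere in this codebase; the class name is illustrative, not part of this PR:

```java
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Collections;

import org.apache.tika.detect.Detector;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;

// Illustrative sketch of the detection-only Tika pattern used in detectWithTika().
public class TikaDetectionSketch {

    public static String detect(File file) throws IOException {
        AutoDetectParser parser = new AutoDetectParser();
        parser.setParsers(Collections.emptyMap()); // detection only, no content parsing
        Detector detector = parser.getDetector();

        // Tika's default detectors read a bounded prefix of the stream via
        // mark/reset, so the stream must support marking; BufferedInputStream does.
        try (InputStream is = new BufferedInputStream(new FileInputStream(file))) {
            Metadata metadata = new Metadata();
            return detector.detect(is, metadata).toString();
        }
    }
}
```

One behavioral difference worth noting: `new Tika().detect(File)` also passes the file name to the detector as an extension hint, while the detector-only variant above inspects content alone unless the name is added to the `Metadata` explicitly.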

private boolean isContentTypeSpecificEnough(String contentType) {
return !"text/plain".equals(contentType) && !"application/octet-stream".equals(contentType);
@@ -14,7 +14,6 @@
import static javax.ejb.TransactionAttributeType.REQUIRES_NEW;

import java.io.IOException;
import java.io.InputStream;
import java.sql.Timestamp;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
@@ -48,20 +47,12 @@
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.SolrInputDocument;
import org.apache.tika.io.IOUtils;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.ContentHandler;

import edu.harvard.iq.dataverse.DatasetLinkingServiceBean;
import edu.harvard.iq.dataverse.DataverseDao;
import edu.harvard.iq.dataverse.DvObjectServiceBean;
import edu.harvard.iq.dataverse.citation.CitationFactory;
import edu.harvard.iq.dataverse.common.DatasetFieldConstant;
import edu.harvard.iq.dataverse.dataaccess.DataAccess;
import edu.harvard.iq.dataverse.dataaccess.StorageIO;
import edu.harvard.iq.dataverse.dataset.DatasetService;
import edu.harvard.iq.dataverse.dataverse.DataverseLinkingService;
import edu.harvard.iq.dataverse.persistence.DvObject;
@@ -90,7 +81,6 @@
import edu.harvard.iq.dataverse.search.periodo.PeriodoDataFinder;
import edu.harvard.iq.dataverse.search.query.SearchObjectType;
import edu.harvard.iq.dataverse.search.query.SearchPublicationStatus;
import edu.harvard.iq.dataverse.settings.SettingsServiceBean;
import edu.harvard.iq.dataverse.util.FileUtil;
import edu.harvard.iq.dataverse.util.StringUtil;
import edu.harvard.iq.dataverse.util.SystemConfig;
@@ -117,11 +107,9 @@ public class IndexServiceBean {
private SolrIndexServiceBean solrIndexService;
private DatasetLinkingServiceBean dsLinkingService;
private DataverseLinkingService dvLinkingService;
private SettingsServiceBean settingsService;
private SolrClient solrServer;
private CitationFactory citationFactory;

private DataAccess dataAccess = DataAccess.dataAccess();
private GeoboxIndexUtil geoboxIndexUtil = new GeoboxIndexUtil();
private GeoNameDataFinder geonames;
private PeriodoDataFinder periods;
@@ -138,7 +126,6 @@ public IndexServiceBean(DvObjectServiceBean dvObjectService,
SolrIndexServiceBean solrIndexService,
DatasetLinkingServiceBean dsLinkingService,
DataverseLinkingService dvLinkingService,
SettingsServiceBean settingsService,
SolrClient solrServer,
CitationFactory citationFactory,
GeoNameDataFinder geonames,
@@ -150,7 +137,6 @@ public IndexServiceBean(DvObjectServiceBean dvObjectService,
this.solrIndexService = solrIndexService;
this.dsLinkingService = dsLinkingService;
this.dvLinkingService = dvLinkingService;
this.settingsService = settingsService;
this.solrServer = solrServer;
this.citationFactory = citationFactory;
this.geonames = geonames;
@@ -850,8 +836,6 @@ private String addOrUpdateDataset(IndexableDataset indexableDataset) {
&& dsf.getValues().get(0) != null && solrFieldSearchable != null) {

logger.fine("indexing " + dsf.getTypeName() + ':' + dsf.getValues() + " into " + solrFieldSearchable + " and maybe " + solrFieldFacetable);
// if (dsfType.getSolrField().getSolrType().equals(SolrField.SolrType.INTEGER))
// {
if (SolrField.SolrType.EMAIL.equals(dsfSolrField.getSolrType())) {
// no-op. we want to keep email address out of Solr per
// https://github.com/IQSS/dataverse/issues/759
@@ -870,12 +854,8 @@ private String addOrUpdateDataset(IndexableDataset indexableDataset) {
SimpleDateFormat yearOnly = new SimpleDateFormat("yyyy");
String datasetFieldFlaggedAsDate = yearOnly.format(dateAsDate);
logger.fine("YYYY only: " + datasetFieldFlaggedAsDate);
// solrInputDocument.addField(solrFieldSearchable,
// Integer.parseInt(datasetFieldFlaggedAsDate));
solrInputDocument.addField(solrFieldSearchable, dateAsString);
if (dsfSolrField.isFacetable()) {
// solrInputDocument.addField(solrFieldFacetable,
// Integer.parseInt(datasetFieldFlaggedAsDate));
solrInputDocument.addField(solrFieldFacetable, datasetFieldFlaggedAsDate);
}
} catch (Exception ex) {
@@ -981,10 +961,6 @@ private String addOrUpdateDataset(IndexableDataset indexableDataset) {
/*
File Indexing
*/
boolean doFullTextIndexing = settingsService.isTrueForKey(SettingsServiceBean.Key.SolrFullTextIndexing);
Long maxFTIndexingSize = settingsService.getValueForKeyAsLong(SettingsServiceBean.Key.SolrMaxFileSizeForFullTextIndexing);
long maxSize = (maxFTIndexingSize != 0) ? maxFTIndexingSize : Long.MAX_VALUE; // unset (0) means no size limit

List<String> filesIndexed = new ArrayList<>();
if (datasetVersion != null) {
List<FileMetadata> fileMetadatas = datasetVersion.getFileMetadatas();
@@ -1052,55 +1028,6 @@ private String addOrUpdateDataset(IndexableDataset indexableDataset) {
datafileSolrInputDocument.addField(SearchFields.EMBARGO_UNTIL, dataset.getEmbargoDate().get());
}

/* Full-text indexing using Apache Tika */
if (doFullTextIndexing) {
Contributor:

Will we add full-text indexing for files in some different way, or do we not see usage of this feature?

Contributor (Author):

We don't use this feature in any installation. We don't need it. That's why the decision was to remove it.

if (!dataset.isHarvested() && !fileMetadata.isFileUseRestricted()
&& !fileMetadata.getDataFile().isFilePackage()) {
StorageIO<DataFile> accessObject;
InputStream instream = null;
ContentHandler textHandler;
try {
accessObject = dataAccess.getStorageIO(fileMetadata.getDataFile());
if (accessObject != null) {
accessObject.open();
// If the size is >max, we don't use the stream. However, for S3, the stream is
// currently opened in the call above (see
// https://github.com/IQSS/dataverse/issues/5165), so we want to get a handle so
// we can close it below.
instream = accessObject.getInputStream();
if (accessObject.getSize() <= maxSize) {
AutoDetectParser autoParser = new AutoDetectParser();
textHandler = new BodyContentHandler(-1);
Metadata metadata = new Metadata();
ParseContext context = new ParseContext();
/*
* Try parsing the file. Note that, other than by limiting size, there's been no
* check to see whether this file is a good candidate for text extraction (e.g.
* based on type).
*/
autoParser.parse(instream, textHandler, metadata, context);
datafileSolrInputDocument.addField(SearchFields.FULL_TEXT,
textHandler.toString());
}
}
} catch (Exception e) {
// Needs better logging of what went wrong in order to
// track down "bad" documents.
logger.warning(String.format("Full-text indexing for %s failed",
fileMetadata.getDataFile().getDisplayName()));
e.printStackTrace();
continue;
} catch (OutOfMemoryError e) {
textHandler = null;
logger.warning(String.format("Full-text indexing for %s failed due to OutOfMemoryError",
fileMetadata.getDataFile().getDisplayName()));
continue;
} finally {
IOUtils.closeQuietly(instream);
}
}
}
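
For reference, the block removed above is Tika's generic parse-to-text flow, gated by the `doFullTextIndexing` flag and the `maxSize` limit read from settings earlier. A minimal standalone sketch of that flow, without the Dataverse storage plumbing; the class name and file-path parameter are illustrative:

```java
import java.io.BufferedInputStream;
import java.io.FileInputStream;
import java.io.InputStream;

import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.BodyContentHandler;

// Illustrative sketch of the full-text extraction pattern this PR removes.
public class TikaFullTextSketch {

    public static String extractText(String path) throws Exception {
        AutoDetectParser parser = new AutoDetectParser();
        // -1 lifts BodyContentHandler's default 100,000-character write limit;
        // buffering the whole text in memory is a likely reason the removed
        // block also had to catch OutOfMemoryError.
        BodyContentHandler textHandler = new BodyContentHandler(-1);
        Metadata metadata = new Metadata();
        ParseContext context = new ParseContext();

        try (InputStream in = new BufferedInputStream(new FileInputStream(path))) {
            // AutoDetectParser detects the type, then delegates to the matching
            // parser (PDF, Office formats, plain text, ...).
            parser.parse(in, textHandler, metadata, context);
        }
        return textHandler.toString();
    }
}
```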

String filenameCompleteFinal = "";
String filenameComplete = fileMetadata.getLabel();
if (filenameComplete != null) {
@@ -1186,7 +1113,6 @@ private String addOrUpdateDataset(IndexableDataset indexableDataset) {
if (indexableDataset.getDatasetState().equals(IndexableDataset.DatasetState.PUBLISHED)) {
fileSolrDocId = solrDocIdentifierFile + fileEntityId;
datafileSolrInputDocument.addField(SearchFields.PUBLICATION_STATUS, SearchPublicationStatus.PUBLISHED.getSolrValue());
// datafileSolrInputDocument.addField(SearchFields.PERMS, publicGroupString);
addDatasetReleaseDateToSolrDoc(datafileSolrInputDocument, dataset);
} else if (indexableDataset.getDatasetState().equals(IndexableDataset.DatasetState.WORKING_COPY)) {
fileSolrDocId = solrDocIdentifierFile + fileEntityId + indexableDataset.getDatasetState().getSuffix();
@@ -1203,11 +1129,6 @@ private String addOrUpdateDataset(IndexableDataset indexableDataset) {
datafileSolrInputDocument.addField(SearchFields.FILE_TYPE_SEARCHABLE, FileUtil.getFacetFileTypeForIndex(fileMetadata.getDataFile(), Locale.ENGLISH));
datafileSolrInputDocument.addField(SearchFields.FILE_SIZE_IN_BYTES, fileMetadata.getDataFile().getFilesize());
if (DataFile.ChecksumType.MD5.equals(fileMetadata.getDataFile().getChecksumType())) {
/*
@todo Someday we should probably deprecate this
* FILE_MD5 in favor of a combination of
* FILE_CHECKSUM_TYPE and FILE_CHECKSUM_VALUE.
*/
datafileSolrInputDocument.addField(SearchFields.FILE_MD5, fileMetadata.getDataFile().getChecksumValue());
}
datafileSolrInputDocument.addField(SearchFields.FILE_CHECKSUM_TYPE, fileMetadata.getDataFile().getChecksumType().toString());
@@ -1217,10 +1138,6 @@ private String addOrUpdateDataset(IndexableDataset indexableDataset) {
datafileSolrInputDocument.addField(SearchFields.FILE_PERSISTENT_ID, fileMetadata.getDataFile().getGlobalId().toString());
datafileSolrInputDocument.addField(SearchFields.UNF, fileMetadata.getDataFile().getUnf());
datafileSolrInputDocument.addField(SearchFields.SUBTREE, dataversePaths);
// datafileSolrInputDocument.addField(SearchFields.HOST_DATAVERSE,
// dataFile.getOwner().getOwner().getName());
// datafileSolrInputDocument.addField(SearchFields.PARENT_NAME,
// dataFile.getDataset().getTitle());
datafileSolrInputDocument.addField(SearchFields.PARENT_ID, fileMetadata.getDataFile().getOwner().getId());
datafileSolrInputDocument.addField(SearchFields.PARENT_IDENTIFIER, fileMetadata.getDataFile().getOwner().getGlobalId().toString());
for (Locale locale : configuredLocales) {
@@ -1494,7 +1411,6 @@ private List<String> findFilesOfParentDataset(long parentDatasetId) throws SearchException {
solrQuery.setFields(SearchFields.ID);
solrQuery.setRows(Integer.MAX_VALUE);
solrQuery.addFilterQuery(SearchFields.PARENT_ID + ':' + parentDatasetId);
// todo "files" should be a constant
solrQuery.addFilterQuery(SearchFields.TYPE + ':' + SearchObjectType.FILES.getSolrValue());
List<String> dvObjectInSolrOnly = new ArrayList<>();
QueryResponse queryResponse;