Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/main'
Browse files Browse the repository at this point in the history
  • Loading branch information
tballison committed Nov 3, 2022
2 parents 89f0821 + 9911dd9 commit d9040f4
Show file tree
Hide file tree
Showing 84 changed files with 261 additions and 222 deletions.
2 changes: 0 additions & 2 deletions tika-app/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -83,12 +83,10 @@
<dependency>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-core</artifactId>
<version>${log4j2.version}</version>
</dependency>
<dependency>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-slf4j2-impl</artifactId>
<version>${log4j2.version}</version>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
Expand Down
3 changes: 0 additions & 3 deletions tika-batch/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -49,12 +49,10 @@
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-compress</artifactId>
<version>${commons.compress.version}</version>
</dependency>
<dependency>
<groupId>commons-cli</groupId>
<artifactId>commons-cli</artifactId>
<version>${commons.cli.version}</version>
</dependency>

<dependency>
Expand All @@ -67,7 +65,6 @@
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-simple</artifactId>
<version>${slf4j.version}</version>
<scope>test</scope>
</dependency>
</dependencies>
Expand Down
3 changes: 0 additions & 3 deletions tika-bundles/tika-bundle-standard/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -107,22 +107,19 @@
<dependency>
<groupId>org.osgi</groupId>
<artifactId>org.osgi.core</artifactId>
<version>${osgi.core.version}</version>
<scope>test</scope>
</dependency>

<!-- use non-log4j slf4j backend to prevent main classloader from loading log4j classes -->
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-simple</artifactId>
<version>${slf4j.version}</version>
<scope>test</scope>
</dependency>

<dependency>
<groupId>org.glassfish.jaxb</groupId>
<artifactId>jaxb-runtime</artifactId>
<version>${jaxb.version}</version>
<scope>test</scope>
</dependency>
<dependency>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ public MediaType detect(InputStream input, Metadata metadata) throws IOException

input.mark(maxBytes);
try (TemporaryResources tmp = new TemporaryResources()) {
Path tmpFile = tmp.createTempFile();
Path tmpFile = tmp.createTempFile(metadata);
Files.copy(new BoundedInputStream(maxBytes, input), tmpFile, REPLACE_EXISTING);
return detectOnPath(tmpFile, metadata);
} finally {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ public void parse(
throws IOException, SAXException, TikaException {
TemporaryResources tmp = new TemporaryResources();
try {
TikaInputStream tis = TikaInputStream.get(stream, tmp);
TikaInputStream tis = TikaInputStream.get(stream, tmp, metadata);

// Figure out what we have to process
String filename = metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,8 @@ public void parseEmbedded(

// Use the delegate parser to parse this entry
try (TemporaryResources tmp = new TemporaryResources()) {
final TikaInputStream newStream = TikaInputStream.get(new CloseShieldInputStream(stream), tmp);
final TikaInputStream newStream =
TikaInputStream.get(new CloseShieldInputStream(stream), tmp, metadata);
if (stream instanceof TikaInputStream) {
final Object container = ((TikaInputStream) stream).getOpenContainer();
if (container != null) {
Expand Down
21 changes: 19 additions & 2 deletions tika-core/src/main/java/org/apache/tika/io/FilenameUtils.java
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@
import java.util.HashSet;
import java.util.Locale;

import org.apache.tika.utils.StringUtils;


public class FilenameUtils {

Expand Down Expand Up @@ -93,7 +95,7 @@ public static String normalize(final String name) {
public static String getName(final String path) {

if (path == null || path.length() == 0) {
return "";
return StringUtils.EMPTY;
}
int unix = path.lastIndexOf("/");
int windows = path.lastIndexOf("\\");
Expand All @@ -102,8 +104,23 @@ public static String getName(final String path) {
int colon = path.lastIndexOf(":");
String cand = path.substring(Math.max(colon, Math.max(unix, windows)) + 1);
if (cand.equals("..") || cand.equals(".")) {
return "";
return StringUtils.EMPTY;
}
return cand;
}

/**
 * Extracts a short file suffix, including the leading period (e.g. ".pdf"),
 * from the name portion of the given path.
 * <p>
 * The name portion is obtained with {@link #getName(String)}. The text from the
 * last period to the end of that name is returned only when it is shorter than
 * six characters (the period included); longer trailing segments are not
 * treated as a suffix.
 *
 * @param path the path or file name to inspect
 * @return the suffix or an empty string if one could not be found
 */
public static String getSuffixFromPath(String path) {
    final String fileName = getName(path);
    final int lastDot = fileName.lastIndexOf('.');
    if (lastDot < 0) {
        return StringUtils.EMPTY;
    }
    //arbitrary cap on how long an extension (period included) may be
    final int suffixLength = fileName.length() - lastDot;
    return suffixLength < 6 ? fileName.substring(lastDot) : StringUtils.EMPTY;
}
}
39 changes: 31 additions & 8 deletions tika-core/src/main/java/org/apache/tika/io/TemporaryResources.java
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,9 @@
import org.slf4j.LoggerFactory;

import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.utils.StringUtils;

/**
* Utility class for tracking and ultimately closing or otherwise disposing
Expand All @@ -52,7 +55,7 @@ public class TemporaryResources implements Closeable {

/**
* Sets the directory to be used for the temporary files created by
* the {@link #createTempFile()} method.
* the {@link #createTempFile(String)} method.
*
* @param tempFileDir temporary file directory,
* or <code>null</code> for the system default
Expand All @@ -63,7 +66,7 @@ public void setTemporaryFileDirectory(Path tempFileDir) {

/**
* Sets the directory to be used for the temporary files created by
* the {@link #createTempFile()} method.
* the {@link #createTempFile(String)} method.
*
* @param tempFileDir temporary file directory,
* or <code>null</code> for the system default
Expand All @@ -76,13 +79,15 @@ public void setTemporaryFileDirectory(File tempFileDir) {
/**
* Creates a temporary file that will automatically be deleted when
* the {@link #close()} method is called, returning its path.
*
* @param suffix -- the suffix of the file if known, starting with "." as in ".pdf"
* @return Path to created temporary file that will be deleted after closing
* @throws IOException
*/
public Path createTempFile() throws IOException {
final Path path = tempFileDir == null ? Files.createTempFile("apache-tika-", ".tmp") :
Files.createTempFile(tempFileDir, "apache-tika-", ".tmp");
public Path createTempFile(String suffix) throws IOException {
String actualSuffix = StringUtils.isBlank(suffix) ? ".tmp" : suffix;

final Path path = tempFileDir == null ? Files.createTempFile("apache-tika-", actualSuffix) :
Files.createTempFile(tempFileDir, "apache-tika-", actualSuffix);
addResource(() -> {
try {
Files.delete(path);
Expand All @@ -95,16 +100,34 @@ public Path createTempFile() throws IOException {
return path;
}

/**
 * Creates a temporary file without a known suffix by delegating to
 * {@link #createTempFile(String)} with an empty suffix.
 *
 * @return Path to created temporary file
 * @throws IOException propagated from {@link #createTempFile(String)}
 */
public Path createTempFile() throws IOException {
    final String unknownSuffix = StringUtils.EMPTY;
    return createTempFile(unknownSuffix);
}

/**
 * Creates a temporary file that will automatically be deleted when
 * the {@link #close()} method is called, returning its path.
 * <p>
 * The suffix of the temporary file is derived from the resource name stored
 * under {@link TikaCoreProperties#RESOURCE_NAME_KEY}. When the metadata is
 * <code>null</code> or carries no resource name, an empty suffix is passed
 * on to {@link #createTempFile(String)}.
 *
 * @param metadata metadata that may hold the resource name; may be <code>null</code>
 * @return Path to created temporary file that will be deleted after closing
 * @throws IOException propagated from {@link #createTempFile(String)}
 */
public Path createTempFile(Metadata metadata) throws IOException {
    //missing metadata is treated the same as a missing resource name
    if (metadata == null) {
        return createTempFile(StringUtils.EMPTY);
    }
    String resourceName = metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY);
    if (StringUtils.isBlank(resourceName)) {
        return createTempFile(StringUtils.EMPTY);
    }
    return createTempFile(FilenameUtils.getSuffixFromPath(resourceName));
}
/**
* Creates and returns a temporary file that will automatically be
* deleted when the {@link #close()} method is called.
*
* @return Created temporary file that'll be deleted after closing
* @throws IOException
* @see #createTempFile()
* @see #createTempFile(String)
*/
public File createTemporaryFile() throws IOException {
return createTempFile().toFile();
return createTempFile(StringUtils.EMPTY).toFile();
}

/**
Expand Down
50 changes: 40 additions & 10 deletions tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,10 @@ public class TikaInputStream extends TaggedInputStream {
private int consecutiveEOFs = 0;
private byte[] skipBuffer;

//suffix of the file if known. This is used to create temp files
//with the right suffixes. This should include the initial . as in ".doc"
private String suffix = null;

/**
* Creates a TikaInputStream instance. This private constructor is used
* by the static factory methods based on the available information.
Expand All @@ -131,13 +135,15 @@ private TikaInputStream(Path path) throws IOException {
this.path = path;
this.tmp = new TemporaryResources();
this.length = Files.size(path);
this.suffix = FilenameUtils.getSuffixFromPath(path.getFileName().toString());
}

private TikaInputStream(Path path, TemporaryResources tmp, long length) throws IOException {
super(new BufferedInputStream(Files.newInputStream(path)));
this.path = path;
this.tmp = tmp;
this.length = length;
this.suffix = FilenameUtils.getSuffixFromPath(path.getFileName().toString());
}

/**
Expand All @@ -154,6 +160,8 @@ private TikaInputStream(File file) throws FileNotFoundException {
this.path = file.toPath();
this.tmp = new TemporaryResources();
this.length = file.length();
this.suffix = FilenameUtils.getSuffixFromPath(path.getFileName().toString());

}

/**
Expand All @@ -168,11 +176,12 @@ private TikaInputStream(File file) throws FileNotFoundException {
* @param tmp tracker for temporary resources associated with this stream
* @param length total length of the stream, or -1 if unknown
*/
private TikaInputStream(InputStream stream, TemporaryResources tmp, long length) {
private TikaInputStream(InputStream stream, TemporaryResources tmp, long length, String suffix) {
super(stream);
this.path = null;
this.tmp = tmp;
this.length = length;
this.suffix = suffix;
}

/**
Expand Down Expand Up @@ -215,7 +224,7 @@ public static boolean isTikaInputStream(InputStream stream) {
* @return a TikaInputStream instance
* @since Apache Tika 0.10
*/
public static TikaInputStream get(InputStream stream, TemporaryResources tmp) {
public static TikaInputStream get(InputStream stream, TemporaryResources tmp, Metadata metadata) {
if (stream == null) {
throw new NullPointerException("The Stream must not be null");
}
Expand All @@ -227,17 +236,28 @@ public static TikaInputStream get(InputStream stream, TemporaryResources tmp) {
if (!(stream.markSupported())) {
stream = new BufferedInputStream(stream);
}
return new TikaInputStream(stream, tmp, -1);
return new TikaInputStream(stream, tmp, -1, getExtension(metadata));
}
}

/**
 * Casts or wraps the given stream to a TikaInputStream instance without
 * supplying any metadata. This simply delegates to
 * {@link TikaInputStream#get(InputStream, TemporaryResources, Metadata)}
 * with <code>null</code> metadata.
 *
 * @param stream the input stream to cast or wrap
 * @param tmp    tracker for temporary resources associated with the stream
 * @return a TikaInputStream instance
 * @deprecated use {@link TikaInputStream#get(InputStream, TemporaryResources, Metadata)}
 */
@Deprecated
public static TikaInputStream get(InputStream stream, TemporaryResources tmp) {
    final Metadata noMetadata = null;
    return get(stream, tmp, noMetadata);
}

/**
* Casts or wraps the given stream to a TikaInputStream instance.
* This method can be used to access the functionality of this class
* even when given just a normal input stream instance.
* <p>
* Use this method instead of the
* {@link #get(InputStream, TemporaryResources)} alternative when you
* {@link #get(InputStream, TemporaryResources, Metadata)} alternative when you
* <em>do</em> explicitly close the returned stream. The recommended
* access pattern is:
* <pre>
Expand All @@ -254,7 +274,7 @@ public static TikaInputStream get(InputStream stream, TemporaryResources tmp) {
* @return a TikaInputStream instance
*/
public static TikaInputStream get(InputStream stream) {
return get(stream, new TemporaryResources());
return get(stream, new TemporaryResources(), null);
}

/**
Expand Down Expand Up @@ -301,7 +321,7 @@ public static TikaInputStream get(byte[] data) {
public static TikaInputStream get(byte[] data, Metadata metadata) {
metadata.set(Metadata.CONTENT_LENGTH, Integer.toString(data.length));
return new TikaInputStream(new UnsynchronizedByteArrayInputStream(data), new TemporaryResources(),
data.length);
data.length, getExtension(metadata));
}

/**
Expand Down Expand Up @@ -414,7 +434,7 @@ public static TikaInputStream get(InputStreamFactory factory) throws IOException
*/
public static TikaInputStream get(InputStreamFactory factory, TemporaryResources tmp)
throws IOException {
TikaInputStream stream = get(factory.getInputStream(), tmp);
TikaInputStream stream = get(factory.getInputStream(), tmp, null);
stream.streamFactory = factory;
return stream;
}
Expand Down Expand Up @@ -451,6 +471,7 @@ public static TikaInputStream get(Blob blob) throws SQLException {
* @throws SQLException if BLOB data can not be accessed
*/
public static TikaInputStream get(Blob blob, Metadata metadata) throws SQLException {

long length = -1;
try {
length = blob.length();
Expand All @@ -465,8 +486,17 @@ public static TikaInputStream get(Blob blob, Metadata metadata) throws SQLExcept
return get(blob.getBytes(1, (int) length), metadata);
} else {
return new TikaInputStream(new BufferedInputStream(blob.getBinaryStream()),
new TemporaryResources(), length);
new TemporaryResources(), length,
getExtension(metadata));
}
}

/**
 * Looks up the resource name in the given metadata and returns its suffix
 * as computed by {@link FilenameUtils#getSuffixFromPath(String)}.
 *
 * @param metadata metadata to read the resource name from; may be <code>null</code>
 * @return the suffix, or an empty string when the metadata is <code>null</code>
 */
private static String getExtension(Metadata metadata) {
    return metadata == null
            ? StringUtils.EMPTY
            : FilenameUtils.getSuffixFromPath(
                    metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY));
}

/**
Expand Down Expand Up @@ -570,7 +600,7 @@ public static TikaInputStream get(URL url, Metadata metadata) throws IOException
}

return new TikaInputStream(new BufferedInputStream(connection.getInputStream()),
new TemporaryResources(), length);
new TemporaryResources(), length, getExtension(metadata));
}

/**
Expand Down Expand Up @@ -678,7 +708,7 @@ public Path getPath(int maxBytes) throws IOException {
if (position > 0) {
throw new IOException("Stream is already being read");
} else {
Path tmpFile = tmp.createTempFile();
Path tmpFile = tmp.createTempFile(suffix);
if (maxBytes > -1) {
try (InputStream lookAhead = new LookaheadInputStream(this, maxBytes)) {
Files.copy(lookAhead, tmpFile, REPLACE_EXISTING);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,7 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
}
TemporaryResources tmp = new TemporaryResources();
try {
TikaInputStream tis = TikaInputStream.get(stream, tmp);
TikaInputStream tis = TikaInputStream.get(stream, tmp, metadata);

//figure out if we should spool to disk
maybeSpool(tis, autoDetectParserConfig, metadata);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -287,7 +287,7 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
context.set(ParseRecord.class, parserRecord);
}
try {
TikaInputStream taggedStream = TikaInputStream.get(stream, tmp);
TikaInputStream taggedStream = TikaInputStream.get(stream, tmp, metadata);
TaggedContentHandler taggedHandler =
handler != null ? new TaggedContentHandler(handler) : null;
String parserClassname = ParserUtils.getParserClassname(parser);
Expand Down
Loading

0 comments on commit d9040f4

Please sign in to comment.