2019独角兽企业重金招聘Python工程师标准>>>
今天写了一段获取MIME类型的代码,对比用org.apache.tika和net.sf.jmimemagic。
jdk版本是1.8.
1.pom.xml:
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<!-- Initial build file for this walkthrough. jmimemagic is declared without any
     exclusions here; the article reports that in this state Tika fails at runtime
     with "Unable to parse the default media type registry". -->
<modelVersion>4.0.0</modelVersion>
<groupId>hui</groupId>
<artifactId>TestWithMaven</artifactId>
<version>0.0.1-SNAPSHOT</version>
<packaging>jar</packaging>
<name>TestWithMaven</name>
<url>http://maven.apache.org</url>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
</properties>
<dependencies>
<dependency>
<groupId>joda-time</groupId>
<artifactId>joda-time</artifactId>
<version>2.9.2</version>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
<version>1.7.13</version>
</dependency>
<dependency>
<groupId>org.apache.ibatis</groupId>
<artifactId>ibatis-core</artifactId>
<version>3.0</version>
</dependency>
<dependency>
<groupId>org.mybatis</groupId>
<artifactId>mybatis</artifactId>
<version>3.4.0</version>
</dependency>
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<version>5.1.38</version>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.12</version>
</dependency>
<dependency>
<groupId>org.hamcrest</groupId>
<artifactId>hamcrest-core</artifactId>
<version>1.3</version>
</dependency>
<!-- Supplies org.springframework.mail.javamail.ConfigurableMimeFileTypeMap,
     which FileUtils.getMimeTypeBySpring uses. -->
<dependency>
<groupId>org.springframework</groupId>
<artifactId>spring-context-support</artifactId>
<version>4.2.2.RELEASE</version>
</dependency>
<!-- Apache Tika core, used by FileUtils.getMimeByTika. -->
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-core</artifactId>
<version>1.13</version>
</dependency>
<!-- jMimeMagic, used by FileUtils.getMimeByJMimeMagic. Per the article, once this
     dependency is added an Xerces SAXParserFactory implementation shows up on the
     classpath and Tika's media type registry can no longer be loaded. -->
<dependency>
<groupId>net.sf.jmimemagic</groupId>
<artifactId>jmimemagic</artifactId>
<version>0.1.4</version>
</dependency>
<dependency>
<groupId>xml-apis</groupId>
<artifactId>xmlParserAPIs</artifactId>
<version>2.0.2</version>
</dependency>
</dependencies>
</project>
2.FileUtils.java:
package mime;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.FileNameMap;
import java.net.URLConnection;
import java.net.URLEncoder;
import javax.activation.MimetypesFileTypeMap;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import org.springframework.mail.javamail.ConfigurableMimeFileTypeMap;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import net.sf.jmimemagic.Magic;
import net.sf.jmimemagic.MagicException;
import net.sf.jmimemagic.MagicMatch;
import net.sf.jmimemagic.MagicMatchNotFoundException;
import net.sf.jmimemagic.MagicParseException;
/**
 * Side-by-side collection of strategies for determining the MIME type of a file:
 * the JavaBeans Activation file-type map, a hand-written extension table, Spring's
 * configurable file-type map, the JDK {@link FileNameMap}, Apache Tika and jMimeMagic.
 *
 * <p>All methods are static and take the file path (or file name) as a string.
 */
public class FileUtils {

    /**
     * Resolves the MIME type through {@link MimetypesFileTypeMap}, with an extra
     * mapping registered for {@code pdf}.
     *
     * @param path path of the file to inspect
     * @return the content type reported by the file-type map
     */
    public static String getMimeTypeByFileTypeMap(String path) {
        MimetypesFileTypeMap mimetypesFileTypeMap = new MimetypesFileTypeMap();
        // pdf is not registered by default: a .pdf would otherwise be reported as
        // application/octet-stream. application/xml is not registered either.
        mimetypesFileTypeMap.addMimeTypes("application/pdf pdf");
        File f = new File(path);
        return mimetypesFileTypeMap.getContentType(f);
    }

    /**
     * Resolves the MIME type from a small built-in extension table and falls back to
     * {@link MimetypesFileTypeMap} for every extension the table does not cover.
     *
     * @param path path or file name; the text from the last {@code '.'} onwards is
     *             treated as the extension
     * @return the MIME type; {@code application/octet-stream} when the path contains
     *         no {@code '.'}
     */
    public static String getMimeTypeByFileTypeMap2(String path) {
        int idx = path.lastIndexOf('.');
        if (idx == -1) {
            return "application/octet-stream";
        }
        // Locale.ROOT keeps the comparison independent of the default locale
        // (e.g. under a Turkish locale "I" does not lower-case to "i").
        String fileExtension = path.substring(idx).toLowerCase(java.util.Locale.ROOT);
        switch (fileExtension) {
            case ".html":
                return "text/html";
            case ".css":
                return "text/css";
            case ".js":
                return "application/javascript";
            case ".gif":
                return "image/gif";
            case ".png":
                return "image/png";
            case ".txt":
                return "text/plain";
            case ".xml":
                return "application/xml";
            case ".json":
                return "application/json";
            default:
                return new MimetypesFileTypeMap().getContentType(path);
        }
    }

    /**
     * Resolves the MIME type through Spring's {@link ConfigurableMimeFileTypeMap}.
     *
     * @param path path or file name to inspect
     * @return the content type reported by the Spring file-type map
     */
    public static String getMimeTypeBySpring(String path) {
        ConfigurableMimeFileTypeMap mimeMap = new ConfigurableMimeFileTypeMap();
        // No mapping for application/xml in this map.
        return mimeMap.getContentType(path);
    }

    /**
     * Resolves the MIME type through the {@link FileNameMap} obtained from
     * {@link URLConnection#getFileNameMap()}. The name is URL-encoded as UTF-8
     * before the lookup.
     *
     * @param fileUrl path or file name to inspect
     * @return the MIME type; {@code application/octet-stream} when the map has no
     *         entry; an empty string if UTF-8 encoding is reported as unsupported
     */
    public static String getMimeByFileNameMap(String fileUrl) {
        FileNameMap fileNameMap = URLConnection.getFileNameMap();
        try {
            String mimeType = fileNameMap
                    .getContentTypeFor(URLEncoder.encode(fileUrl, "UTF-8"));
            if (mimeType == null) {
                mimeType = "application/octet-stream";
            }
            return mimeType;
        } catch (UnsupportedEncodingException e) {
            e.printStackTrace();
            return "";
        }
    }

    /**
     * Resolves the MIME type by parsing the file with Apache Tika's
     * {@link AutoDetectParser} and reading {@code Content-Type} from the resulting
     * metadata. The names of all metadata entries are printed to standard output.
     *
     * @param fileUrl path of the file to parse
     * @return the detected content type, or {@code null} when the file cannot be read
     *         or parsed (the exception's stack trace is printed)
     */
    public static String getMimeByTika(String fileUrl) {
        String mimeType = null;
        try {
            ContentHandler contenthandler = new BodyContentHandler();
            Metadata metadata = new Metadata();
            metadata.add(Metadata.CONTENT_ENCODING, "utf-8");
            metadata.set(Metadata.RESOURCE_NAME_KEY, fileUrl);
            // Author's note: with "new DefaultParser()" no MIME type was obtained,
            // hence AutoDetectParser.
            Parser parser = new AutoDetectParser();
            ParseContext context = new ParseContext();
            context.set(Parser.class, parser);
            // The parser consumes the stream but leaves closing it to the caller,
            // so the stream is scoped with try-with-resources.
            try (FileInputStream in = new FileInputStream(fileUrl)) {
                parser.parse(in, contenthandler, metadata, context);
            }
            // Diagnostic output: list every metadata key Tika produced.
            for (String name : metadata.names()) {
                System.out.println(name);
            }
            mimeType = metadata.get(Metadata.CONTENT_TYPE);
        } catch (IOException | TikaException | SAXException e) {
            e.printStackTrace();
        }
        return mimeType;
    }

    /**
     * Resolves the MIME type with jMimeMagic.
     *
     * @param fileUrl path of the file to inspect
     * @return the MIME type of the match, or an empty string when jMimeMagic fails or
     *         finds no match (the exception's stack trace is printed)
     */
    public static String getMimeByJMimeMagic(String fileUrl) {
        try {
            // NOTE(review): the boolean flag presumably enables extension hints;
            // confirm against the jMimeMagic API.
            MagicMatch match = Magic.getMagicMatch(new File(fileUrl), true);
            return match.getMimeType();
        } catch (MagicParseException | MagicMatchNotFoundException | MagicException e) {
            e.printStackTrace();
        }
        return "";
    }
}
3.MIMETest.java:
package mime;
/**
 * Command-line driver that prints, for one hard-coded path, the MIME type reported
 * by each lookup strategy in {@code FileUtils}.
 */
public class MIMETest {

    /**
     * Runs every {@code FileUtils} lookup against the selected path and prints one
     * line per strategy.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        // Files located in the src resources directory
        // String fileName = "funds.properties";
        String fileName = "createPerson.sql";
        // String path = getPath(fileName);
        // Absolute paths
        // String path = "E:/test/process.txt";
        // String path = "E:/test/02.jpg";
        // String path = "E:/Anheng/receiver-design.pdf";
        // String path = "E:/api/dom4j.chm";
        // String path = "E:/eclipse/ajax/pom.xml";
        // String path = "E:/test/person.json";
        // String path = "E:/test/file.java";
        // String path = "E:/test/static.ftl";
        // String path = "E:/test/rest.jerseySpring.war";
        // String path = "E:/test/upload/myeclipse.exe";
        String path = "E:/test/upload/myeclipse.ini";

        report("getMimeTypeByFileTypeMap", path, FileUtils.getMimeTypeByFileTypeMap(path));
        report("getMimeTypeByFileTypeMap2", path, FileUtils.getMimeTypeByFileTypeMap2(path));
        report("getMimeTypeBySpring", path, FileUtils.getMimeTypeBySpring(path));
        report("getMimeByFileNameMap", path, FileUtils.getMimeByFileNameMap(path));

        /*
         * Author's observations, Tika versus the four lookups above:
         * - Tika validates that the path actually exists.
         * - .properties -> text/x-java-properties; the others only give application/octet-stream.
         * - .sql  -> text/x-sql; the others only give application/octet-stream.
         * - .json -> application/json; the others give application/octet-stream unless
         *            that type has been registered explicitly.
         * - .java -> text/x-java-source; the first two give application/octet-stream,
         *            the last two give text/plain.
         * - .ftl  -> text/html; the others give application/octet-stream.
         * - .war  -> application/x-tika-java-web-archive; the others give application/octet-stream.
         * - .exe  -> application/x-dosexec; the others give application/octet-stream.
         * - .ini  -> text/x-ini; the others give application/octet-stream.
         */
        report("getMimeByTika", path, FileUtils.getMimeByTika(path));
        report("getMimeByJMimeMagic", path, FileUtils.getMimeByJMimeMagic(path));
    }

    /** Prints one result line in the form "method: Mime Type of path is mimeType". */
    private static void report(String method, String path, String mimeType) {
        System.out.println(method + ": Mime Type of " + path + " is " + mimeType);
    }

    /**
     * Builds the path of a file under {@code src/main/resources}, relative to the
     * current working directory ({@code user.dir}).
     *
     * @param fileName name of the resource file
     * @return working directory, resources sub-path and file name joined with the
     *         platform file separator
     */
    private static String getPath(String fileName) {
        String workingDir = System.getProperty("user.dir");
        String sep = System.getProperty("file.separator");
        StringBuilder location = new StringBuilder();
        location.append(workingDir);
        location.append(sep).append("src");
        location.append(sep).append("main");
        location.append(sep).append("resources");
        location.append(sep).append(fileName);
        return location.toString();
    }
}
原本只测试Tika,即不加入jmimemagic的依赖时,测试正常,后来加入jmimemagic依赖,报错如下:
Exception in thread "main" java.lang.RuntimeException: Unable to parse the default media type registry
at org.apache.tika.mime.MimeTypes.getDefaultMimeTypes(MimeTypes.java:580)
at org.apache.tika.config.TikaConfig.getDefaultMimeTypes(TikaConfig.java:69)
at org.apache.tika.config.TikaConfig.<init>(TikaConfig.java:218)
at org.apache.tika.config.TikaConfig.getDefaultConfig(TikaConfig.java:341)
at org.apache.tika.parser.AutoDetectParser.<init>(AutoDetectParser.java:51)
at mime.FileUtils.getMimeByTika(FileUtils.java:103)
at mime.MIMETest.main(MIMETest.java:48)
Caused by: org.apache.tika.mime.MimeTypeException: Invalid type configuration
at org.apache.tika.mime.MimeTypesReader.read(MimeTypesReader.java:126)
at org.apache.tika.mime.MimeTypesFactory.create(MimeTypesFactory.java:64)
at org.apache.tika.mime.MimeTypesFactory.create(MimeTypesFactory.java:93)
at org.apache.tika.mime.MimeTypesFactory.create(MimeTypesFactory.java:170)
at org.apache.tika.mime.MimeTypes.getDefaultMimeTypes(MimeTypes.java:577)
... 6 more
Caused by: org.xml.sax.SAXNotRecognizedException: http://javax.xml.XMLConstants/feature/secure-processing
at org.apache.xerces.parsers.AbstractSAXParser.setFeature(Unknown Source)
at org.apache.xerces.jaxp.SAXParserImpl.setFeatures(Unknown Source)
at org.apache.xerces.jaxp.SAXParserImpl.<init>(Unknown Source)
at org.apache.xerces.jaxp.SAXParserFactoryImpl.newSAXParserImpl(Unknown Source)
at org.apache.xerces.jaxp.SAXParserFactoryImpl.setFeature(Unknown Source)
at org.apache.tika.mime.MimeTypesReader.read(MimeTypesReader.java:119)
... 10 more
按照错误提示,在FileUtils.java:103即getMimeByTika方法下的Parser parser = new AutoDetectParser();处打断点,对比加入net.sf.jmimemagic依赖前后的异常原因,发现了下面的现象:
加net.sf.jmimemagic前,javax.xml.parsers.SAXParserFactory的实现类是com.sun.org.apache.xerces.internal.jaxp.SAXParserFactoryImpl,该类位于jdk自带的jar包
rt.jar-->com.sun.org.apache.xerces.internal.jaxp.SAXParserFactoryImpl.class下,如图:
而加net.sf.jmimemagic后,javax.xml.parsers.SAXParserFactory的实现类变成了
xercesImpl-2.2.4.0.jar-->org.apache.xerces.jaxp.SAXParserFactoryImpl.class,
如图:
该类在setFeature()时抛出了异常(它不识别secure-processing这一特性)。即类路径上同时存在两个SAXParserFactory的实现类,而被选中的是xercesImpl中的旧实现,因此产生冲突并报了异常。故我们将xercesImpl-2.2.4.0.jar排除掉即可,修改后的pom.xml如下所示:
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<!-- Corrected build file: identical to the initial one except for the xercesImpl
     exclusion on the jmimemagic dependency. -->
<modelVersion>4.0.0</modelVersion>
<groupId>hui</groupId>
<artifactId>TestWithMaven</artifactId>
<version>0.0.1-SNAPSHOT</version>
<packaging>jar</packaging>
<name>TestWithMaven</name>
<url>http://maven.apache.org</url>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
</properties>
<dependencies>
<dependency>
<groupId>joda-time</groupId>
<artifactId>joda-time</artifactId>
<version>2.9.2</version>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
<version>1.7.13</version>
</dependency>
<dependency>
<groupId>org.apache.ibatis</groupId>
<artifactId>ibatis-core</artifactId>
<version>3.0</version>
</dependency>
<dependency>
<groupId>org.mybatis</groupId>
<artifactId>mybatis</artifactId>
<version>3.4.0</version>
</dependency>
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<version>5.1.38</version>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.12</version>
</dependency>
<dependency>
<groupId>org.hamcrest</groupId>
<artifactId>hamcrest-core</artifactId>
<version>1.3</version>
</dependency>
<!-- Supplies org.springframework.mail.javamail.ConfigurableMimeFileTypeMap,
     which FileUtils.getMimeTypeBySpring uses. -->
<dependency>
<groupId>org.springframework</groupId>
<artifactId>spring-context-support</artifactId>
<version>4.2.2.RELEASE</version>
</dependency>
<!-- Apache Tika core, used by FileUtils.getMimeByTika. -->
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-core</artifactId>
<version>1.13</version>
</dependency>
<!-- jMimeMagic, used by FileUtils.getMimeByJMimeMagic. -->
<dependency>
<groupId>net.sf.jmimemagic</groupId>
<artifactId>jmimemagic</artifactId>
<version>0.1.4</version>
<!-- Keep the Xerces implementation that arrives with jmimemagic off the classpath
     so that the JDK's built-in SAXParserFactory is selected again. The article
     reports that this removes the SAXNotRecognizedException raised while Tika
     loads its media type registry. -->
<exclusions>
<exclusion>
<groupId>xerces</groupId>
<artifactId>xercesImpl</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>xml-apis</groupId>
<artifactId>xmlParserAPIs</artifactId>
<version>2.0.2</version>
</dependency>
</dependencies>
</project>
至此,再运行,则各方法都不再抛异常。