Apache tika 实现各种文档内容解析
1、依赖
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"><modelVersion>4.0.0</modelVersion><groupId>cn.js</groupId><artifactId>TikaResouce</artifactId><version>1.0-SNAPSHOT</version><properties><maven.compiler.source>8</maven.compiler.source><maven.compiler.target>8</maven.compiler.target><project.build.sourceEncoding>UTF-8</project.build.sourceEncoding></properties><parent><groupId>org.springframework.boot</groupId><artifactId>spring-boot-starter-parent</artifactId><version>2.7.0</version></parent><dependencyManagement><dependencies><dependency><groupId>org.apache.tika</groupId><artifactId>tika-bom</artifactId><version>2.8.0</version><type>pom</type><scope>import</scope></dependency></dependencies></dependencyManagement><dependencies><dependency><groupId>org.springframework.boot</groupId><artifactId>spring-boot-starter-web</artifactId></dependency><dependency><groupId>commons-fileupload</groupId><artifactId>commons-fileupload</artifactId><version>1.4</version></dependency><dependency><groupId>org.apache.tika</groupId><artifactId>tika-core</artifactId></dependency><dependency><groupId>org.apache.tika</groupId><artifactId>tika-parsers-standard-package</artifactId></dependency></dependencies></project>
2、配置文件
新建一个 tika-config.xml 文件
<?xml version="1.0" encoding="UTF-8"?>
<properties><encodingDetectors><encodingDetector class="org.apache.tika.parser.html.HtmlEncodingDetector"><params><param name="markLimit" type="int">64000</param></params></encodingDetector><encodingDetector class="org.apache.tika.parser.txt.UniversalEncodingDetector"><params><param name="markLimit" type="int">64001</param></params></encodingDetector><encodingDetector class="org.apache.tika.parser.txt.Icu4jEncodingDetector"><params><param name="markLimit" type="int">64002</param></params></encodingDetector></encodingDetectors>
</properties>
3、配置类
package cn.js.config;import java.io.IOException;
import java.io.InputStream;
import org.apache.tika.Tika;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.detect.Detector;
import org.apache.tika.exception.TikaException;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.Parser;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import org.springframework.core.io.Resource;
import org.springframework.core.io.ResourceLoader;
import org.xml.sax.SAXException;
@Configuration
public class MyTikaConfig {@Autowiredprivate ResourceLoader resourceLoader;@Beanpublic Tika tika() throws TikaException, IOException, SAXException {Resource resource = resourceLoader.getResource("classpath:tika-config.xml");InputStream inputStream = resource.getInputStream();TikaConfig config = new TikaConfig(inputStream);Detector detector = config.getDetector();Parser autoDetectParser = new AutoDetectParser(config);return new Tika(detector, autoDetectParser);}
}
controller
package cn.js.controller;import org.apache.tika.Tika;
import org.apache.tika.exception.TikaException;
import org.springframework.http.HttpRequest;
import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.bind.annotation.RestController;
import org.springframework.web.multipart.MultipartFile;import javax.annotation.Resource;
import java.io.IOException;
import java.io.InputStream;@RestController
@RequestMapping("/tika")
public class TikaController {@Resourceprivate Tika tika;@PostMapping("/pdf")public void TikaDemon(@RequestParam("file") MultipartFile file) throws IOException, TikaException {InputStream inputStream = file.getInputStream();String s = tika.parseToString(inputStream);System.out.println(s);}}