当前位置: 首页 > news >正文

Java实现pdf/word文字识别,调用OCR提取图片文字聚合

/**
 * REST entry point for text recognition on uploaded PDF / Word documents.
 *
 * <p>All work is delegated to {@link OcrService}; this class only binds the HTTP request.
 */
@Tag(name = "pdf/word/图片文字识别")
public class OcrController extends BaseController {

    @Autowired
    private OcrService ocrService;

    // Injected but not referenced by any handler in this class.
    @Autowired
    private BaiduOcrServiceImpl baiduOcrService;

    /**
     * Recognizes the text contained in an uploaded PDF or Word file.
     *
     * @param file the uploaded document, bound from the multipart request part named {@code file}
     * @return whatever {@link OcrService#recognizeText(MultipartFile)} returns for the file
     */
    @PostMapping("/recognize-text")
    @Operation(summary = "pdf/word识别文字", description = "识别")
    public String recognizeText(@RequestParam("file") MultipartFile file) {
        final String recognized = ocrService.recognizeText(file);
        return recognized;
    }
}
package com.jt.console.service.impl;import com.jt.common.beans.ServiceAssert;
import com.jt.console.service.OcrService;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageTree;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.graphics.PDXObject;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.openxml4j.util.ZipSecureFile;
import org.apache.poi.xwpf.usermodel.*;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;
import org.springframework.web.multipart.MultipartFile;import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URLEncoder;
import java.util.Base64;import static com.jt.console.service.impl.BaiduOcrServiceImpl.formatOcrResult;/*** pdf/word/图片识别* @author chenchao* @date 2024/8/12 16:17*/
@Service
public class OcrServiceImpl implements OcrService {@Autowiredprivate BaiduOcrServiceImpl baiduOcrService;/*** 对于一些表格和公式的处理会有识别错乱问题* 识别上传文件中的文本内容* @param file 上传的文件* @return 提取的文本内容或错误信息*/@Overridepublic String recognizeText(MultipartFile file) {String contentType = file.getContentType();if (contentType == null) {ServiceAssert.isTrue(false, "文件类型不支持");return null;}InputStream inputStream = null;try {inputStream = file.getInputStream();if (contentType.equals("application/pdf")) {return extractTextFromPdf(inputStream);} else if (contentType.equals("application/vnd.openxmlformats-officedocument.wordprocessingml.document") ||contentType.equals("application/x-tika-ooxml")) {return extractTextFromDocx(inputStream);} else if (contentType.equals("application/msword")) {return extractTextFromDoc(inputStream);} else {ServiceAssert.isTrue(false, "不支持的文件类型");return null;}} catch (Exception e) {e.printStackTrace();ServiceAssert.isTrue(false, "处理文件出错");return null;} finally {if (inputStream != null) {try {inputStream.close();} catch (IOException e) {e.printStackTrace();}}}}/*** 从 PDF 文件中提取文本内容* @param inputStream PDF 文件的输入流* @return 提取的文本内容* @throws IOException 读取文件时发生的异常*/private String extractTextFromPdf(InputStream inputStream) throws IOException {StringBuilder text = new StringBuilder();try (PDDocument document = PDDocument.load(inputStream)) {// 禁止显示与 CMap 表相关的特定警告System.setProperty("org.apache.pdfbox.logging.SILENT", "true");PDFTextStripper pdfStripper = new PDFTextStripper();text.append(pdfStripper.getText(document));// 如果您需要从 PDF 中提取图像,请取消注释下面的行// extractImagesFromPdf(document);}return text.toString();}/*** 从 DOCX 文件中提取文本内容* @param inputStream DOCX 文件的输入流* @return 提取的文本内容* @throws IOException 读取文件时发生的异常*/private String extractTextFromDocx(InputStream inputStream) throws IOException {StringBuilder text = new StringBuilder();ZipSecureFile.setMinInflateRatio(0.001); // For safetytry (XWPFDocument document = new XWPFDocument(inputStream)) {// Extract text from 
paragraphsdocument.getParagraphs().forEach(paragraph -> text.append(paragraph.getText()).append("\n"));// Extract text from tablesfor (XWPFTable table : document.getTables()) {for (XWPFTableRow row : table.getRows()) {for (XWPFTableCell cell : row.getTableCells()) {text.append(cell.getText()).append("\t");}text.append("\n");}}// 如果您需要从 DOCX 中提取图像,请取消注释下面的行// extractImagesFromDocx(document);}return text.toString();}/*** 从 DOC 文件中提取文本内容* @param inputStream DOC 文件的输入流* @return 提取的文本内容* @throws IOException 读取文件时发生的异常*/private String extractTextFromDoc(InputStream inputStream) throws IOException {StringBuilder text = new StringBuilder();try (HWPFDocument document = new HWPFDocument(inputStream)) {WordExtractor extractor = new WordExtractor(document);String[] paragraphs = extractor.getParagraphText();for (String paragraph : paragraphs) {text.append(paragraph).append("\n");}}return text.toString();}/*** 从 PDF 文件中提取图片* @param document PDF 文档对象* @throws IOException 读取文件时发生的异常*/private void extractImagesFromPdf(PDDocument document) throws IOException {PDPageTree pages = document.getPages();int imageCounter = 0;for (PDPage page : pages) {PDResources resources = page.getResources();for (COSName xObjectName : resources.getXObjectNames()) {PDXObject xObject = resources.getXObject(xObjectName);if (xObject instanceof PDImageXObject) {PDImageXObject image = (PDImageXObject) xObject;BufferedImage bufferedImage = image.getImage();// Save image to fileFile imageFile = new File("image" + (++imageCounter) + ".png");try (FileOutputStream fos = new FileOutputStream(imageFile)) {ImageIO.write(bufferedImage, "PNG", fos);}}}}}/*** 从 DOCX 文件中提取图片* @param document DOCX 文档对象* @throws IOException 读取文件时发生的异常*/public String extractImagesFromDocx(XWPFDocument document, boolean urlEncode) throws IOException {StringBuilder recognitionResults = new StringBuilder();int imageCounter = 0;for (XWPFPictureData pictureData : document.getAllPictures()) {byte[] bytes = pictureData.getData();// 将图片数据转换为 Base64 
编码String base64Image = Base64.getEncoder().encodeToString(bytes);// 如果需要 URL 编码if (urlEncode) {base64Image = URLEncoder.encode(base64Image, "utf-8");}// 识别图片String ocrResult = baiduOcrService.recognizeImage(base64Image);String formattedResult = formatOcrResult(ocrResult);recognitionResults.append("Image ").append(++imageCounter).append(": ").append(formattedResult).append("\n");}return recognitionResults.toString();}}
package com.jt.console.service.impl;import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONObject;
import com.jt.common.beans.ServiceAssert;
import okhttp3.*;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Service;
import org.springframework.web.multipart.MultipartFile;import java.io.IOException;
import java.net.URLEncoder;
import java.util.Base64;
import java.util.List;
import java.util.Arrays;/*** 百度OCR识别实现类*/
@Service("baiduOcrServiceImpl")
public class BaiduOcrServiceImpl {@Value("${baidu.ocr.apiKey}")private String API_KEY;  // 客户端id@Value("${baidu.ocr.secretKey}")private String SECRET_KEY; // 客户端秘钥// 支持的图片格式列表private static final List<String> SUPPORTED_FORMATS = Arrays.asList("png", "jpg", "jpeg", "bmp", "gif");// 构建 OkHttpClient 实例private static final OkHttpClient HTTP_CLIENT = new OkHttpClient().newBuilder().build();// 获取 Access Tokenprivate String getAccessToken() throws IOException {MediaType mediaType = MediaType.parse("application/x-www-form-urlencoded");RequestBody body = RequestBody.create(mediaType, "grant_type=client_credentials&client_id=" + API_KEY+ "&client_secret=" + SECRET_KEY);Request request = new Request.Builder().url("https://aip.baidubce.com/oauth/2.0/token").method("POST", body).addHeader("Content-Type", "application/x-www-form-urlencoded").build();Response response = HTTP_CLIENT.newCall(request).execute();if (!response.isSuccessful()) {//throw new IOException("Unexpected code " + response);// 自定义提示信息String errorMessage = "OCR request failed. 
Status code: " + response.code() + ", Message: " + response.message();ServiceAssert.isTrue(false, errorMessage);}String responseBody = response.body().string();JSONObject jsonObject = JSON.parseObject(responseBody);return jsonObject.getString("access_token");}// 调用 OCR 接口,返回结果public String recognizeImage(String base64Image) throws IOException {MediaType mediaType = MediaType.parse("application/x-www-form-urlencoded");RequestBody body = RequestBody.create(mediaType, "image=" + base64Image + "&detect_direction=false&paragraph=false&probability=false");Request request = new Request.Builder().url("https://aip.baidubce.com/rest/2.0/ocr/v1/accurate_basic?access_token=" + getAccessToken()).method("POST", body).addHeader("Content-Type", "application/x-www-form-urlencoded").addHeader("Accept", "application/json").build();try (Response response = HTTP_CLIENT.newCall(request).execute()) {if (!response.isSuccessful()) {//throw new IOException("Unexpected code " + response);// 自定义提示信息String errorMessage = "Failed to obtain access token. 
Status code: " + response.code() + ", Message: " + response.message();ServiceAssert.isTrue(false, errorMessage);}return formatOcrResult(response.body().string());}}// 将 MultipartFile 转换为 Base64 编码public String convertToBase64(MultipartFile file, boolean urlEncode) throws IOException {// 检查图片格式String filename = file.getOriginalFilename();if (filename == null) {ServiceAssert.isTrue(false, "文件名为空");}String extension = filename.substring(filename.lastIndexOf('.') + 1).toLowerCase();if (!SUPPORTED_FORMATS.contains(extension)) {ServiceAssert.isTrue(false, "不支持的图片格式: " + extension);}// 从 MultipartFile 中获取字节数组byte[] bytes = file.getBytes();// 将字节数组编码为 Base64 字符串String base64 = Base64.getEncoder().encodeToString(bytes);// 如果需要 URL 编码if (urlEncode) {base64 = URLEncoder.encode(base64, "utf-8");}return base64;}//组装返回OCR识别的结果public static String formatOcrResult(String ocrResult) {StringBuilder resultText = new StringBuilder();try {// 解析 OCR 结果JSONObject jsonObject = JSON.parseObject(ocrResult);// 检查是否包含 words_result 数组if (jsonObject.containsKey("words_result")) {var wordsResult = jsonObject.getJSONArray("words_result");if (wordsResult != null && !wordsResult.isEmpty()) {for (int i = 0; i < wordsResult.size(); i++) {JSONObject wordObject = wordsResult.getJSONObject(i);String word = wordObject.getString("words");if (word != null && !word.isEmpty()) {resultText.append(word).append(" ");}}} else {// 如果没有识别到文字,直接返回空值return "";}} else {// OCR 结果中不包含 words_result,也返回空值return "";}} catch (Exception e) {ServiceAssert.isTrue(false,e.getMessage());//resultText.append("处理 OCR 结果时出错:").append(e.getMessage());}return resultText.toString().trim();}
}

相关文章:

  • 北京网站建设多少钱?
  • 辽宁网页制作哪家好_网站建设
  • 高端品牌网站建设_汉中网站制作
  • 厦门商家微信小程序、抖音、支付宝小程序同步上线
  • C语言宏中“#”和“##”的用法
  • 优先级队列的实现
  • 【uniapp】vue3+vite配置tailwindcss
  • 力扣热题100_链表_234_回文链表
  • ubuntu设置共享文件夹,非虚拟机,服务器版
  • XSS DOM漏洞复现 与DOM 破坏
  • ARM/Linux嵌入式面经(二四):国光电器
  • 雷达气象学(9)——反射率因子图分析(强对流篇)
  • 二十、观察者模式
  • 在postman设置请求里带动态token,看看这两种方法!
  • Python接口自动化之unittest单元测试
  • 深入理解指针(五)
  • 分享一个基于SpringBoot的戏剧戏曲科普平台的设计与实现(源码、调试、LW、开题、PPT)
  • 【观察者模式】设计模式系列: 实现与最佳实践案例分析
  • 【399天】跃迁之路——程序员高效学习方法论探索系列(实验阶段156-2018.03.11)...
  • Angular6错误 Service: No provider for Renderer2
  • C# 免费离线人脸识别 2.0 Demo
  • gcc介绍及安装
  • GraphQL学习过程应该是这样的
  • HTTP中的ETag在移动客户端的应用
  • Just for fun——迅速写完快速排序
  • node和express搭建代理服务器(源码)
  • PHP 7 修改了什么呢 -- 2
  • 基于OpenResty的Lua Web框架lor0.0.2预览版发布
  • 排序算法学习笔记
  • 排序算法之--选择排序
  • 让你的分享飞起来——极光推出社会化分享组件
  • 赢得Docker挑战最佳实践
  • 优化 Vue 项目编译文件大小
  • 由插件封装引出的一丢丢思考
  • 白色的风信子
  • ​linux启动进程的方式
  • ​如何使用ArcGIS Pro制作渐变河流效果
  • #define、const、typedef的差别
  • #etcd#安装时出错
  • (13)[Xamarin.Android] 不同分辨率下的图片使用概论
  • (22)C#传智:复习,多态虚方法抽象类接口,静态类,String与StringBuilder,集合泛型List与Dictionary,文件类,结构与类的区别
  • (c语言)strcpy函数用法
  • (PADS学习)第二章:原理图绘制 第一部分
  • (solr系列:一)使用tomcat部署solr服务
  • (vue)页面文件上传获取:action地址
  • (第8天)保姆级 PL/SQL Developer 安装与配置
  • (二刷)代码随想录第15天|层序遍历 226.翻转二叉树 101.对称二叉树2
  • (附源码)spring boot网络空间安全实验教学示范中心网站 毕业设计 111454
  • (机器学习-深度学习快速入门)第三章机器学习-第二节:机器学习模型之线性回归
  • (论文阅读26/100)Weakly-supervised learning with convolutional neural networks
  • (已解决)Bootstrap精美弹出框模态框modal,实现js向modal传递数据
  • (已解决)vue+element-ui实现个人中心,仿照原神
  • (原創) 如何將struct塞進vector? (C/C++) (STL)
  • (转)Sublime Text3配置Lua运行环境
  • .jks文件(JAVA KeyStore)
  • .L0CK3D来袭:如何保护您的数据免受致命攻击
  • .MyFile@waifu.club.wis.mkp勒索病毒数据怎么处理|数据解密恢复
  • .NET HttpWebRequest、WebClient、HttpClient