时间:2021-05-19
1、因为最近有这方面的需求,用过之后记录一下。
2、此功能与PDF阅读器中Ctrl+F查找的性质相同:如果PDF中的内容是图片形式,则不支持定位关键字。
import com.itextpdf.awt.geom.Rectangle2D.Float;import com.itextpdf.text.pdf.PdfDictionary;import com.itextpdf.text.pdf.PdfName;import com.itextpdf.text.pdf.PdfReader;import com.itextpdf.text.pdf.parser.*;import java.io.File;import java.io.FileInputStream;import java.io.IOException;import java.util.ArrayList;import java.util.List;/** * 消失的太阳 */public class MyTest { public static void main(String[] args) throws IOException { //1.给定文件 File pdfFile = new File("D://test.pdf"); //2.定义一个byte数组,长度为文件的长度 byte[] pdfData = new byte[(int) pdfFile.length()]; //3.IO流读取文件内容到byte数组 FileInputStream inputStream = null; try { inputStream = new FileInputStream(pdfFile); inputStream.read(pdfData); } catch (IOException e) { throw e; } finally { if (inputStream != null) { try { inputStream.close(); } catch (IOException e) { } } } //4.指定关键字 String keyword = "消失的太阳:"; //5.调用方法,给定关键字和文件 List<float[]> positions = findKeywordPostions(pdfData, keyword); //6.返回值类型是 List<float[]> 每个list元素代表一个匹配的位置,分别为 float[0]所在页码 float[1]所在x轴 float[2]所在y轴 System.out.println("total:" + positions.size()); if (positions != null && positions.size() > 0) { for (float[] position : positions) { System.out.print("pageNum: " + (int) position[0]); System.out.print("\tx: " + position[1]); System.out.println("\ty: " + position[2]); } } } /** * findKeywordPostions * @param pdfData 通过IO流 PDF文件转化的byte数组 * @param keyword 关键字 * @return List<float [ ]> : float[0]:pageNum float[1]:x float[2]:y * @throws IOException */ public static List<float[]> findKeywordPostions(byte[] pdfData, String keyword) throws IOException { List<float[]> result = new ArrayList<>(); List<PdfPageContentPositions> pdfPageContentPositions = getPdfContentPostionsList(pdfData); for (PdfPageContentPositions pdfPageContentPosition : pdfPageContentPositions) { List<float[]> charPositions = findPositions(keyword, pdfPageContentPosition); if (charPositions == null || charPositions.size() < 1) { continue; } result.addAll(charPositions); } return result; } private 
static List<PdfPageContentPositions> getPdfContentPostionsList(byte[] pdfData) throws IOException { PdfReader reader = new PdfReader(pdfData); List<PdfPageContentPositions> result = new ArrayList<>(); int pages = reader.getNumberOfPages(); for (int pageNum = 1; pageNum <= pages; pageNum++) { float width = reader.getPageSize(pageNum).getWidth(); float height = reader.getPageSize(pageNum).getHeight(); PdfRenderListener pdfRenderListener = new PdfRenderListener(pageNum, width, height); //解析pdf,定位位置 PdfContentStreamProcessor processor = new PdfContentStreamProcessor(pdfRenderListener); PdfDictionary pageDic = reader.getPageN(pageNum); PdfDictionary resourcesDic = pageDic.getAsDict(PdfName.RESOURCES); try { processor.processContent(ContentByteUtils.getContentBytesForPage(reader, pageNum), resourcesDic); } catch (IOException e) { reader.close(); throw e; } String content = pdfRenderListener.getContent(); List<CharPosition> charPositions = pdfRenderListener.getcharPositions(); List<float[]> positionsList = new ArrayList<>(); for (CharPosition charPosition : charPositions) { float[] positions = new float[]{charPosition.getPageNum(), charPosition.getX(), charPosition.getY()}; positionsList.add(positions); } PdfPageContentPositions pdfPageContentPositions = new PdfPageContentPositions(); pdfPageContentPositions.setContent(content); pdfPageContentPositions.setPostions(positionsList); result.add(pdfPageContentPositions); } reader.close(); return result; } private static List<float[]> findPositions(String keyword, PdfPageContentPositions pdfPageContentPositions) { List<float[]> result = new ArrayList<>(); String content = pdfPageContentPositions.getContent(); List<float[]> charPositions = pdfPageContentPositions.getPositions(); for (int pos = 0; pos < content.length(); ) { int positionIndex = content.indexOf(keyword, pos); if (positionIndex == -1) { break; } float[] postions = charPositions.get(positionIndex); result.add(postions); pos = positionIndex + 1; } return result; } 
private static class PdfPageContentPositions { private String content; private List<float[]> positions; public String getContent() { return content; } public void setContent(String content) { this.content = content; } public List<float[]> getPositions() { return positions; } public void setPostions(List<float[]> positions) { this.positions = positions; } } private static class PdfRenderListener implements RenderListener { private int pageNum; private float pageWidth; private float pageHeight; private StringBuilder contentBuilder = new StringBuilder(); private List<CharPosition> charPositions = new ArrayList<>(); public PdfRenderListener(int pageNum, float pageWidth, float pageHeight) { this.pageNum = pageNum; this.pageWidth = pageWidth; this.pageHeight = pageHeight; } public void beginTextBlock() { } public void renderText(TextRenderInfo renderInfo) { List<TextRenderInfo> characterRenderInfos = renderInfo.getCharacterRenderInfos(); for (TextRenderInfo textRenderInfo : characterRenderInfos) { String word = textRenderInfo.getText(); if (word.length() > 1) { word = word.substring(word.length() - 1, word.length()); } Float rectangle = textRenderInfo.getAscentLine().getBoundingRectange(); float x = (float)rectangle.getX(); float y = (float)rectangle.getY();// float x = (float)rectangle.getCenterX();// float y = (float)rectangle.getCenterY();// double x = rectangle.getMinX();// double y = rectangle.getMaxY(); //这两个是关键字在所在页面的XY轴的百分比 float xPercent = Math.round(x / pageWidth * 10000) / 10000f; float yPercent = Math.round((1 - y / pageHeight) * 10000) / 10000f;// CharPosition charPosition = new CharPosition(pageNum, xPercent, yPercent); CharPosition charPosition = new CharPosition(pageNum, (float)x, (float)y); charPositions.add(charPosition); contentBuilder.append(word); } } public void endTextBlock() { } public void renderImage(ImageRenderInfo renderInfo) { } public String getContent() { return contentBuilder.toString(); } public List<CharPosition> getcharPositions() { 
return charPositions; } } private static class CharPosition { private int pageNum = 0; private float x = 0; private float y = 0; public CharPosition(int pageNum, float x, float y) { this.pageNum = pageNum; this.x = x; this.y = y; } public int getPageNum() { return pageNum; } public float getX() { return x; } public float getY() { return y; } @Override public String toString() { return "[pageNum=" + this.pageNum + ",x=" + this.x + ",y=" + this.y + "]"; } }}总结
以上所述是小编给大家介绍的java实现查找PDF关键字所在页码及其坐标,希望对大家有所帮助,如果大家有任何疑问请给我留言,小编会及时回复大家的。在此也非常感谢大家对网站的支持!
如果你觉得本文对你有帮助,欢迎转载,烦请注明出处,谢谢!
声明:本页内容来源网络,仅供用户参考;我单位不保证亦不表示资料全面及准确无误,也不保证亦不表示这些资料为最新信息,如因任何原因,本网内容或者用户因倚赖本网内容造成任何损失或损害,我单位将不会负任何法律责任。如涉及版权问题,请提交至online#300.cn邮箱联系删除。
有些时候我们在显示一篇文章的时候,可能需要把某些关键字进行套红,高亮显示,以便我们能快速的查找和定位这些关键字,下面让我们看看具体实现的代码。/***关键字套红
wps查找pdf关键字,其方法是: 1、首先打开pdf文档,随后单击左侧窗格上的搜索和标记按钮,或打开【查看】菜单并单击搜索,搜索面板将在屏幕的左侧打开。
下面我们看看如何在PDF文件中查找关键字。软件名称:迅捷PDF编辑器v2.1.0.0官方最新安装版软件大小:1.15MB更新时间:2019-08-30立即下载1
本文实例为大家分享了java实现查找替换功能的具体代码,供大家参考,具体内容如下查找if(searchTxt.getText().equals("")){JOp
前言在Java中,Java中volatile关键字十分重要本文全面&详细解析volatile关键字,希望你们会喜欢目录1.定义Java中的1个关键字/修饰符2.