赞
踩
最近家人有个 PDF 文档需要转成 Word,我以为很轻松,就在网上搜索工具,结果竟然没有找到好用的,仅有的几个像样的工具还要收费。
PDF 转换有这么难吗?为什么非得花钱才能解决?强大的 Java、好用的 Apache 工具系列难道解决不了吗?所以我决定自己研究一下。
首先找到了apache pdf 解析的依赖包:
- <dependency>
- <groupId>org.apache.pdfbox</groupId>
- <artifactId>pdfbox</artifactId>
- <version>2.0.4</version>
- </dependency>
- <dependency>
- <groupId>net.coobird</groupId>
- <artifactId>thumbnailator</artifactId>
- <version>0.4.8</version>
- </dependency>
其中第二个依赖 thumbnailator 是用来处理图片的包。
由于 PDF 中同时包含图片和文字,只要把图片和文字提取出来再添加到 Word 中即可,因此还需要添加 POI 依赖。
- <dependency>
- <groupId>org.apache.poi</groupId>
- <artifactId>poi</artifactId>
- <version>3.9</version>
- </dependency>
- <dependency>
- <groupId>org.apache.poi</groupId>
- <artifactId>poi-ooxml</artifactId>
- <version>3.9</version>
- </dependency>
然后贴一下代码:
- public class Pdf2word {
- public static void main(String[] args) throws InvalidFormatException {
-
- try {
- String pdfFileName = "H:\\xuweichao.pdf";
- PDDocument pdf = PDDocument.load(new File(pdfFileName));
- int pageNumber = pdf.getNumberOfPages();
-
- String docFileName = pdfFileName.substring(0, pdfFileName.lastIndexOf(".")) + ".doc";
-
- File file = new File(docFileName);
- if (!file.exists()) {
- file.createNewFile();
- }
- CustomXWPFDocument document = new CustomXWPFDocument();
- FileOutputStream fos = new FileOutputStream(docFileName);
-
- //提取每一页的图片和文字,添加到 word 中
- for (int i = 0; i < pageNumber; i++) {
-
- PDPage page = pdf.getPage(i);
- PDResources resources = page.getResources();
-
- Iterable<COSName> names = resources.getXObjectNames();
- Iterator<COSName> iterator = names.iterator();
- while (iterator.hasNext()) {
- COSName cosName = iterator.next();
-
- if (resources.isImageXObject(cosName)) {
- PDImageXObject imageXObject = (PDImageXObject) resources.getXObject(cosName);
- File outImgFile = new File("H:\\img\\" + System.currentTimeMillis() + ".jpg");
- Thumbnails.of(imageXObject.getImage()).scale(0.9).rotate(0).toFile(outImgFile);
-
-
- BufferedImage bufferedImage = ImageIO.read(outImgFile);
- int width = bufferedImage.getWidth();
- int height = bufferedImage.getHeight();
- if (width > 600) {
- double ratio = Math.round((double) width / 550.0);
- System.out.println("缩放比ratio:"+ratio);
- width = (int) (width / ratio);
- height = (int) (height / ratio);
-
- }
-
- System.out.println("width: " + width + ", height: " + height);
- FileInputStream in = new FileInputStream(outImgFile);
- byte[] ba = new byte[in.available()];
- in.read(ba);
- ByteArrayInputStream byteInputStream = new ByteArrayInputStream(ba);
-
- XWPFParagraph picture = document.createParagraph();
- //添加图片
- document.addPictureData(byteInputStream, CustomXWPFDocument.PICTURE_TYPE_JPEG);
- //图片大小、位置
- document.createPicture(document.getAllPictures().size() - 1, width, height, picture);
-
- }
- }
-
-
- PDFTextStripper stripper = new PDFTextStripper();
- stripper.setSortByPosition(true);
- stripper.setStartPage(i);
- stripper.setEndPage(i);
- //当前页中的文字
- String text = stripper.getText(pdf);
-
-
- XWPFParagraph textParagraph = document.createParagraph();
- XWPFRun textRun = textParagraph.createRun();
- textRun.setText(text);
- textRun.setFontFamily("仿宋");
- textRun.setFontSize(11);
- //换行
- textParagraph.setWordWrap(true);
- }
- document.write(fos);
- fos.close();
- pdf.close();
- System.out.println("pdf转换解析结束!!----");
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
- }
自定义文档类:
- public class CustomXWPFDocument extends XWPFDocument {
- public CustomXWPFDocument(InputStream in) throws IOException {
- super(in);
- }
-
- public CustomXWPFDocument() {
- super();
- }
-
- public CustomXWPFDocument(OPCPackage pkg) throws IOException {
- super(pkg);
- }
-
- /**
- * @param id
- * @param width
- * 宽
- * @param height
- * 高
- * @param paragraph
- * 段落
- */
- public void createPicture(int id, int width, int height,
- XWPFParagraph paragraph) {
- final int EMU = 9525;
- width *= EMU;
- height *= EMU;
- String blipId = getAllPictures().get(id).getPackageRelationship()
- .getId();
- CTInline inline = paragraph.createRun().getCTR().addNewDrawing()
- .addNewInline();
- String picXml = ""
- + "<a:graphic xmlns:a=\"http://schemas.openxmlformats.org/drawingml/2006/main\">"
- + " <a:graphicData uri=\"http://schemas.openxmlformats.org/drawingml/2006/picture\">"
- + " <pic:pic xmlns:pic=\"http://schemas.openxmlformats.org/drawingml/2006/picture\">"
- + " <pic:nvPicPr>" + " <pic:cNvPr id=\""
- + id
- + "\" name=\"Generated\"/>"
- + " <pic:cNvPicPr/>"
- + " </pic:nvPicPr>"
- + " <pic:blipFill>"
- + " <a:blip r:embed=\""
- + blipId
- + "\" xmlns:r=\"http://schemas.openxmlformats.org/officeDocument/2006/relationships\"/>"
- + " <a:stretch>"
- + " <a:fillRect/>"
- + " </a:stretch>"
- + " </pic:blipFill>"
- + " <pic:spPr>"
- + " <a:xfrm>"
- + " <a:off x=\"0\" y=\"0\"/>"
- + " <a:ext cx=\""
- + width
- + "\" cy=\""
- + height
- + "\"/>"
- + " </a:xfrm>"
- + " <a:prstGeom prst=\"rect\">"
- + " <a:avLst/>"
- + " </a:prstGeom>"
- + " </pic:spPr>"
- + " </pic:pic>"
- + " </a:graphicData>" + "</a:graphic>";
-
- inline.addNewGraphic().addNewGraphicData();
- XmlToken xmlToken = null;
- try {
- xmlToken = XmlToken.Factory.parse(picXml);
- } catch (XmlException xe) {
- xe.printStackTrace();
- }
- inline.set(xmlToken);
-
- inline.setDistT(0);
- inline.setDistB(0);
- inline.setDistL(0);
- inline.setDistR(0);
-
- CTPositiveSize2D extent = inline.addNewExtent();
- extent.setCx(width);
- extent.setCy(height);
-
- CTNonVisualDrawingProps docPr = inline.addNewDocPr();
- docPr.setId(id);
- docPr.setName("图片名称");
- docPr.setDescr("描述信息");
- }
- }
程序就这么简单:遍历 PDF 文件的每一页,抽取其中的图片和文字并写入 Word。文字样式问题暂未解决,但生成的 Word 文件会对大图片按比例缩小,对于排版简单的 PDF 文件,效果还是不错的。
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。