当前位置:   article > 正文

用java五分钟 pdf转word 轻松搞定_java pdf文件转word为空文件

java pdf文件转word为空文件

最近家人有个pdf 文档需要转成word ,我觉得很轻松就在网上搜索工具,竟然没有找到好用的工具,有几个像样的竟然需要银子??

pdf转换这么难吗?为什么需要花钱才能解决?强大的java,好用的apache 工具系列解决不了么 ?所以决定研究一下。

首先找到了apache pdf 解析的依赖包:

  <!-- Apache PDFBox: used to parse the PDF. -->
  <dependency>
      <groupId>org.apache.pdfbox</groupId>
      <artifactId>pdfbox</artifactId>
      <version>2.0.4</version>
  </dependency>
  <!-- Thumbnailator: image processing (scaling the extracted pictures). -->
  <dependency>
      <groupId>net.coobird</groupId>
      <artifactId>thumbnailator</artifactId>
      <version>0.4.8</version>
  </dependency>

其中第二个依赖(thumbnailator)是图片处理的包。

由于pdf 中包含图片和文字,所以我提取图片和文字添加到word 中就好,添加POI依赖。

  <!-- Apache POI: used to build the Word document that receives the text and images. -->
  <dependency>
      <groupId>org.apache.poi</groupId>
      <artifactId>poi</artifactId>
      <version>3.9</version>
  </dependency>
  <dependency>
      <groupId>org.apache.poi</groupId>
      <artifactId>poi-ooxml</artifactId>
      <version>3.9</version>
  </dependency>

然后贴一下代码:

  1. public class Pdf2word {
  2. public static void main(String[] args) throws InvalidFormatException {
  3. try {
  4. String pdfFileName = "H:\\xuweichao.pdf";
  5. PDDocument pdf = PDDocument.load(new File(pdfFileName));
  6. int pageNumber = pdf.getNumberOfPages();
  7. String docFileName = pdfFileName.substring(0, pdfFileName.lastIndexOf(".")) + ".doc";
  8. File file = new File(docFileName);
  9. if (!file.exists()) {
  10. file.createNewFile();
  11. }
  12. CustomXWPFDocument document = new CustomXWPFDocument();
  13. FileOutputStream fos = new FileOutputStream(docFileName);
  14. //提取每一页的图片和文字,添加到 word 中
  15. for (int i = 0; i < pageNumber; i++) {
  16. PDPage page = pdf.getPage(i);
  17. PDResources resources = page.getResources();
  18. Iterable<COSName> names = resources.getXObjectNames();
  19. Iterator<COSName> iterator = names.iterator();
  20. while (iterator.hasNext()) {
  21. COSName cosName = iterator.next();
  22. if (resources.isImageXObject(cosName)) {
  23. PDImageXObject imageXObject = (PDImageXObject) resources.getXObject(cosName);
  24. File outImgFile = new File("H:\\img\\" + System.currentTimeMillis() + ".jpg");
  25. Thumbnails.of(imageXObject.getImage()).scale(0.9).rotate(0).toFile(outImgFile);
  26. BufferedImage bufferedImage = ImageIO.read(outImgFile);
  27. int width = bufferedImage.getWidth();
  28. int height = bufferedImage.getHeight();
  29. if (width > 600) {
  30. double ratio = Math.round((double) width / 550.0);
  31. System.out.println("缩放比ratio:"+ratio);
  32. width = (int) (width / ratio);
  33. height = (int) (height / ratio);
  34. }
  35. System.out.println("width: " + width + ", height: " + height);
  36. FileInputStream in = new FileInputStream(outImgFile);
  37. byte[] ba = new byte[in.available()];
  38. in.read(ba);
  39. ByteArrayInputStream byteInputStream = new ByteArrayInputStream(ba);
  40. XWPFParagraph picture = document.createParagraph();
  41. //添加图片
  42. document.addPictureData(byteInputStream, CustomXWPFDocument.PICTURE_TYPE_JPEG);
  43. //图片大小、位置
  44. document.createPicture(document.getAllPictures().size() - 1, width, height, picture);
  45. }
  46. }
  47. PDFTextStripper stripper = new PDFTextStripper();
  48. stripper.setSortByPosition(true);
  49. stripper.setStartPage(i);
  50. stripper.setEndPage(i);
  51. //当前页中的文字
  52. String text = stripper.getText(pdf);
  53. XWPFParagraph textParagraph = document.createParagraph();
  54. XWPFRun textRun = textParagraph.createRun();
  55. textRun.setText(text);
  56. textRun.setFontFamily("仿宋");
  57. textRun.setFontSize(11);
  58. //换行
  59. textParagraph.setWordWrap(true);
  60. }
  61. document.write(fos);
  62. fos.close();
  63. pdf.close();
  64. System.out.println("pdf转换解析结束!!----");
  65. } catch (IOException e) {
  66. e.printStackTrace();
  67. }
  68. }
  69. }

自定义文档类:

  1. public class CustomXWPFDocument extends XWPFDocument {
  2. public CustomXWPFDocument(InputStream in) throws IOException {
  3. super(in);
  4. }
  5. public CustomXWPFDocument() {
  6. super();
  7. }
  8. public CustomXWPFDocument(OPCPackage pkg) throws IOException {
  9. super(pkg);
  10. }
  11. /**
  12. * @param id
  13. * @param width
  14. * 宽
  15. * @param height
  16. * 高
  17. * @param paragraph
  18. * 段落
  19. */
  20. public void createPicture(int id, int width, int height,
  21. XWPFParagraph paragraph) {
  22. final int EMU = 9525;
  23. width *= EMU;
  24. height *= EMU;
  25. String blipId = getAllPictures().get(id).getPackageRelationship()
  26. .getId();
  27. CTInline inline = paragraph.createRun().getCTR().addNewDrawing()
  28. .addNewInline();
  29. String picXml = ""
  30. + "<a:graphic xmlns:a=\"http://schemas.openxmlformats.org/drawingml/2006/main\">"
  31. + " <a:graphicData uri=\"http://schemas.openxmlformats.org/drawingml/2006/picture\">"
  32. + " <pic:pic xmlns:pic=\"http://schemas.openxmlformats.org/drawingml/2006/picture\">"
  33. + " <pic:nvPicPr>" + " <pic:cNvPr id=\""
  34. + id
  35. + "\" name=\"Generated\"/>"
  36. + " <pic:cNvPicPr/>"
  37. + " </pic:nvPicPr>"
  38. + " <pic:blipFill>"
  39. + " <a:blip r:embed=\""
  40. + blipId
  41. + "\" xmlns:r=\"http://schemas.openxmlformats.org/officeDocument/2006/relationships\"/>"
  42. + " <a:stretch>"
  43. + " <a:fillRect/>"
  44. + " </a:stretch>"
  45. + " </pic:blipFill>"
  46. + " <pic:spPr>"
  47. + " <a:xfrm>"
  48. + " <a:off x=\"0\" y=\"0\"/>"
  49. + " <a:ext cx=\""
  50. + width
  51. + "\" cy=\""
  52. + height
  53. + "\"/>"
  54. + " </a:xfrm>"
  55. + " <a:prstGeom prst=\"rect\">"
  56. + " <a:avLst/>"
  57. + " </a:prstGeom>"
  58. + " </pic:spPr>"
  59. + " </pic:pic>"
  60. + " </a:graphicData>" + "</a:graphic>";
  61. inline.addNewGraphic().addNewGraphicData();
  62. XmlToken xmlToken = null;
  63. try {
  64. xmlToken = XmlToken.Factory.parse(picXml);
  65. } catch (XmlException xe) {
  66. xe.printStackTrace();
  67. }
  68. inline.set(xmlToken);
  69. inline.setDistT(0);
  70. inline.setDistB(0);
  71. inline.setDistL(0);
  72. inline.setDistR(0);
  73. CTPositiveSize2D extent = inline.addNewExtent();
  74. extent.setCx(width);
  75. extent.setCy(height);
  76. CTNonVisualDrawingProps docPr = inline.addNewDocPr();
  77. docPr.setId(id);
  78. docPr.setName("图片名称");
  79. docPr.setDescr("描述信息");
  80. }
  81. }

程序就这么简单:遍历 PDF 文件的每一页,抽取其中的图片和文字并写入 Word 文档。文字样式问题暂未解决,但生成的 Word 文件会对大图片按比例缩小,对于排版简单的 PDF 文件效果还是不错的。

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/weixin_40725706/article/detail/307926?site
推荐阅读
相关标签
  

闽ICP备14008679号