java实现爬虫技术，读取txt，word，excel，ppt，pdf，html等格式的文件_java如何爬取网站上的pdf

作者：笔触狂放9 | 2024-06-21 11:52:40

踩

java如何爬取网站上的pdf

最近跟我同事一起做的项目要求读取txt,word,excel,ppt,pdf,html中的内容，不多说，先把代码贴出来，之后有时间再来做详细的解读。

这是读取txt文件

[html]view plaincopy 
   
 /**  
      * 获取txt的文件内容 新建的默认格式 ，其它三种格式会乱码  
      *   
      * @param txtFile  
      * @return  
      */  
     public String GetTxtContent(File txtFile) {  
         BufferedReader reader = null;  
           
         String tempString = null;  
         StringBuffer contents = new StringBuffer();  
         try {  
             reader = new BufferedReader(new FileReader(txtFile));  
             while ((tempString = reader.readLine()) != null) {  
                 contents.append(tempString);  
             }  
             reader.close();  
         } catch (FileNotFoundException e) {  
             e.printStackTrace();  
         } catch (IOException e) {  
             e.printStackTrace();  
         } finally {  
             if (reader != null) {  
                 try {  
                     reader.close();  
                 } catch (IOException e) {  
                     e.printStackTrace();  
                 }  
             }  
         }  
         return contents.toString().trim();  
     }  

[html]view plaincopy 
   
 读取ppt

     /**
      * 读取PPT的内容  
      *   
      * @param excleFile  
      * @return  
      */  
     public String GetPPTContent(File excleFile) {   
         StringBuffer contents = new StringBuffer("");// 文档内容  
         InputStream is = null;  
         SlideShow ppt = null;  
         try {  
             is = new FileInputStream(excleFile);  
             ppt = new SlideShow(new HSLFSlideShow(is));  
         } catch (FileNotFoundException e1) {  
             e1.printStackTrace();  
         } catch (IOException e1) {  
             e1.printStackTrace();  
         }  
         Slide[] slides = ppt.getSlides();  
   
         for (int i = 0; i < slides.length; i++) {  
             TextRun[] t = slides[i].getTextRuns();// 为了取得幻灯片的文字内容，建立TextRun  
             for (int j = 0; j < t.length; j++) {  
                 contents.append(t[j].getText());// 这里会将文字内容加到content中去  
             }  
         }  
         if (is != null) {  
             try {  
                 is.close();  
             } catch (IOException e) {  
                 e.printStackTrace();  
             }  
         }  
         return contents.toString().trim();  
     }  

[html]view plaincopy 
   
 读取excel

     /**
      * 获取2007 excel的内容
      *   
      * @param exclexlsxFile  
      * @return  
      */  
     public String GetExclexlsxContent(File exclexlsxFile) {  
         StringBuffer content = null;  
         XSSFWorkbook workbook = null;  
         InputStream in = null;  
         try {  
             in = new FileInputStream(exclexlsxFile);  
             content = new StringBuffer();  
             workbook = new XSSFWorkbook(in);  
         } catch (FileNotFoundException e) {  
             e.printStackTrace();  
         } catch (IOException e) {  
             e.printStackTrace();  
         }  
   
         for (int numSheets = 0; numSheets < workbook.getNumberOfSheets(); numSheets++) {  
             XSSFSheet aSheet = workbook.getSheetAt(numSheets);// 获得一个sheet  
             content.append("\n");  
             if (null == aSheet) {  
                 continue;  
             }  
             for (int rowNum = 0; rowNum <= aSheet.getLastRowNum(); rowNum++) {  
                 content.append("\n");  
                 XSSFRow aRow = aSheet.getRow(rowNum);  
                 if (null == aRow) {  
                     continue;  
                 }  
   
                 for (short cellNum = 0; cellNum <= aRow.getLastCellNum(); cellNum++) {  
                     XSSFCell aCell = aRow.getCell(cellNum);  
                     if (null == aCell) {  
                         continue;  
                     }  
                     if (aCell.getCellType() == HSSFCell.CELL_TYPE_STRING) {  
                         content.append(aCell.getRichStringCellValue()  
                                 .getString());  
                     } else if (aCell.getCellType() == HSSFCell.CELL_TYPE_NUMERIC) {  
                         boolean b = HSSFDateUtil.isCellDateFormatted(aCell);  
                         if (b) {  
                             Date date = aCell.getDateCellValue();  
                             SimpleDateFormat df = new SimpleDateFormat(  
                                     "yyyy-MM-dd HH:mm:ss");  
                             content.append(df.format(date));  
                         }  
                     }  
                 }  
             }  
         }  
         if (in != null) {  
             try {  
                 in.close();  
             } catch (IOException e) {  
                 e.printStackTrace();  
             }  
         }  
   
         return content.toString().trim();  
     }  
     /**  
      * 读取excle的内容  
      *   
      * @param excleFile  
      * @return  
      */  
     public String GetExcleContent(File excleFile) {  
         StringBuffer content = null;  
         HSSFWorkbook workbook = null;  
         InputStream in = null;  
         try {  
             in = new FileInputStream(excleFile);  
             content = new StringBuffer();  
             workbook = new HSSFWorkbook(in);  
         } catch (FileNotFoundException e) {  
             e.printStackTrace();  
         } catch (IOException e) {  
             e.printStackTrace();  
         }  
   
         for (int numSheets = 0; numSheets < workbook.getNumberOfSheets(); numSheets++) {  
             HSSFSheet aSheet = workbook.getSheetAt(numSheets);// 获得一个sheet  
             content.append("\n");  
             if (null == aSheet) {  
                 continue;  
             }  
             for (int rowNum = 0; rowNum <= aSheet.getLastRowNum(); rowNum++) {  
                 content.append("\n");  
                 HSSFRow aRow = aSheet.getRow(rowNum);  
                 if (null == aRow) {  
                     continue;  
                 }  
   
                 for (int cellNum = 0; cellNum <= aRow.getLastCellNum(); cellNum++) {  
                     HSSFCell aCell = aRow.getCell(cellNum);  
                     if (null == aCell) {  
                         continue;  
                     }  
   
                     if (aCell.getCellType() == HSSFCell.CELL_TYPE_STRING) {  
                         content.append(aCell.getRichStringCellValue()  
                                 .getString());  
                     } else if (aCell.getCellType() == HSSFCell.CELL_TYPE_NUMERIC) {  
                         boolean b = HSSFDateUtil.isCellDateFormatted(aCell);  
                         if (b) {  
                             Date date = aCell.getDateCellValue();  
                             SimpleDateFormat df = new SimpleDateFormat(  
                                     "yyyy-MM-dd HH:mm:ss");  
                             content.append(df.format(date));  
                         }  
                     }  
                 }  
             }  
         }  
         if (in != null) {  
             try {  
                 in.close();  
             } catch (IOException e) {  
                 e.printStackTrace();  
             }  
         }  
   
         return content.toString().trim();  
     }  

[html]view plaincopy 
   
 读取word
     /**  
      * 获取word的内容  
      *   
      * @param wordPath  
      *            文件  
      * @return word的内容  
      */  
     @SuppressWarnings("resource")  
     public  String GetWordContent(File wordFile) {  
         String strContent = "";  
         FileInputStream in=null;  
         try {  
             in = new FileInputStream(wordFile);  
             WordExtractor text = new WordExtractor(in);  
             strContent = text.getText();  
         } catch (Exception e) {  
             e.printStackTrace();  
         }finally{  
             if(in!=null){  
                 try {  
                     in.close();  
                 } catch (IOException e) {  
                     // TODO Auto-generated catch block  
                     e.printStackTrace();  
                 }  
             }  
         }  
   
         return strContent.trim();  
     }  
     /**  
      * 获取word2007的内容  
      *   
      * @param word2007Path  
      * @return  
      * @throws Exception  
      */  
     public String GetWordDocxContent(File wordDocxFile) {  
         POIXMLTextExtractor extractor;  
         String text2007 = "";  
         try {  
             OPCPackage opcPackage = POIXMLDocument.openPackage(wordDocxFile  
                     .getPath());  
             extractor = new XWPFWordExtractor(opcPackage);  
             text2007 = extractor.getText();  
         } catch (IOException e) {  
             e.printStackTrace();  
         } catch (XmlException e) {  
             e.printStackTrace();  
         } catch (OpenXML4JException e) {  
             e.printStackTrace();  
         }  
         return text2007.trim();  
     }  

[html]view plaincopy 
   
 读取pdf

[html]view plaincopy 
   
 /**  
  * 读取PDF文字的内容  
  *   
  * @param pdfPath  
  *            pdf  
  * @return 返回pdf文件的内容  
  */  
 public String GetPDFContent(File pdfFile) {  
     String content = "";  
     FileInputStream is = null;  
     PDDocument doc = null;  
     try {  
         is = new FileInputStream(pdfFile);  
         PDFParser parser = new PDFParser(is);  
         parser.parse();  
         doc = parser.getPDDocument();  
         PDFTextStripper stripper = new PDFTextStripper();  
         content = stripper.getText(doc);  
     } catch (Exception e) {  
         e.printStackTrace();  
     } finally {  
         if (is != null) {  
             try {  
                 is.close();  
             } catch (Exception e) {  
                 e.printStackTrace();  
             }  
         }  
         if (doc != null) {  
             try {  
                 doc.close();  
             } catch (Exception e) {  
                 e.printStackTrace();  
             }  
         }  
     }  
     return content.trim();  
 }  

[html]view plaincopy 
   
 读取html
     /**  
      * 读取网页纯文本内容用来存储索引方法*/  
     public  String GetHTML(String url) throws ParserException{  
         Parser parser = new Parser(url);  
         StringBean sb=new StringBean();  
         //設置不需要頁面的鏈接信息  
         sb.setLinks(false);  
         //設置將不間斷空格由正規空格替代  
         sb.setReplaceNonBreakingSpaces(true);  
         //設置一系列空格由單一空格代替  
         sb.setCollapse(true);  
         parser.visitAllNodesWith(sb);  
         return sb.getStrings().trim();  
     }  
     /**@param filePath   
      * 文件上傳路徑  
      * 处理附件方法 获得JSON数组  
      * @throws Exception */  
     public String HandleFj(String param,IService service,String filePath) throws Exception{  
         JSONArray json=null;  
         ArrayList<IEntity>list=null;  
         String sql="";  
         String fjtotalpath="";  
         try {  
             json=JSONArray.fromObject(DataObject.getObjectValue("param"));  
         } catch (Exception e) {  
             e.printStackTrace();  
             return "";  
         }  
         if(!StringHelper.isNullOrEmpty(json)){  
             StringBuffer fjcontenttotal=new StringBuffer();  
             for(int i=0;i<json.length();i++){  
                 String fileid=json.getJSONObject(i).getString("id");//拿到fileid  
                 String name=json.getJSONObject(i).getString("name");  
                 if(!StringHelper.isNullOrEmpty(fileid)&&!StringHelper.isNullOrEmpty(name)){  
                     sql="select t.localpath from t_srffile t where t.file_id='"+fileid+"'";  
                     try {  
                         list=service.selectRaw(sql, null);  
                     } catch (Exception e) {  
                         e.printStackTrace();  
                     }  
                     for(IEntity o:list){  
                         String location=DataObject.getStringValue(o.get("location"));  
                         fjtotalpath=filePath+location;  
                         fjcontenttotal.append(this.GetFileContent(fjtotalpath));  
                     }  
                 }  
                 return fjcontenttotal.toString();  
             }  
         }  
         return "";  
     }  

声明：本文内容由网友自发贡献，不代表【wpsshop博客】立场，版权归原作者所有，本站不承担相应法律责任。如您发现有侵权的内容，请联系我们。转载请注明出处：https://www.wpsshop.cn/w/笔触狂放9/article/detail/743035

java实现爬虫技术，读取txt，word，excel，ppt，pdf，html等格式的文件_java如何爬取网站上的pdf

最近跟我同事一起做的项目要求读取txt,word,excel,ppt,pdf,html中的内容，不多说，先把代码贴出来，之后有时间再来做详细的解读。 这是读取txt文件

最近跟我同事一起做的项目要求读取txt,word,excel,ppt,pdf,html中的内容，不多说，先把代码贴出来，之后有时间再来做详细的解读。

这是读取txt文件