
Java Crawlers: Scraping HTML Page Links with Jsoup

1. Add the Jsoup dependency to your Maven pom.xml

    <dependency>
        <groupId>org.jsoup</groupId>
        <artifactId>jsoup</artifactId>
        <version>1.14.3</version>
    </dependency>
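Version 1.14.3 is what this article uses; newer releases expose the same core API. As a quick sanity check that the dependency resolves, here is a minimal, self-contained sketch (the HTML string and class name are purely illustrative):

    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;
    import org.jsoup.select.Elements;

    public class JsoupSmokeTest {
        public static void main(String[] args) {
            // Parse an inline HTML string and list every link that has an href attribute
            Document doc = Jsoup.parse("<html><body><a href='https://api.oioweb.cn'>demo</a></body></html>");
            Elements links = doc.select("a[href]");
            links.forEach(link -> System.out.println(link.attr("href") + " -> " + link.text()));
        }
    }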

2. Write the crawler methods

Controller layer

    /**
     * Plugin import.
     * Scrapes the documentation pages at https://api.oioweb.cn
     */
    @PostMapping(value = "/importPlugins")
    @ApiOperation(value = "Import plugin", notes = "Fetch the plugin's detailed request information")
    public R<Map<String, Object>> importPlugins(@RequestBody CrawlerDto crawlerDto) {
        return R.ok(wdPluginsService.importPlugins(crawlerDto.getUrl()));
    }
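The article does not show CrawlerDto; here is a minimal sketch of what it presumably looks like, with the field name inferred from the crawlerDto.getUrl() call above (the real class may carry more fields):

    // Hypothetical DTO inferred from the controller; not shown in the original article.
    public class CrawlerDto {
        /** The documentation page to scrape, e.g. an api.oioweb.cn doc URL. */
        private String url;

        public String getUrl() { return url; }
        public void setUrl(String url) { this.url = url; }
    }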

Service layer

    Map<String, Object> importPlugins(String url);
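For context, this signature would live in a service interface; the interface name WdPluginsService below is inferred from the controller's wdPluginsService field, so treat this as a sketch:

    import java.util.Map;

    // Interface name inferred from the controller's field; a minimal sketch.
    public interface WdPluginsService {
        /** Scrape the given documentation URL and return the extracted plugin metadata. */
        Map<String, Object> importPlugins(String url);
    }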

ServiceImpl implementation class

    // Requires: org.jsoup.Jsoup, org.jsoup.nodes.Document, org.jsoup.nodes.Element,
    // org.jsoup.select.Elements, java.io.IOException, java.util.*
    @Override
    public Map<String, Object> importPlugins(String url) {
        Map<String, Object> returnMap = new HashMap<>();
        // "教书先生" (Teacher) API: api.oioweb.cn
        if (url.contains("api.oioweb.cn/doc")) {
            returnMap = getUrlInfoByTeachMan(url);
        }
        return returnMap;
    }

    private Map<String, Object> getUrlInfoByTeachMan(String url) {
        Map<String, Object> returnMap = new HashMap<>();
        try {
            Document document = Jsoup.connect(url).get();
            // Basic header information for the page
            Elements fieldset = document.select("fieldset");
            Elements blockquote = document.select("blockquote");
            returnMap.put("title", fieldset.get(0).text());
            returnMap.put("describe", blockquote.get(0).text());
            // Locate the div whose class is "layui-tab-item layui-show"
            // (adjust the selector to match the actual target page)
            Elements divElements = document.select("div.layui-tab-item.layui-show");
            if (!divElements.isEmpty()) { // select() never returns null; check emptiness instead
                Elements pElements = divElements.select("p.simpleTable");
                String faceUrl = pElements.get(0).text();
                String returnFomat = pElements.get(1).text();
                String method = pElements.get(2).text();
                String reExample = pElements.get(3).text();
                // Each <p> reads like "label: value"; keep only the value after the colon
                returnMap.put("faceUrl", faceUrl.substring(faceUrl.indexOf(":", 1) + 1));
                returnMap.put("returnFomat", returnFomat.substring(returnFomat.indexOf(": ") + 1));
                returnMap.put("method", method.substring(method.indexOf(": ") + 1));
                returnMap.put("reExample", reExample.substring(reExample.indexOf(":", 1) + 1));
                // Walk every matching div and pull its parameter tables
                for (Element div : divElements) {
                    Elements tableElements = div.select("table");
                    Elements thead = tableElements.select("thead");
                    // Column headers of the request-parameter table
                    Elements headThs = thead.get(0).select("th");
                    List<String> requestHeaders = new ArrayList<>();
                    for (Element th : headThs) {
                        requestHeaders.add(th.text());
                    }
                    // Column headers of the response-parameter table
                    Elements bodyThs = thead.get(1).select("th");
                    List<String> responseHeaders = new ArrayList<>();
                    for (Element th : bodyThs) {
                        responseHeaders.add(th.text());
                    }
                    Elements tbody = tableElements.select("tbody");
                    Elements trs = tbody.select("tr");
                    List<Map<String, String>> reqList = new ArrayList<>();
                    List<Map<String, String>> resList = new ArrayList<>();
                    for (Element tr : trs) {
                        Elements tds = tr.select("td");
                        if (tds.size() == 4) { // four cells: a request-parameter row
                            Map<String, String> reqData = new HashMap<>();
                            for (int i = 0; i < requestHeaders.size(); i++) {
                                reqData.put(requestHeaders.get(i), tds.get(i).text());
                            }
                            reqList.add(reqData);
                        } else { // three cells: a response-parameter row
                            Map<String, String> resData = new HashMap<>();
                            for (int i = 0; i < responseHeaders.size(); i++) {
                                resData.put(responseHeaders.get(i), tds.get(i).text());
                            }
                            resList.add(resData);
                        }
                    }
                    returnMap.put("requestList", reqList);
                    returnMap.put("responseList", resList);
                    System.out.println(returnMap);
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return returnMap;
    }
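Note that Jsoup.connect(url).get() runs with Jsoup's defaults. If the target site is slow or filters the default user agent, a more defensive fetch helps; a sketch, with illustrative values rather than anything from the article:

    // Defensive variant of the fetch above; the UA string and timeout are illustrative choices.
    Document document = Jsoup.connect(url)
            .userAgent("Mozilla/5.0")  // some sites reject Jsoup's default user agent
            .timeout(10_000)           // give up after 10 seconds instead of hanging
            .get();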

3. Summary: a Java crawler like this targets one fixed page layout on a particular site and extracts the desired information in bulk; if the site's HTML structure changes, the selectors must change with it.

Suggestions and corrections are welcome!
