赞
踩
1. maven的pom.xml文件中引入Jsoup的sdk
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.14.3</version>
</dependency>
2. 直接写具体的爬虫方法
Controller层
- /**
- * 插件导入
- * https://api.oioweb.cn 该网站
- */
- @PostMapping(value = "/importPlugins")
- @ApiOperation(value = "插件导入", notes = "获取插件的详细请求信息")
- public R<Map<String, Object>> importPlugins(@RequestBody CrawlerDto crawlerDto) {
- return R.ok(wdPluginsService.importPlugins(crawlerDto.getUrl()));
- }
Service层
Map<String, Object> importPlugins(String url);
ServiceImpl实现类
- @Override
- public Map<String, Object> importPlugins(String url){
- Map<String, Object> returnMap = new HashMap<>();
- // 教书先生api
- if(url.contains("api.oioweb.cn/doc")){
- returnMap = getUrlInfoByTeachMan(url);
- }
- return returnMap;
- }
-
- private Map<String, Object> getUrlInfoByTeachMan(String url){
- Map<String, Object> returnMap = new HashMap<>();
- try {
- Document document = Jsoup.connect(url).get();
- // 获取表头基础信息
- Elements fieldset = document.select("fieldset");
- Elements blockquote = document.select("blockquote");
- returnMap.put("title", fieldset.get(0).text());
- returnMap.put("describe", blockquote.get(0).text());
- // 定位到class为"layui-tab-item layui-show"的div元素
- Elements divElements = document.select("div.layui-tab-item.layui-show");// 根据实际情况修改选择器
- if (divElements != null) {
- Elements pElements = divElements.select("p.simpleTable");
- String faceUrl = pElements.get(0).text();
- String returnFomat = pElements.get(1).text();
- String method = pElements.get(2).text();
- String reExample = pElements.get(3).text();
- returnMap.put("faceUrl", faceUrl.substring(faceUrl.indexOf(":", 1) + 1));
- returnMap.put("returnFomat", returnFomat.substring(returnFomat.indexOf(": ") + 1));
- returnMap.put("method", method.substring(method.indexOf(": ") + 1));
- returnMap.put("reExample", reExample.substring(reExample.indexOf(":", 1) + 1));
-
-
- // 获取该div元素下的所有子元素
- for (Element div : divElements) {
- // 这里可以添加您想要对每个div执行的操作
- Elements tableElements = div.select("table");
- Elements thead = tableElements.select("thead");
- // 请求参数头
- Elements headThs = thead.get(0).select("th");
- List<String> requestHeaders = new ArrayList<>();
- for (Element th : headThs) {
- requestHeaders.add(th.text());
- }
- // 参数头
- Elements bodyThs = thead.get(1).select("th");
- List<String> responseHeaders = new ArrayList<>();
- for (Element th : bodyThs) {
- responseHeaders.add(th.text());
- }
-
- Elements tbody = tableElements.select("tbody");
- Elements trs = tbody.select("tr");
- List<Map<String, String>> reqList = new ArrayList<>();
- List<Map<String, String>> resList = new ArrayList<>();
- for (Element tr : trs) {
- Elements tds = tr.select("td");
- if (tds.size() == 4) { // 请求参数处理
- Map<String, String> reqData = new HashMap<>();
- for (int i = 0; i < requestHeaders.size(); i++) {
- reqData.put(requestHeaders.get(i), tds.get(i).text());
- }
- reqList.add(reqData);
-
- } else { // size()为3则是返回参数处理
- Map<String, String> resData = new HashMap<>();
- for (int i = 0; i < responseHeaders.size(); i++) {
- resData.put(responseHeaders.get(i), tds.get(i).text());
- }
- resList.add(resData);
-
- }
-
- }
- returnMap.put("requestList", reqList);
- returnMap.put("responseList", resList);
- System.out.println(returnMap);
- }
- }
- }catch (IOException e){
- e.printStackTrace();
- }
- return returnMap;
- }

3. 总结:Java爬虫是针对某个网站固定的页面结构,批量爬取并提取想要获取的相关信息。
如有建议,欢迎指教!
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。