• WebMagic lightweight crawler framework in practice: crawling a site's data by keyword


    package com.tjp.www.xuexiangbanweb.webmagic;

    import cn.hutool.core.io.FileUtil;
    import com.alibaba.fastjson.JSONObject;
    import com.tjp.www.xuexiangbanweb.entity.FHNewxVo;
    import com.tjp.www.xuexiangbanweb.entity.FHResp;
    import com.tjp.www.xuexiangbanweb.entity.NewsCollect;
    import com.tjp.www.xuexiangbanweb.service.NewsCollectService;
    import com.tjp.www.xuexiangbanweb.util.DownloadFile;
    import com.tjp.www.xuexiangbanweb.util.HttpUtils;
    import org.apache.commons.lang3.StringUtils;
    import org.springframework.beans.factory.annotation.Autowired;
    import org.springframework.stereotype.Component;
    import us.codecraft.webmagic.Request;
    import us.codecraft.webmagic.Spider;
    import java.util.ArrayList;
    import java.util.Arrays;
    import java.util.List;
    import java.util.Map;
    import java.util.concurrent.ConcurrentHashMap;
    import java.util.concurrent.CopyOnWriteArrayList;

    @Component
    public class WebMagicStarterFenghuang {

        @Autowired
        NewsCollectService newsCollectService;
        @Autowired
        CustomerPipelinefh customerPipelinefh;
        @Autowired
        NewsPageProcessor2 newsPageProcessor2;

        public void doStartReptile() {
            String[] keywords = new String[]{"北海舰队","海军航空兵","南沙群岛","辽宁舰","海军人员","南海岛礁","中国海军","南海局势","中国领海","山东舰","军事强国","北海某海域","中国某海域",
                    "黄岩岛","黄海某海域","南海某海域","战区海军","南海舰队","海军航空兵","执法船","中国海洋资源","中国轮船","海事部门","海洋强国","渔业捕捞","海洋战略","海上应急",
                    "中国岛礁","渔政","海洋战略","水产品","船舶管理","海洋资源","中国轮船","海上执法","海洋部门","东海某海域","海上演练","海航","中国渔船","渔业政策","黄岩岛",
                    "驱逐舰","舰艇","海上演习","钢铁海洋","渔政部门","渔民","战斗舰","海洋鱼类","海洋生态","海上打捞","禁渔期","海上船只","海洋政策","海上巡航","建设海洋生态","发展海洋","海洋经济",
                    "船事故","海警部门","中国水产","台湾海峡","海军部队","沿海港航","沿海航行"};
            List<Request> list = new ArrayList<>();
            for (String keywordss : keywords) {
                // first request per keyword, only used to read the total page count (target URL anonymized)
                String urls = "https://www.xxxxxx.com/xxxx/xxxxx/xxx/" + keywordss + "/1/xxxx?callback=getSoFengDataCallback&_=16502467818311";
                String s = HttpUtils.get(urls);
                // the response is JSONP ("getSoFengDataCallback({...})"), so strip the callback name and the surrounding parentheses
                String ss = s.replace("getSoFengDataCallback", "");
                String substring = ss.substring(1, ss.length() - 1);
                FHResp resp = JSONObject.parseObject(substring, FHResp.class);
                int totalPage = resp.getData().getTotalPage();
                for (int i = 1; i <= totalPage; i++) {
                    String url = "https://shankapi.ifeng.com/season/getSoFengData/all/" + keywordss + "/" + i + "/getSoFengDataCallback?callback=getSoFengDataCallback&_=16502467818311";
                    String sss = HttpUtils.get(url);
                    String spjson = sss.replace("getSoFengDataCallback", "");
                    String substrings = spjson.substring(1, spjson.length() - 1);
                    FHResp fhResp = JSONObject.parseObject(substrings, FHResp.class);
                    if (fhResp.getData().getItems().size() > 0) {
                        List<FHNewxVo> items = fhResp.getData().getItems();
                        for (FHNewxVo item : items) {
                            if (StringUtils.isNotBlank(item.getUrl())) {
                                Map<String, Object> map = new ConcurrentHashMap<>();
                                map.put("keyword", keywordss);
                                String detailUrl = "https:" + item.getUrl();
                                Request request = new Request(detailUrl);
                                // carry the keyword with the request so the PageProcessor can read it later
                                request.setExtras(map);
                                list.add(request);
                            }
                        }
                    }
                }
            }
            System.out.println("数量--》" + list.size());
            // request every collected detail-page URL
            Request[] requests = list.toArray(new Request[list.size()]);
            Spider.create(newsPageProcessor2)
                    //.addUrl(url)
                    .addRequest(requests)
                    .addPipeline(customerPipelinefh)
                    // crawl with 10 threads
                    .thread(10)
                    // start the crawler (run() blocks until all requests are processed)
                    .run();
            System.out.println("读到--》" + CustomerPipelinefh.map.size());
            //System.out.println("读到--》" + CustomerPipelinefh.map.toString());
            /* folder where downloaded images are stored */
            String dowFilePath = "E:/images";
            if (!FileUtil.exist(dowFilePath)) {
                FileUtil.mkdir(dowFilePath);
            }
            // collect the results and persist them to the database
            List<NewsCollect> lists = new CopyOnWriteArrayList<>();
            CustomerPipelinefh.map.forEach((k, v) -> lists.add(v));
            System.out.println("读取到的数据,去重前---》" + lists.size());
            int bs = 0;
            int as = 0;
            for (NewsCollect newsCollect : lists) {
                if (StringUtils.isNotBlank(newsCollect.getAttachments())) {
                    // file extension of the attached image
                    String jpg = newsCollect.getAttachments().substring(newsCollect.getAttachments().lastIndexOf("."));
                    String timePre = newsCollect.getPublishtime();
                    if (StringUtils.isNotBlank(newsCollect.getPublishtime())) {
                        String[] timeSplit = newsCollect.getPublishtime().substring(0, 10).split("-");
                        System.out.println("时间---》" + Arrays.toString(timeSplit));
                        timePre = timeSplit[0] + timeSplit[1] + timeSplit[2];
                    }
                    String jpgName = timePre + "_" +
                            newsCollect.getPublishpersoninfo() + "_" +
                            newsCollect.getTitle().replaceAll("\\s*", "") + "_" + newsCollect.getNewssource() + jpg;
                    boolean b = DownloadFile.downloadA(newsCollect.getAttachments(), "E:\\images\\" + jpgName);
                    if (b) { // only record the local path after the download succeeded
                        newsCollect.setAttachmentsurl("images/" + jpgName);
                        System.out.println("成功下载了---》" + (++bs));
                    }
                    System.out.println("下载了---》" + (++as));
                }
            }
            System.out.println("数据量--》" + lists.size());
            // System.out.println("数据量内容--》" + lists.toString());
            // batch-save the records (all of them are valid at this point)
            newsCollectService.saveBatch(lists);
            CustomerPipelinefh.map.clear();
            System.out.println("清除map-->" + CustomerPipelinefh.map.size());
        }
    }
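    The entity classes FHResp and FHNewxVo are not included in the original post. The starter only relies on data.totalPage, data.items, and each item's url, so a minimal sketch of what they could look like follows; every field beyond those three, and the nested FHData class itself, is an assumption.

    // FHResp.java -- hypothetical sketch, only the fields the starter actually reads
    package com.tjp.www.xuexiangbanweb.entity;

    import java.util.List;

    public class FHResp {
        private FHData data;
        public FHData getData() { return data; }
        public void setData(FHData data) { this.data = data; }

        public static class FHData {
            private int totalPage;          // total number of result pages for the keyword
            private List<FHNewxVo> items;   // one entry per article in the search results
            public int getTotalPage() { return totalPage; }
            public void setTotalPage(int totalPage) { this.totalPage = totalPage; }
            public List<FHNewxVo> getItems() { return items; }
            public void setItems(List<FHNewxVo> items) { this.items = items; }
        }
    }

    // FHNewxVo.java -- hypothetical sketch in the same package
    public class FHNewxVo {
        private String url;   // protocol-relative detail-page link, e.g. "//news.ifeng.com/..."
        public String getUrl() { return url; }
        public void setUrl(String url) { this.url = url; }
    }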

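    The original post also does not show how doStartReptile() is triggered. In a Spring Boot project a common choice would be a scheduled job such as the hypothetical sketch below; the class name and cron expression are assumptions, and @EnableScheduling must be present on a configuration class.

    package com.tjp.www.xuexiangbanweb.webmagic;

    import org.springframework.beans.factory.annotation.Autowired;
    import org.springframework.scheduling.annotation.Scheduled;
    import org.springframework.stereotype.Component;

    // Hypothetical trigger: starts the crawl every day at 02:00
    @Component
    public class ReptileJob {

        @Autowired
        WebMagicStarterFenghuang webMagicStarterFenghuang;

        @Scheduled(cron = "0 0 2 * * ?")
        public void run() {
            webMagicStarterFenghuang.doStartReptile();
        }
    }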
    Pipeline component logic

    package com.tjp.www.xuexiangbanweb.webmagic;

    import cn.hutool.core.util.ObjectUtil;
    import com.tjp.www.xuexiangbanweb.entity.NewsCollect;
    import com.tjp.www.xuexiangbanweb.mapper.NewsCollectMapper;
    import org.apache.commons.lang3.StringUtils;
    import org.springframework.stereotype.Component;
    import us.codecraft.webmagic.ResultItems;
    import us.codecraft.webmagic.Task;
    import us.codecraft.webmagic.pipeline.Pipeline;
    import javax.annotation.Resource;
    import java.util.Map;
    import java.util.concurrent.ConcurrentHashMap;

    @Component
    public class CustomerPipelinefh implements Pipeline {

        @Resource
        NewsCollectMapper newsCollectMapper;

        // crawl results shared with the starter class, keyed by title + publish time so duplicates overwrite each other
        public static Map<String, NewsCollect> map = new ConcurrentHashMap<>();

        @Override
        public void process(ResultItems resultItems, Task task) {
            if (!ObjectUtil.isEmpty(resultItems.get("newsObject"))) {
                NewsCollect newsObject = resultItems.get("newsObject");
                if (StringUtils.isNotBlank(newsObject.getTitle())) {
                    map.put(newsObject.getTitle() + newsObject.getPublishtime(), newsObject);
                }
            }
        }
    }
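    Because the pipeline keys the shared map by title plus publish time, two results with the same title and time collapse into a single entry. A quick illustrative sketch of that behaviour follows; the demo class itself is hypothetical, and since this pipeline never touches the Task argument, passing null is acceptable for a local test.

    import com.tjp.www.xuexiangbanweb.entity.NewsCollect;
    import us.codecraft.webmagic.ResultItems;

    public class PipelineDedupDemo {
        public static void main(String[] args) {
            CustomerPipelinefh pipeline = new CustomerPipelinefh();

            NewsCollect news = new NewsCollect();
            news.setTitle("某标题");
            news.setPublishtime("2022-01-17 18:30");

            ResultItems items = new ResultItems();
            items.put("newsObject", news);

            pipeline.process(items, null); // first result is stored
            pipeline.process(items, null); // same title + time -> same key -> overwritten, not duplicated

            System.out.println(CustomerPipelinefh.map.size()); // prints 1
        }
    }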

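    NewsCollect, which both the pipeline above and the page processor below pass around, is not shown in the original post either. The sketch below reconstructs a plausible shape from the getters and setters used across the three classes; the Lombok @Data annotation and the field types are assumptions, and the real class likely also carries MyBatis-Plus annotations given the Mapper and saveBatch usage.

    package com.tjp.www.xuexiangbanweb.entity;

    import lombok.Data;

    // Hypothetical sketch: field list reconstructed from the setters/getters used in this post
    @Data
    public class NewsCollect {
        private Long id;
        private String title;
        private String publishpersoninfo;
        private String publishtime;
        private String newsinfo;
        private String attachments;      // remote image URL found in the article
        private String attachmentsurl;   // local relative path set after the image is downloaded
        private String newssource;
        private String linksnum;
        private String forwardnum;
        private String commentnum;
        private String eventtype;
        private String keyword;
        private String url;
    }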
    PageProcessor component logic
    package com.tjp.www.xuexiangbanweb.webmagic;

    import cn.hutool.core.util.IdUtil;
    import com.tjp.www.xuexiangbanweb.entity.NewsCollect;
    import org.apache.commons.lang3.StringUtils;
    import org.jsoup.nodes.Document;
    import org.jsoup.nodes.Element;
    import org.jsoup.select.Elements;
    import org.springframework.stereotype.Component;
    import us.codecraft.webmagic.Page;
    import us.codecraft.webmagic.Site;
    import us.codecraft.webmagic.processor.PageProcessor;

    @Component
    public class NewsPageProcessor2 implements PageProcessor {

        @Override
        public void process(Page page) { // process() is the core extraction hook: all custom crawl logic goes here
            detailPages(page);
        }

        public void detailPages(Page page) {
            // the keyword was attached to the request as an extra by the starter class
            String keyword = page.getRequest().getExtra("keyword").toString();
            Document document = page.getHtml().getDocument();
            String url = page.getRequest().getUrl();
            Elements otherBody = document.getElementsByClass("atlas-4Dz3AOM8");
            String title = document.getElementsByClass("title-14yWv8ay").text();
            String person = document.getElementsByClass("source-3cecBclA").text();
            String time = document.getElementsByClass("time-M6w87NaQ").text();
            Elements p1 = document.getElementsByClass("smallFont-Z_OfA44W\n" +
                    " text-20BABGxP\n" +
                    " ").select("p");
            String text = document.getElementsByClass("videoArea-3bC47kpL").text();
            // skip video pages; time strings look like "08/14 41:12" or "2021/18/01 17:52"
            if (!StringUtils.isNotBlank(text) && StringUtils.isNotBlank(time) && StringUtils.isNotBlank(p1.toString())) {
                String[] splitTime = time.split("/");
                if (splitTime.length < 3) {
                    // times without a year ("08/14 41:12") get a hard-coded 2021 prefix
                    time = "2021/" + time;
                }
                if ("2022".equals(time.split("/")[0]) || "2021".equals(time.split("/")[0])) {
                    String body = "";
                    for (Element element : p1) {
                        body += element.text();
                    }
                    // fall back to the photo-gallery layout when the normal article body is empty
                    if (!StringUtils.isNotBlank(body)) {
                        for (Element element : otherBody) {
                            body += element.getElementsByClass("atlasBox-2CqWytIX").next().text();
                        }
                    }
                    if (body.length() > 32767) {
                        body = body.substring(0, 32767);
                    }
                    // keep only the first image of the article as the attachment
                    String imgUrl = "";
                    Elements img = p1.select("img");
                    if (!img.isEmpty()) {
                        for (Element element : img) {
                            String src = element.attr("src");
                            imgUrl = src;
                            break;
                        }
                    }
                    NewsCollect newsCollect = new NewsCollect();
                    newsCollect.setId(IdUtil.getSnowflakeNextId());
                    newsCollect.setTitle(title.replaceAll("\\s*", ""));
                    newsCollect.setPublishpersoninfo(person);
                    newsCollect.setPublishtime(time.replaceAll("/", "-")); // e.g. "01/17 18:30" -> "01-17 18:30"
                    newsCollect.setNewsinfo(body);
                    newsCollect.setAttachments(imgUrl);
                    newsCollect.setNewssource("凤凰网");
                    newsCollect.setLinksnum("0");
                    newsCollect.setForwardnum("0");
                    newsCollect.setCommentnum("0");
                    newsCollect.setEventtype("文章");
                    newsCollect.setKeyword(keyword);
                    newsCollect.setUrl(url);
                    page.putField("newsObject", newsCollect);
                }
            }
        }

        @Override
        public Site getSite() { // part one: site-level crawl settings such as charset, crawl interval, and retry count
            return Site.me().setRetryTimes(10)
                    .setCharset("utf-8").setTimeOut(6000)
                    .setUserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.88 Safari/537.36");
        }
    }
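    To debug the processor against a single article page without running the whole keyword loop, it can be started on its own with WebMagic's built-in ConsolePipeline. The sketch below is hypothetical: the detail-page URL is a placeholder, and the "keyword" extra must be set because detailPages() reads it.

    import java.util.Map;
    import java.util.concurrent.ConcurrentHashMap;
    import us.codecraft.webmagic.Request;
    import us.codecraft.webmagic.Spider;
    import us.codecraft.webmagic.pipeline.ConsolePipeline;

    public class NewsPageProcessor2Demo {
        public static void main(String[] args) {
            Map<String, Object> extras = new ConcurrentHashMap<>();
            extras.put("keyword", "中国海军"); // required: detailPages() reads this extra

            // placeholder URL -- replace with a real article link before running
            Request request = new Request("https://news.ifeng.com/c/xxxxxxxx");
            request.setExtras(extras);

            Spider.create(new NewsPageProcessor2())
                    .addRequest(request)
                    .addPipeline(new ConsolePipeline()) // print the extracted fields instead of collecting them
                    .thread(1)
                    .run();
        }
    }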

  • Original article: https://blog.csdn.net/qq_37831937/article/details/139266479