• .NET 爬虫从入门到入狱


    目录

    前言

    1.💡使用HttpClient爬取数据

    2.🚀模拟User-Agent

    3.🤵使用HTML解析库

    3.👌前端Price显示

    4.🌱运行实例 获取金价Au

    5.🧾使用正则表达式解析

    6.💫获取BTC价格

     7.✨获取CSDN热点

    8.🎃 获取编程语言排行榜

    9.🖥️获取小破站热门

    10.⭐获取某音热门 


    前言

    爬虫可以用于外汇,期货,基金,货币,比分,电商,文章等信息的采集。通过数据分析,竞品分析,为商业决策提供数据支持。

    金价暴涨,抓取下今日的金价。

    1.💡使用HttpClient爬取数据

    1. try
    2. {
    3. // 创建HttpClient实例
    4. using (var httpClient = new HttpClient())
    5. {
    6. //模拟User-Agent
    7. httpClient.DefaultRequestHeaders.Add("User-Agent", GetUserAgent());
    8. // 发送GET请求并获取响应 xxx.com是某网站的页面~(保护)
    9. var response = await httpClient.GetAsync("https://xxx.com");
    10. response.EnsureSuccessStatusCode();
    11. var htmlContent = await response.Content.ReadAsStringAsync();
    12. }
    13. }
    14. catch (HttpRequestException ex)
    15. {
    16. // 处理请求异常
    17. ViewBag.Error = "Failed to retrieve price data: " + ex.Message;
    18. }

    2.🚀模拟User-Agent

    1. // 生成随机 User-Agent
    2. private string GetUserAgent()
    3. {
    4. string[] userAgents = new string[]
    5. {
    6. "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36",
    7. "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36",
    8. //
    9. };
    10. Random random = new Random();
    11. int index = random.Next(userAgents.Length);
    12. return userAgents[index];
    13. }

    3.🤵使用HTML解析库

    (HtmlAgilityPack)提取价格

    1. // 使用HtmlAgilityPack解析HTML
    2. var htmlDocument = new HtmlDocument();
    3. htmlDocument.LoadHtml(htmlContent);
    4. // 使用XPath表达式选择指定的元素
    5. var trElement = htmlDocument.DocumentNode.SelectSingleNode("//tr[contains(@class, 'border_ea') and contains(@class, 'noTop_border')]");
    6. if (trElement != null)
    7. {
    8. // 获取包含价格的元素
    9. var priceCell = trElement.SelectSingleNode(".//td[@align='center'][2]");
    10. if (priceCell != null)
    11. {
    12. // 提取价格
    13. string price = priceCell.InnerText.Trim();
    14. // 将价格传递给视图
    15. ViewBag.Price = price;
    16. }
    17. else
    18. {
    19. ViewBag.Error = "Price cell not found in table row.";
    20. }
    21. }
    22. else
    23. {
    24. ViewBag.Error = "Table row not found.";
    25. }

    也可以用正则表达式提取

    3.👌前端Price显示

    1. <h1>Au99.99 Price: @ViewBag.Price</h1>
    2. <h2>@ViewBag.Error</h2>

    4.🌱运行实例 获取金价Au

    也可以通过修改规则取实时Pt的价格

    1. // 使用XPath表达式选择指定的元素
    2. var trElements = htmlDocument.DocumentNode.SelectNodes("//tr[contains(@class, 'border_ea') and contains(@class, 'noTop_border ')]");
    3. if (trElements != null)
    4. {
    5. foreach (var trElement in trElements)
    6. {
    7. // 获取包含产品名称和价格的元素
    8. var tdElements = trElement.SelectNodes("./td");
    9. if (tdElements != null && tdElements.Count >= 5)
    10. {
    11. // 获取产品名称、最新价、最高价、最低价和今开盘价格
    12. string productName = tdElements[0].InnerText.Trim();
    13. string latestPrice = tdElements[1].InnerText.Trim();
    14. string highestPrice = tdElements[2].InnerText.Trim();
    15. string lowestPrice = tdElements[3].InnerText.Trim();
    16. string openingPrice = tdElements[4].InnerText.Trim();
    17. // 检查产品名称是否为Au99.99或Pt99.95
    18. if (productName == "Au99.99")
    19. {
    20. // 将Au99.99价格信息传递给视图
    21. ViewBag.AuLatestPrice = latestPrice;
    22. ViewBag.AuHighestPrice = highestPrice;
    23. ViewBag.AuLowestPrice = lowestPrice;
    24. ViewBag.AuOpeningPrice = openingPrice;
    25. }
    26. else if (productName == "Pt99.95")
    27. {
    28. // 将Pt99.95价格信息传递给视图
    29. ViewBag.PtLatestPrice = latestPrice;
    30. ViewBag.PtHighestPrice = highestPrice;
    31. ViewBag.PtLowestPrice = lowestPrice;
    32. ViewBag.PtOpeningPrice = openingPrice;
    33. }
    34. }
    35. }
    36. }

    结果如下:

    5.🧾使用正则表达式解析

    通过httpClient请求

    1. public async Task<ActionResult> Index()
    2. {
    3. // 获取当前时间的时间戳
    4. long timestamp = DateTimeOffset.Now.ToUnixTimeMilliseconds();
    5. // 构建URL
    6. string url = $"http://www.xxx.cn/xx.js?t={timestamp}";
    7. // 创建HttpClient实例
    8. using (var httpClient = new HttpClient())
    9. {
    10. // 设置 User-Agent
    11. httpClient.DefaultRequestHeaders.Add("User-Agent", GetUserAgent());
    12. try
    13. {
    14. // 发送GET请求
    15. var response = await httpClient.GetAsync(url);
    16. response.EnsureSuccessStatusCode();
    17. // 读取返回的数据
    18. var responseData = await response.Content.ReadAsStringAsync();
    19. // 解析返回的数据
    20. var price = ParseGoldPrice(responseData);
    21. // 将价格传递给视图
    22. ViewBag.Price = price;
    23. }
    24. catch (HttpRequestException ex)
    25. {
    26. // 处理请求异常
    27. ViewBag.Error = "Failed to retrieve gold price data.";
    28. }
    29. }
    30. // 返回视图
    31. return View();
    32. }

    处理返回价格 

    通过正则表达式匹配数据

    1. private decimal ParseGoldPrice(string responseData)
    2. {
    3. string price = "";
    4. decimal price2 = 0;
    5. string data = responseData;//
    6. // 匹配价格的正则表达式
    7. string pattern = @"var hq_str_gds_AUTD=""([^""]+)"";";
    8. // 使用正则表达式匹配数据
    9. Match match = Regex.Match(data, pattern);
    10. if (match.Success)
    11. {
    12. // 获取匹配到的价格数据
    13. string priceData = match.Groups[1].Value;
    14. // 使用逗号分割数据,取第一个元素作为价格
    15. string[] priceParts = priceData.Split(',');
    16. price = priceParts[0];
    17. // 将字符串价格转换为decimal类型
    18. price2 = decimal.Parse(price);
    19. // 输出提取到的价格
    20. Console.WriteLine("Gold Price: " + price2);
    21. }
    22. else
    23. {
    24. Console.WriteLine("Price not found in data.");
    25. }
    26. // decimal price = decimal.Parse(priceString);
    27. return price2;
    28. }

    6.💫获取BTC价格

    Headers模拟cookie获取BTC价格

    1. httpClient.DefaultRequestHeaders.Add("User-Agent", GetUserAgent());
    2. httpClient.DefaultRequestHeaders.Add("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7");
    3. httpClient.DefaultRequestHeaders.Add("Accept-Language", "zh-CN,zh;q=0.9");
    4. httpClient.DefaultRequestHeaders.Add("Cache-Control", "max-age=0");
    5. //cookie
    6. httpClient.DefaultRequestHeaders.Add("Cookie", "__51uvsct__3ExGyQaAoNSqsSUY=1; __51vcke__3ExGyQaAoNSqsSUY=c91184d5-8826-5ea8-8ddc-a0f3b85c9470; __51vuft__3ExGyQaAoNSqsSUY=1713248151570; PHPSESSID=a6ff05p79bbf3ot4ohphein0e1; Hm_lvt_1605442054faab140873b7c14e40c707=1713248152; Hm_lvt_4820535acfded9186d46b7ae0c829918=1713248152; __vtins__3ExGyQaAoNSqsSUY=%7B%22sid%22%3A%20%22757fcc49-fe4e-5985-bc10-5a0f6ef6d6ba%22%2C%20%22vd%22%3A%206%2C%20%22stt%22%3A%201175754%2C%20%22dr%22%3A%20271898%2C%20%22expires%22%3A%201713251127320%2C%20%22ct%22%3A%201713249327320%7D; Hm_lpvt_1605442054faab140873b7c14e40c707=1713249327; Hm_lpvt_4820535acfded9186d46b7ae0c829918=1713249328");
    7. httpClient.DefaultRequestHeaders.Add("Sec-Ch-Ua", "\"Google Chrome\";v=\"123\", \"Not:A-Brand\";v=\"8\", \"Chromium\";v=\"123\"");
    8. httpClient.DefaultRequestHeaders.Add("Sec-Ch-Ua-Mobile", "?0");
    9. httpClient.DefaultRequestHeaders.Add("Sec-Ch-Ua-Platform", "\"Windows\"");
    10. httpClient.DefaultRequestHeaders.Add("Sec-Fetch-Dest", "document");
    11. httpClient.DefaultRequestHeaders.Add("Sec-Fetch-Mode", "navigate");
    12. httpClient.DefaultRequestHeaders.Add("Sec-Fetch-Site", "none");
    13. httpClient.DefaultRequestHeaders.Add("Sec-Fetch-User", "?1");
    14. httpClient.DefaultRequestHeaders.Add("Upgrade-Insecure-Requests", "1");
    1. //匹配
    2. var currencyElement =
    3. htmlDocument.DocumentNode.SelectSingleNode("//div[@id='hr_app_cid_1']");
    4. if (currencyElement != null)
    5. {
    6. // 查找包含人民币价格的元素
    7. var priceElement = currencyElement.SelectSingleNode(".//div[@class='virtual overflow']");
    8. if (priceElement != null)
    9. {
    10. // 提取人民币价格
    11. var chinesePrice = priceElement.InnerText.Trim();
    12. }
    13. }

    美元价格也是类似的规则,运行结果

     7.✨获取CSDN热点

    模拟请求,修改规则

    1. var httpClient = new HttpClient();
    2. var referer = "xxx";
    3. httpClient.DefaultRequestHeaders.Add("referer", referer);
    4. httpClient.DefaultRequestHeaders.Add("User-Agent", GetUserAgent());
    5. var response = await httpClient.GetAsync("xxx/");
    6. response.EnsureSuccessStatusCode();
    7. var responseBody = await response.Content.ReadAsStringAsync();
    8. var contentList = new List<Tuple<string, string, string>>();
    9. var htmlDocument = new HtmlDocument();
    10. htmlDocument.LoadHtml(responseBody);
    11. // var itemNodes = htmlDocument.DocumentNode.SelectNodes("//div[contains(@class, 'headswiper-item')]");
    12. var items = htmlDocument.DocumentNode.SelectNodes("//div[@class='headswiper-item']");
    13. if (items != null)
    14. {
    15. foreach (var item in items)
    16. {
    17. var titleNode = item.SelectSingleNode(".//a[@class='title']");
    18. var nameNode = item.SelectSingleNode(".//p[@class='name']");
    19. var linkNode = item.SelectSingleNode(".//a[@class='title']");
    20. if (titleNode != null && nameNode != null && linkNode != null)
    21. {
    22. var title = titleNode.InnerText.Trim();
    23. var name = nameNode.InnerText.Trim();
    24. var link = linkNode.GetAttributeValue("href", "");
    25. contentList.Add(new Tuple<string, string, string>(title, name, link));
    26. }
    27. }
    28. }
    29. return View(contentList);

    or 使用正则

    1. // 使用正则表达式匹配所有符合条件的 div 元素内容
    2. var regex = new Regex(@"
      ", RegexOptions.Singleline);
    3. var matches = regex.Matches(responseBody);
    4. foreach (Match match in matches)
    5. {
    6. //去除Vuehtml
    7. string s1temp = match.Groups[1].Value.Trim();
    8. int i1 = s1temp.IndexOf(">")+1;
    9. int i2 = s1temp.Length - i1 - 1;
    10. string hotstr = s1temp.Substring(i1, i2);
    11. contentList.Add(hotstr);
    12. }

    运行结果

    8.🎃 获取编程语言排行榜

    1. var response = await httpClient.GetAsync("xxx");
    2. response.EnsureSuccessStatusCode();
    3. var htmlContent = await response.Content.ReadAsStringAsync();
    4. var htmlDocument = new HtmlDocument();
    5. htmlDocument.LoadHtml(htmlContent);
    6. // 获取表格内容
    7. var table = htmlDocument.DocumentNode.SelectSingleNode("//table[@class='w-min min-w-full table-fixed divide-y-2 divide-gray-200 text-sm dark:divide-gray-700']");
    8. ViewBag.TableContent = table?.OuterHtml;

    9.🖥️获取小破站热门

    1. var httpClient = new HttpClient();
    2. var referer = "https://xxx";
    3. httpClient.DefaultRequestHeaders.Add("Referer", referer);
    4. httpClient.DefaultRequestHeaders.Add("User-Agent", GetUserAgent());
    5. var response = await httpClient.GetAsync("https://xxx");
    6. response.EnsureSuccessStatusCode();
    7. var responseBody = await response.Content.ReadAsStringAsync();
    8. var jsonRes = Newtonsoft.Json.JsonConvert.DeserializeObject<dynamic>(responseBody);
    9. var tempArr = new List<MItem>();
    10. int itid = 1;
    11. foreach (var itemj in jsonRes.data.list)
    12. {
    13. string tt = itemj.keyword.ToString();
    14. // string hot = itemj.hot_value.ToString();
    15. tempArr.Add(new MItem
    16. {
    17. Index = itid++,
    18. Title = tt,
    19. Hot = "",
    20. Url = "https://search.bilibili.com/all?keyword=" + tt+ "&order=click",
    21. MobileUrl = "https://search.bilibili.com/all?keyword=" + tt+ "&order=click"
    22. });
    23. }
    24. var md = new MData
    25. {
    26. Success = true,
    27. Title = "小破站",
    28. Subtitle = "热搜榜",
    29. UpdateTime = DateTime.Now.ToString("yyyy-MM-dd hh:mm:ss"),
    30. Data = tempArr
    31. };
    1. public class MData
    2. {
    3. public bool Success { get; set; }
    4. public string Title { get; set; }
    5. public string Subtitle { get; set; }
    6. public string UpdateTime { get; set; }
    7. public List<MItem> Data { get; set; }
    8. }
    1. public class MItem
    2. {
    3. public int Index { get; set; }
    4. public string Title { get; set; }
    5. public string Hot { get; set; }
    6. public string Url { get; set; }
    7. public string MobileUrl { get; set; }
    8. }

    10.⭐获取某音热门 

    1. var httpClient = new HttpClient();
    2. var referer = "xxx";
    3. httpClient.DefaultRequestHeaders.Add("referer", referer);
    4. httpClient.DefaultRequestHeaders.Add("User-Agent", GetUserAgent());
    5. var response = await httpClient.GetAsync("xxx");
    6. response.EnsureSuccessStatusCode();
    7. var responseBody = await response.Content.ReadAsStringAsync();
    8. var jsonRes = Newtonsoft.Json.JsonConvert.DeserializeObject<dynamic>(responseBody);
    9. var tempArr = new List<MItem>();
    10. int itid = 1;
    11. foreach (var itemj in jsonRes.word_list)
    12. {
    13. string tt = itemj.word.ToString();
    14. string hot =itemj.hot_value.ToString();
    15. tempArr.Add(new MItem
    16. {
    17. Index = itid++,
    18. Title = tt,
    19. Hot = hot,
    20. Url = "https://www.douyin.com/search/" + tt,
    21. MobileUrl = "https://www.douyin.com/search/" + tt
    22. });
    23. }
    24. var douyinData = new MData
    25. {
    26. Success = true,
    27. Title = "某音",
    28. Subtitle = "热搜榜",
    29. UpdateTime = DateTime.Now.ToString("yyyy-MM-dd hh:mm:ss"),
    30. Data = tempArr
    31. };

    注意:如果目标有IP限制、Cookie、签名等,则需编写相应的对策。

    此外,还可以通过WinForm WebBrowser控件实现加载完DOM再匹配信息实现爬取效果。

    这里只做抛砖引玉,后续是否入狱自行探索,请勿使用此技术做违法的事情!!!💀 

    在使用爬虫技术时,必须遵守相关法律法规,尊重网站经营者的权益,避免对网站或系统造成干扰或破坏。同时,对于涉及公民个人信息的爬虫行为,应格外谨慎,确保不侵犯他人的隐私权。

    END

  • 相关阅读:
    Angular RouterModule.forRoot(ROUTES) 和 forChild(ROUTES)的区别
    微信小程序_20,使用第三方npm包
    HTML+CSS网页设计期末课程大作业 【茶叶文化网站设计题材】web前端开发技术 web课程设计 网页规划与设计
    AVL的代码剖析(c++)
    nginx子域名配置单独的日志文件
    MySQL面试题全解析:准备面试所需的关键知识点和实战经验
    水利水电工程资质怎么办理,水利水电工程施工总承包三级资质办理条件有哪些
    论文速览 | arxiv 2023, 马氏距离感知训练在分布外检测中的应用
    NoSQL之Redis配置与优化
    多姿多彩的编程世界之配色方案
  • 原文地址:https://blog.csdn.net/lmnotlm/article/details/137811018