目录
爬虫可以用于外汇,期货,基金,货币,比分,电商,文章等信息的采集。通过数据分析,竞品分析,为商业决策提供数据支持。
金价暴涨,抓取下今日的金价。
- // Download the page HTML; a failed request is reported through ViewBag.Error.
- try
- {
- // Create the HttpClient instance
- using (var httpClient = new HttpClient())
- {
- // Send a browser-like User-Agent header
- httpClient.DefaultRequestHeaders.Add("User-Agent", GetUserAgent());
- // Send the GET request and read the response; xxx.com stands in for the real site (redacted by the author)
- var response = await httpClient.GetAsync("https://xxx.com");
- response.EnsureSuccessStatusCode();
- var htmlContent = await response.Content.ReadAsStringAsync();
-
-
- }
- }
- catch (HttpRequestException ex)
- {
- // Handle request failures (EnsureSuccessStatusCode throws HttpRequestException for non-success status codes)
- ViewBag.Error = "Failed to retrieve price data: " + ex.Message;
- }
// Candidate User-Agent strings, allocated once instead of on every call.
private static readonly string[] UserAgents =
{
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36",
};

// A single shared generator: on runtimes where new Random() is seeded from the clock,
// creating one per call can return the same value for calls made in quick succession.
private static readonly Random UserAgentRandom = new Random();

/// <summary>
/// Picks one of the known browser User-Agent strings at random.
/// </summary>
/// <returns>A User-Agent header value.</returns>
private string GetUserAgent()
{
    int index;

    // System.Random is not thread-safe, so guard the shared instance.
    lock (UserAgentRandom)
    {
        index = UserAgentRandom.Next(UserAgents.Length);
    }

    return UserAgents[index];
}
(HtmlAgilityPack)提取价格
- // Parse the downloaded HTML with HtmlAgilityPack
- var htmlDocument = new HtmlDocument();
- htmlDocument.LoadHtml(htmlContent);
-
- // Select the target <tr> element with an XPath expression (first row whose class contains both tokens)
- var trElement = htmlDocument.DocumentNode.SelectSingleNode("//tr[contains(@class, 'border_ea') and contains(@class, 'noTop_border')]");
-
- if (trElement != null)
- {
- // Get the <td> element that holds the price (second centre-aligned cell of the row)
- var priceCell = trElement.SelectSingleNode(".//td[@align='center'][2]");
-
- if (priceCell != null)
- {
- // Extract the price text
- string price = priceCell.InnerText.Trim();
-
- // Pass the price to the view
- ViewBag.Price = price;
- }
- else
- {
- ViewBag.Error = "Price cell not found in table row.";
- }
- }
- else
- {
- ViewBag.Error = "Table row not found.";
- }
也可以用正则表达式提取
3.👌前端Price显示
- <h1>Au99.99 Price: @ViewBag.Price</h1>
- <h2>@ViewBag.Error</h2>
4.🌱运行实例 获取金价Au
也可以通过修改规则取实时Pt的价格
- // Select every matching <tr> element with an XPath expression
- // NOTE(review): 'noTop_border ' carries a trailing space, so contains() only matches when that
- // class name is followed by a space inside the attribute value — confirm this is intended.
- var trElements = htmlDocument.DocumentNode.SelectNodes("//tr[contains(@class, 'border_ea') and contains(@class, 'noTop_border ')]");
-
- if (trElements != null)
- {
- foreach (var trElement in trElements)
- {
- // Get the <td> elements holding the product name and prices
- var tdElements = trElement.SelectNodes("./td");
- if (tdElements != null && tdElements.Count >= 5)
- {
- // Read product name, latest, highest, lowest and opening price from the first five cells
- string productName = tdElements[0].InnerText.Trim();
- string latestPrice = tdElements[1].InnerText.Trim();
- string highestPrice = tdElements[2].InnerText.Trim();
- string lowestPrice = tdElements[3].InnerText.Trim();
- string openingPrice = tdElements[4].InnerText.Trim();
-
- // Only the Au99.99 and Pt99.95 rows are passed on; other products are ignored
- if (productName == "Au99.99")
- {
- // Pass the Au99.99 prices to the view
- ViewBag.AuLatestPrice = latestPrice;
- ViewBag.AuHighestPrice = highestPrice;
- ViewBag.AuLowestPrice = lowestPrice;
- ViewBag.AuOpeningPrice = openingPrice;
- }
- else if (productName == "Pt99.95")
- {
- // Pass the Pt99.95 prices to the view
- ViewBag.PtLatestPrice = latestPrice;
- ViewBag.PtHighestPrice = highestPrice;
- ViewBag.PtLowestPrice = lowestPrice;
- ViewBag.PtOpeningPrice = openingPrice;
- }
- }
- }
- }
结果如下:
5.🧾使用正则表达式解析
通过httpClient请求
// Shared for the lifetime of the process: creating and disposing an HttpClient
// for every request can exhaust sockets under load.
private static readonly HttpClient GoldPriceHttpClient = new HttpClient();

/// <summary>
/// Downloads the quote script, extracts the gold price and hands it to the view.
/// </summary>
/// <returns>
/// The default view. <c>ViewBag.Price</c> holds the parsed price on success;
/// <c>ViewBag.Error</c> holds a message when the download or parsing fails.
/// </returns>
public async Task<ActionResult> Index()
{
    // Current time as a Unix millisecond timestamp, sent as the "t" query parameter
    // (presumably a cache-buster — confirm against the target endpoint).
    long timestamp = DateTimeOffset.Now.ToUnixTimeMilliseconds();
    string url = $"http://www.xxx.cn/xx.js?t={timestamp}";

    try
    {
        using (var request = new HttpRequestMessage(HttpMethod.Get, url))
        {
            // Set the User-Agent on the request, not on the shared client's default
            // headers, so values do not accumulate across calls.
            request.Headers.Add("User-Agent", GetUserAgent());

            using (var response = await GoldPriceHttpClient.SendAsync(request))
            {
                response.EnsureSuccessStatusCode();

                var responseData = await response.Content.ReadAsStringAsync();

                // Parse the payload and pass the price to the view.
                ViewBag.Price = ParseGoldPrice(responseData);
            }
        }
    }
    catch (Exception ex) when (ex is HttpRequestException      // network error or non-success status
                               || ex is TaskCanceledException  // HttpClient timeout
                               || ex is FormatException        // payload price is not numeric
                               || ex is OverflowException)     // payload price is out of range
    {
        ViewBag.Error = "Failed to retrieve gold price data.";
    }

    return View();
}
处理返回价格
通过正则表达式匹配数据
/// <summary>
/// Extracts the gold price from a response of the form
/// <c>var hq_str_gds_AUTD="field0,field1,...";</c>.
/// </summary>
/// <param name="responseData">Raw response text; may be null or empty.</param>
/// <returns>
/// The first comma-separated field parsed as a decimal, or 0 when the input is
/// null/empty, the expected assignment is missing, or the field is not a number.
/// </returns>
private decimal ParseGoldPrice(string responseData)
{
    if (string.IsNullOrEmpty(responseData))
    {
        Console.WriteLine("Price not found in data.");
        return 0;
    }

    // Captures everything between the quotes of the hq_str_gds_AUTD assignment.
    string pattern = @"var hq_str_gds_AUTD=""([^""]+)"";";
    Match match = Regex.Match(responseData, pattern);

    if (!match.Success)
    {
        Console.WriteLine("Price not found in data.");
        return 0;
    }

    // The payload is comma-separated; the first field is the price.
    string priceText = match.Groups[1].Value.Split(',')[0];

    // Parse with the invariant culture: the feed uses '.' as the decimal separator,
    // which a server running under a ',' decimal culture would misread.
    decimal price;
    bool parsed = decimal.TryParse(
        priceText,
        System.Globalization.NumberStyles.Number,
        System.Globalization.CultureInfo.InvariantCulture,
        out price);

    if (!parsed)
    {
        Console.WriteLine("Price not found in data.");
        return 0;
    }

    Console.WriteLine("Gold Price: " + price);
    return price;
}
6.💫获取BTC价格
Headers模拟cookie获取BTC价格
-
- // Send browser-like request headers (User-Agent, Accept*, Sec-*) plus a session cookie.
- httpClient.DefaultRequestHeaders.Add("User-Agent", GetUserAgent());
-
- httpClient.DefaultRequestHeaders.Add("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7");
- httpClient.DefaultRequestHeaders.Add("Accept-Language", "zh-CN,zh;q=0.9");
- httpClient.DefaultRequestHeaders.Add("Cache-Control", "max-age=0");
- // Cookie header
- // NOTE(review): this is a hard-coded cookie string that includes a PHPSESSID and time-stamped
- // analytics values; it will presumably expire — consider loading it from configuration.
- httpClient.DefaultRequestHeaders.Add("Cookie", "__51uvsct__3ExGyQaAoNSqsSUY=1; __51vcke__3ExGyQaAoNSqsSUY=c91184d5-8826-5ea8-8ddc-a0f3b85c9470; __51vuft__3ExGyQaAoNSqsSUY=1713248151570; PHPSESSID=a6ff05p79bbf3ot4ohphein0e1; Hm_lvt_1605442054faab140873b7c14e40c707=1713248152; Hm_lvt_4820535acfded9186d46b7ae0c829918=1713248152; __vtins__3ExGyQaAoNSqsSUY=%7B%22sid%22%3A%20%22757fcc49-fe4e-5985-bc10-5a0f6ef6d6ba%22%2C%20%22vd%22%3A%206%2C%20%22stt%22%3A%201175754%2C%20%22dr%22%3A%20271898%2C%20%22expires%22%3A%201713251127320%2C%20%22ct%22%3A%201713249327320%7D; Hm_lpvt_1605442054faab140873b7c14e40c707=1713249327; Hm_lpvt_4820535acfded9186d46b7ae0c829918=1713249328");
- httpClient.DefaultRequestHeaders.Add("Sec-Ch-Ua", "\"Google Chrome\";v=\"123\", \"Not:A-Brand\";v=\"8\", \"Chromium\";v=\"123\"");
- httpClient.DefaultRequestHeaders.Add("Sec-Ch-Ua-Mobile", "?0");
- httpClient.DefaultRequestHeaders.Add("Sec-Ch-Ua-Platform", "\"Windows\"");
- httpClient.DefaultRequestHeaders.Add("Sec-Fetch-Dest", "document");
- httpClient.DefaultRequestHeaders.Add("Sec-Fetch-Mode", "navigate");
- httpClient.DefaultRequestHeaders.Add("Sec-Fetch-Site", "none");
- httpClient.DefaultRequestHeaders.Add("Sec-Fetch-User", "?1");
- httpClient.DefaultRequestHeaders.Add("Upgrade-Insecure-Requests", "1");
-
- // Locate the currency container by its id
- var currencyElement =
- htmlDocument.DocumentNode.SelectSingleNode("//div[@id='hr_app_cid_1']");
- if (currencyElement != null)
- {
- // Find the element that holds the RMB (CNY) price; @class must equal 'virtual overflow' exactly
- var priceElement = currencyElement.SelectSingleNode(".//div[@class='virtual overflow']");
- if (priceElement != null)
- {
- // Extract the RMB price text
- // NOTE(review): chinesePrice is not used within this snippet — presumably consumed by code not shown.
- var chinesePrice = priceElement.InnerText.Trim();
- }
-
- }
美元价格也是类似的规则,运行结果
7.✨获取CSDN热点
模拟请求,修改规则
// Download the page with a Referer and a browser-like User-Agent.
var httpClient = new HttpClient();
var referer = "xxx";
httpClient.DefaultRequestHeaders.Add("referer", referer);
httpClient.DefaultRequestHeaders.Add("User-Agent", GetUserAgent());
var response = await httpClient.GetAsync("xxx/");
response.EnsureSuccessStatusCode();
var responseBody = await response.Content.ReadAsStringAsync();

// One (title, author name, link) tuple per carousel item.
var contentList = new List<Tuple<string, string, string>>();

var htmlDocument = new HtmlDocument();
htmlDocument.LoadHtml(responseBody);

// @class='headswiper-item' is an exact match on the whole attribute value; switch to
// contains(@class, 'headswiper-item') if the elements carry additional classes.
var items = htmlDocument.DocumentNode.SelectNodes("//div[@class='headswiper-item']");

if (items != null)
{
    foreach (var item in items)
    {
        // The same <a class="title"> anchor supplies both the title text and the link.
        var titleNode = item.SelectSingleNode(".//a[@class='title']");
        var nameNode = item.SelectSingleNode(".//p[@class='name']");

        // Skip items that lack either element.
        if (titleNode == null || nameNode == null)
        {
            continue;
        }

        var title = titleNode.InnerText.Trim();
        var name = nameNode.InnerText.Trim();
        var link = titleNode.GetAttributeValue("href", "");
        contentList.Add(new Tuple<string, string, string>(title, name, link));
    }
}
return View(contentList);
or 使用正则
-
// Match every target <div> element with a regular expression.
// NOTE(review): the pattern literal below is empty — it appears to have been lost when the
// article was extracted. It must contain at least one capture group, because Groups[1]
// is read for every match. Restore the real pattern before using this snippet.
var regex = new Regex(@"", RegexOptions.Singleline);
var matches = regex.Matches(responseBody);
foreach (Match match in matches)
{
    // Strip the surrounding markup: keep the text after the first '>' and drop the
    // final character of the capture.
    string captured = match.Groups[1].Value.Trim();
    int start = captured.IndexOf('>') + 1;
    int length = captured.Length - start - 1;

    // An empty capture, or one ending in '>', leaves nothing to extract; Substring
    // would throw ArgumentOutOfRangeException on the negative length.
    if (length < 0)
    {
        continue;
    }

    contentList.Add(captured.Substring(start, length));
}
运行结果
8.🎃 获取编程语言排行榜
- // Download the ranking page and hand its table markup to the view.
- var response = await httpClient.GetAsync("xxx");
- response.EnsureSuccessStatusCode();
- var htmlContent = await response.Content.ReadAsStringAsync();
-
- var htmlDocument = new HtmlDocument();
- htmlDocument.LoadHtml(htmlContent);
-
- // Select the table; @class must equal this whole class list exactly
- var table = htmlDocument.DocumentNode.SelectSingleNode("//table[@class='w-min min-w-full table-fixed divide-y-2 divide-gray-200 text-sm dark:divide-gray-700']");
-
- // TableContent is null when the table was not found (null-conditional access)
- ViewBag.TableContent = table?.OuterHtml;
9.🖥️获取小破站热门
// Download the hot-search JSON with a Referer and a browser-like User-Agent.
var httpClient = new HttpClient();
var referer = "https://xxx";
httpClient.DefaultRequestHeaders.Add("Referer", referer);
httpClient.DefaultRequestHeaders.Add("User-Agent", GetUserAgent());
var response = await httpClient.GetAsync("https://xxx");
response.EnsureSuccessStatusCode();
var responseBody = await response.Content.ReadAsStringAsync();
var jsonRes = Newtonsoft.Json.JsonConvert.DeserializeObject<dynamic>(responseBody);
var tempArr = new List<MItem>();

// 1-based rank assigned in the order the entries appear in data.list.
int itid = 1;
foreach (var itemj in jsonRes.data.list)
{
    string tt = itemj.keyword.ToString();

    // Escape the keyword so spaces, '&', '#', '+' etc. cannot break the query string.
    string searchUrl = "https://search.bilibili.com/all?keyword=" + Uri.EscapeDataString(tt) + "&order=click";

    // Hot is left empty for this source.
    tempArr.Add(new MItem
    {
        Index = itid++,
        Title = tt,
        Hot = "",
        Url = searchUrl,
        MobileUrl = searchUrl
    });
}

var md = new MData
{
    Success = true,
    Title = "小破站",
    Subtitle = "热搜榜",
    // "HH" is the 24-hour clock; "hh" is 12-hour and ambiguous without an AM/PM designator.
    UpdateTime = DateTime.Now.ToString("yyyy-MM-dd HH:mm:ss"),
    Data = tempArr
};
/// <summary>
/// Envelope for one hot-list result: status, headings, update time and the ranked items.
/// </summary>
public class MData
{
    /// <summary>Whether the list was retrieved successfully.</summary>
    public bool Success { get; set; }

    /// <summary>Name of the source site.</summary>
    public string Title { get; set; }

    /// <summary>Name of the list within the source site.</summary>
    public string Subtitle { get; set; }

    /// <summary>Time the list was built, already formatted as text.</summary>
    public string UpdateTime { get; set; }

    /// <summary>The ranked entries.</summary>
    public List<MItem> Data { get; set; }
}

/// <summary>
/// A single entry of a hot list.
/// </summary>
public class MItem
{
    /// <summary>Rank of the entry within the list.</summary>
    public int Index { get; set; }

    /// <summary>Keyword or headline of the entry.</summary>
    public string Title { get; set; }

    /// <summary>Popularity value as text; empty when the source does not supply one.</summary>
    public string Hot { get; set; }

    /// <summary>Link to the entry.</summary>
    public string Url { get; set; }

    /// <summary>Link to the entry for mobile clients.</summary>
    public string MobileUrl { get; set; }
}
-
10.⭐获取某音热门
// Download the hot-search JSON with a Referer and a browser-like User-Agent.
var httpClient = new HttpClient();
var referer = "xxx";
httpClient.DefaultRequestHeaders.Add("referer", referer);
httpClient.DefaultRequestHeaders.Add("User-Agent", GetUserAgent());
var response = await httpClient.GetAsync("xxx");
response.EnsureSuccessStatusCode();
var responseBody = await response.Content.ReadAsStringAsync();
var jsonRes = Newtonsoft.Json.JsonConvert.DeserializeObject<dynamic>(responseBody);
var tempArr = new List<MItem>();

// 1-based rank assigned in the order the entries appear in word_list.
int itid = 1;
foreach (var itemj in jsonRes.word_list)
{
    string tt = itemj.word.ToString();
    string hot = itemj.hot_value.ToString();

    // Escape the keyword so spaces, '/', '?', '#' etc. cannot break the URL path.
    string searchUrl = "https://www.douyin.com/search/" + Uri.EscapeDataString(tt);

    tempArr.Add(new MItem
    {
        Index = itid++,
        Title = tt,
        Hot = hot,
        Url = searchUrl,
        MobileUrl = searchUrl
    });
}

var douyinData = new MData
{
    Success = true,
    Title = "某音",
    Subtitle = "热搜榜",
    // "HH" is the 24-hour clock; "hh" is 12-hour and ambiguous without an AM/PM designator.
    UpdateTime = DateTime.Now.ToString("yyyy-MM-dd HH:mm:ss"),
    Data = tempArr
};
注意:如果目标有IP限制、Cookie、签名等,则需编写相应的对策 。
此外,还可以通过 WinForms 的 WebBrowser 控件,在 DOM 加载完成后再匹配信息,实现爬取效果。
这里只做抛砖引玉,后续是否入狱自行探索,请勿使用此技术做违法的事情!!!💀
在使用爬虫技术时,必须遵守相关法律法规,尊重网站经营者的权益,避免对网站或系统造成干扰或破坏。同时,对于涉及公民个人信息的爬虫行为,应格外谨慎,确保不侵犯他人的隐私权。
END
-
相关阅读:
Angular RouterModule.forRoot(ROUTES) 和 forChild(ROUTES)的区别
微信小程序_20,使用第三方npm包
HTML+CSS网页设计期末课程大作业 【茶叶文化网站设计题材】web前端开发技术 web课程设计 网页规划与设计
AVL的代码剖析(c++)
nginx子域名配置单独的日志文件
MySQL面试题全解析:准备面试所需的关键知识点和实战经验
水利水电工程资质怎么办理,水利水电工程施工总承包三级资质办理条件有哪些
论文速览 | arxiv 2023, 马氏距离感知训练在分布外检测中的应用
NoSQL之Redis配置与优化
多姿多彩的编程世界之配色方案
-
原文地址:https://blog.csdn.net/lmnotlm/article/details/137811018