• 京东店铺公司名爬虫


    内容仅供学习参考,如有侵权联系删除

    先通过京东非自营的店铺名拿到的公司名,再通过公司名称去其他平台拿到联系方式(代码省略)

    
    from aioscrapy.spiders import Spider
    from aioscrapy.http import Request, FormRequest
    import ddddocr
    import re
    import random
    
    from loguru import logger
    
    
    class JingDongSpider(Spider):
        """Crawl JD (jd.com) third-party shop pages.

        Flow: shop home page -> captcha image -> licence page (company name,
        shop name, licence image) -> product listing. Yields dict items for
        downstream pipelines (e.g. Kafka push).
        """
        name = 'products:jd'

        custom_settings = {
            'CONCURRENT_REQUESTS': 4,
            # 'DOWNLOAD_DELAY': 0.5,
            'DOWNLOAD_TIMEOUT': 10,
            'RETRY_TIMES': 5,
            'HTTPERROR_ALLOWED_CODES': [503],
            'COOKIES_ENABLED': False,
            'DUPEFILTER_CLASS': 'aioscrapy.dupefilters.redis.RFPDupeFilter',  # redis-backed request dedup
            # 'LOG_LEVEL': 'DEBUG'
        }

        # OCR engine used to solve the licence-page image captcha.
        # NOTE(review): shared class-level instance — assumes the ddddocr
        # classifier is safe to call concurrently; confirm under load.
        ocr = ddddocr.DdddOcr(show_ad=False, use_gpu=True)

        async def start_requests(self):
            """Seed the crawl with a shop home page.

            FIX: ``parse`` unconditionally reads ``response.meta['shop_id']``,
            but the meta dict was commented out here, so every response raised
            KeyError. The shop id is now defined once and passed both in the
            URL and in meta.
            """
            shop_id = "11111111"
            yield Request(
                url=f"https://mall.jd.com/index-{shop_id}.html?from=pc",
                method='GET',
                dont_filter=False,
                meta={"shop_id": shop_id},
                priority=500)

        async def parse(self, response):
            """Shop home page: bail out on dead pages, then request the captcha."""
            title = response.xpath('//title/text()').get() or ''
            shop_id = str(response.meta['shop_id'])
            # "页面不存在" in the title or a very short body means the shop is gone.
            if '您所访问的页面不存在' in str(title) or len(response.text) < 25000:
                logger.info(f"{shop_id}")
                return

            logger.info(title.strip())
            product_list = self.get_product_items(response)
            # Links to the shop's full product-search pages (protocol-relative).
            urls = re.findall(r"//\w+\.jd\.com/view_search-\d+-\d+-\d+-\d+-\d+-\d+\.html", response.text)

            # The licence page is captcha-gated; fetch the captcha image first.
            yield Request(
                url=f"https://mall.jd.com/sys/vc/createVerifyCode.html?random={random.random()}",
                method='GET',
                callback=self.parse_img_code,
                dont_filter=True,
                meta={
                    "data": {"product_url": 'https:' + urls[0] if urls else '',
                             "categorys": self.get_category(response),
                             "product_list": product_list,
                             "shop_id": shop_id}
                },
                priority=500)

        async def parse_img_code(self, response):
            """Solve the captcha image and POST it to unlock the licence page."""
            code = self.ocr.classification(response.body)
            # The captcha cookie must accompany the verification POST.
            cookie = dict(response.cookies.items())
            shop_id = response.meta["data"]["shop_id"]
            if not code or not cookie:
                return

            yield FormRequest(
                url=f'https://mall.jd.com/showLicence-{shop_id}.html',
                method='POST',
                formdata={"verifyCode": str(code)},
                cookies=cookie,
                meta={"data": response.meta["data"]},
                callback=self.parse_shop_detail,
                dont_filter=True,
                priority=400)

        async def parse_shop_detail(self, response):
            """Parse the licence page: company name, shop name, licence image.

            Yields the shop item, and either follows the product-search URL or
            emits the products already scraped from the home page.
            """
            company = response.xpath(
                '//*[contains(.,"企业名称:")]/following-sibling::span[position()=1]/text()').get() or ''
            shop_name = response.xpath(
                '//*[contains(.,"店铺名称:")]/following-sibling::span[position()=1]//text()').get() or ''
            shop_url = response.xpath('//*[contains(.,"店铺网址:")]/following-sibling::span[position()=1]//text()').get()
            # legal_person = response.xpath( '//*[contains(.,"法定代表人姓名:")]/following-sibling::span[position()=1]//text()').get()
            # business_scope = response.xpath( '//*[contains(.,"营业执照经营范围:")]/following-sibling::span[position()=1]//text()').get()
            # Renamed from `license` — that name shadows the builtin.
            license_url = response.xpath('//img[@class="qualification-img"]/@src').get() or ''
            # Skip shops with no company name or obvious test accounts ("测试").
            if not company or '测试' in shop_name or '测试' in company:
                if not company:
                    logger.info(f"无公司: {response.url}")
                else:
                    logger.info(f" {shop_name} => {company}")
                return
            else:
                logger.info(company)

            data = response.meta['data']
            data['company'] = company
            data['shop_name'] = shop_name

            items = dict(company=company,
                         shop_name=shop_name,
                         shop_url='https:' + shop_url if shop_url else response.url,
                         product_url=data['product_url'],
                         shop_id=data['shop_id'],
                         push_kafka_status=0,
                         license='https:' + license_url if license_url else '',
                         )

            if len(data['product_list']) < 1:
                # No products on the home page: follow the search page if we
                # have one, otherwise emit the shop item without a product URL.
                if data['product_url']:
                    yield Request(
                        url=data['product_url'],
                        method='GET',
                        meta={"data": data},
                        callback=self.parse_product,
                        dont_filter=True,
                        priority=300)
                else:
                    logger.warning(f"获取不到产品链接:{response.url}")
                    items.pop('product_url')
                yield items

            else:
                # Home page already yielded products — attach the company name
                # and push them immediately.
                product_list = []
                for item in data['product_list']:
                    item['entityId'] = company
                    product_list.append(item)

                yield dict(
                    source='jd.com',
                    ocid='',
                    entityId=company,
                    product=product_list,
                )
                items['push_kafka_status'] = 1
                yield items

        async def parse_product(self, response):
            """Parse the product-search page reached from the licence page."""
            data = response.meta['data']
            shop_name = data['shop_name']
            company = data['company']
            categorys = data['categorys']

            product_list = self.get_product_items(response, shop_name, company, categorys, data['product_url'])

            if product_list:
                yield dict(
                    source='jd.com',
                    ocid='',
                    entityId=company,
                    product=product_list,
                )
                logger.info(f"成功: {company} => {data['shop_id']}")

                yield dict(
                    company=company,
                    shop_id=data['shop_id'],
                    push_kafka_status=1,
                )
            else:
                logger.error(f"{response.url} => {data['shop_id']}")

        def get_product_items(self, response, shop_name='', company='', categorys='', shop_url='') -> list:
            """Extract up to 10 product dicts (name, image, shop info) from a page."""
            ul = response.xpath('//li[@class="jSubObject"] | //li[@class="jSubObject gl-item"] | //div[@class="jItem"]')

            product_list = []
            for li in ul[:10]:
                title = li.xpath('.//div[@class="jDesc"]/a/@title').get() or ''
                # price = li.xpath('.//span[@class="jdNum"]/text()').get()
                # Strip the thumbnail size suffix to get the full-size image.
                img = str(li.xpath('.//div[@class="jPic"]//img/@src').get() or '').replace('s350x350', '')
                if not title and not img:
                    continue
                if img:
                    # Normalise CDN quality path /n2../n9/ down to /n1/ (best).
                    img = re.sub(r"/n[23456789]/", "/n1/", img)
                    img = 'https:' + img

                item_i = {}
                item_i["entityId"] = company
                item_i["productPic"] = img  # 's350x350' already stripped above
                item_i["productName"] = title  # product name
                item_i["productCategory"] = ""  # product category
                item_i["productKeyword"] = ""  # product keywords
                item_i["productPrice"] = ""  # product price
                item_i["mainProducts"] = categorys  # main products (shop menu)
                item_i["listingPlatform"] = "京东"
                item_i["productShopName"] = shop_name  # owning shop name
                item_i["dataLink"] = shop_url or response.url  # shop link
                product_list.append(item_i)

            return product_list

        @staticmethod
        def get_category(response) -> str:
            """Join the shop's menu categories, skipping navigation noise."""
            # Menu entries that are navigation/help links, not real categories.
            skip_words = ('首页', '全部', '所有', '问题', '指导', '售后', '撰文')
            categorys = response.xpath(
                '//ul[@class="menu-list"]/li[@class="menu"]/a/text() | //div[@class="abs"]//div[@class="ins abs hdur_2"]/a/text()').getall()
            return ','.join(i for i in categorys if not any(w in i for w in skip_words))
    
    
    # Entry point: run the spider standalone via aioscrapy's Spider.start().
    if __name__ == '__main__':
        JingDongSpider.start()
    

    最后的数据

    (截图:最终抓取的数据示例,原图未能加载)

    本内容仅限用于学习参考,不得用于商业目的。如有版权问题,请联系我们删除,谢谢!
    欢迎一起学习讨论Q540513871

  • 相关阅读:
    背包问题总结——剑指offer二专项101-104
    uniapp 富文本以及移动端富文本的展示问题
    【OpenCV 例程300篇】238. OpenCV 中的 Harris 角点检测
    【多线程案例】定时器
    vulfocus靶场名称: apache-cve_2021_41773/apache-cve_2021_42013
    Decimal.ToString()堆栈溢出异常
    华为云云耀云服务器L实例评测|centos7.9 配置python虚拟环境 运行django
    python实现udp通信代码
    武林新秀(一)`git init` 初始化一个新的Git仓库
    AI赋能写作:AI大模型高效写作一本通
  • 原文地址:https://blog.csdn.net/qq_40279560/article/details/133686914