• 京东店铺公司名爬虫


    内容仅供学习参考,如有侵权联系删除

    先通过京东非自营的店铺名拿到的公司名,再通过公司名称去其他平台拿到联系方式(代码省略)

    
    from aioscrapy.spiders import Spider
    from aioscrapy.http import Request, FormRequest
    import ddddocr
    import re
    import random
    
    from loguru import logger
    
    
    class JingDongSpider(Spider):
        """Crawl JD (jd.com) third-party shop pages.

        Flow: shop home page -> captcha image -> licence page (company name,
        shop name, licence image) -> product listing. Yields dict items for
        downstream pipelines (e.g. Kafka push).
        """
        name = 'products:jd'

        custom_settings = {
            'CONCURRENT_REQUESTS': 4,
            # 'DOWNLOAD_DELAY': 0.5,
            'DOWNLOAD_TIMEOUT': 10,
            'RETRY_TIMES': 5,
            'HTTPERROR_ALLOWED_CODES': [503],
            'COOKIES_ENABLED': False,
            'DUPEFILTER_CLASS': 'aioscrapy.dupefilters.redis.RFPDupeFilter',  # redis-backed request dedup
            # 'LOG_LEVEL': 'DEBUG'
        }

        # OCR engine used to solve the licence-page image captcha.
        # NOTE(review): shared class-level instance — assumes the ddddocr
        # classifier is safe to call concurrently; confirm under load.
        ocr = ddddocr.DdddOcr(show_ad=False, use_gpu=True)

        async def start_requests(self):
            """Seed the crawl with a shop home page.

            FIX: ``parse`` unconditionally reads ``response.meta['shop_id']``,
            but the meta dict was commented out here, so every response raised
            KeyError. The shop id is now defined once and passed both in the
            URL and in meta.
            """
            shop_id = "11111111"
            yield Request(
                url=f"https://mall.jd.com/index-{shop_id}.html?from=pc",
                method='GET',
                dont_filter=False,
                meta={"shop_id": shop_id},
                priority=500)

        async def parse(self, response):
            """Shop home page: bail out on dead pages, then request the captcha."""
            title = response.xpath('//title/text()').get() or ''
            shop_id = str(response.meta['shop_id'])
            # "页面不存在" in the title or a very short body means the shop is gone.
            if '您所访问的页面不存在' in str(title) or len(response.text) < 25000:
                logger.info(f"{shop_id}")
                return

            logger.info(title.strip())
            product_list = self.get_product_items(response)
            # Links to the shop's full product-search pages (protocol-relative).
            urls = re.findall(r"//\w+\.jd\.com/view_search-\d+-\d+-\d+-\d+-\d+-\d+\.html", response.text)

            # The licence page is captcha-gated; fetch the captcha image first.
            yield Request(
                url=f"https://mall.jd.com/sys/vc/createVerifyCode.html?random={random.random()}",
                method='GET',
                callback=self.parse_img_code,
                dont_filter=True,
                meta={
                    "data": {"product_url": 'https:' + urls[0] if urls else '',
                             "categorys": self.get_category(response),
                             "product_list": product_list,
                             "shop_id": shop_id}
                },
                priority=500)

        async def parse_img_code(self, response):
            """Solve the captcha image and POST it to unlock the licence page."""
            code = self.ocr.classification(response.body)
            # The captcha cookie must accompany the verification POST.
            cookie = dict(response.cookies.items())
            shop_id = response.meta["data"]["shop_id"]
            if not code or not cookie:
                return

            yield FormRequest(
                url=f'https://mall.jd.com/showLicence-{shop_id}.html',
                method='POST',
                formdata={"verifyCode": str(code)},
                cookies=cookie,
                meta={"data": response.meta["data"]},
                callback=self.parse_shop_detail,
                dont_filter=True,
                priority=400)

        async def parse_shop_detail(self, response):
            """Parse the licence page: company name, shop name, licence image.

            Yields the shop item, and either follows the product-search URL or
            emits the products already scraped from the home page.
            """
            company = response.xpath(
                '//*[contains(.,"企业名称:")]/following-sibling::span[position()=1]/text()').get() or ''
            shop_name = response.xpath(
                '//*[contains(.,"店铺名称:")]/following-sibling::span[position()=1]//text()').get() or ''
            shop_url = response.xpath('//*[contains(.,"店铺网址:")]/following-sibling::span[position()=1]//text()').get()
            # legal_person = response.xpath( '//*[contains(.,"法定代表人姓名:")]/following-sibling::span[position()=1]//text()').get()
            # business_scope = response.xpath( '//*[contains(.,"营业执照经营范围:")]/following-sibling::span[position()=1]//text()').get()
            # Renamed from `license` — that name shadows the builtin.
            license_url = response.xpath('//img[@class="qualification-img"]/@src').get() or ''
            # Skip shops with no company name or obvious test accounts ("测试").
            if not company or '测试' in shop_name or '测试' in company:
                if not company:
                    logger.info(f"无公司: {response.url}")
                else:
                    logger.info(f" {shop_name} => {company}")
                return
            else:
                logger.info(company)

            data = response.meta['data']
            data['company'] = company
            data['shop_name'] = shop_name

            items = dict(company=company,
                         shop_name=shop_name,
                         shop_url='https:' + shop_url if shop_url else response.url,
                         product_url=data['product_url'],
                         shop_id=data['shop_id'],
                         push_kafka_status=0,
                         license='https:' + license_url if license_url else '',
                         )

            if len(data['product_list']) < 1:
                # No products on the home page: follow the search page if we
                # have one, otherwise emit the shop item without a product URL.
                if data['product_url']:
                    yield Request(
                        url=data['product_url'],
                        method='GET',
                        meta={"data": data},
                        callback=self.parse_product,
                        dont_filter=True,
                        priority=300)
                else:
                    logger.warning(f"获取不到产品链接:{response.url}")
                    items.pop('product_url')
                yield items

            else:
                # Home page already yielded products — attach the company name
                # and push them immediately.
                product_list = []
                for item in data['product_list']:
                    item['entityId'] = company
                    product_list.append(item)

                yield dict(
                    source='jd.com',
                    ocid='',
                    entityId=company,
                    product=product_list,
                )
                items['push_kafka_status'] = 1
                yield items

        async def parse_product(self, response):
            """Parse the product-search page reached from the licence page."""
            data = response.meta['data']
            shop_name = data['shop_name']
            company = data['company']
            categorys = data['categorys']

            product_list = self.get_product_items(response, shop_name, company, categorys, data['product_url'])

            if product_list:
                yield dict(
                    source='jd.com',
                    ocid='',
                    entityId=company,
                    product=product_list,
                )
                logger.info(f"成功: {company} => {data['shop_id']}")

                yield dict(
                    company=company,
                    shop_id=data['shop_id'],
                    push_kafka_status=1,
                )
            else:
                logger.error(f"{response.url} => {data['shop_id']}")

        def get_product_items(self, response, shop_name='', company='', categorys='', shop_url='') -> list:
            """Extract up to 10 product dicts (name, image, shop info) from a page."""
            ul = response.xpath('//li[@class="jSubObject"] | //li[@class="jSubObject gl-item"] | //div[@class="jItem"]')

            product_list = []
            for li in ul[:10]:
                title = li.xpath('.//div[@class="jDesc"]/a/@title').get() or ''
                # price = li.xpath('.//span[@class="jdNum"]/text()').get()
                # Strip the thumbnail size suffix to get the full-size image.
                img = str(li.xpath('.//div[@class="jPic"]//img/@src').get() or '').replace('s350x350', '')
                if not title and not img:
                    continue
                if img:
                    # Normalise CDN quality path /n2../n9/ down to /n1/ (best).
                    img = re.sub(r"/n[23456789]/", "/n1/", img)
                    img = 'https:' + img

                item_i = {}
                item_i["entityId"] = company
                item_i["productPic"] = img  # 's350x350' already stripped above
                item_i["productName"] = title  # product name
                item_i["productCategory"] = ""  # product category
                item_i["productKeyword"] = ""  # product keywords
                item_i["productPrice"] = ""  # product price
                item_i["mainProducts"] = categorys  # main products (shop menu)
                item_i["listingPlatform"] = "京东"
                item_i["productShopName"] = shop_name  # owning shop name
                item_i["dataLink"] = shop_url or response.url  # shop link
                product_list.append(item_i)

            return product_list

        @staticmethod
        def get_category(response) -> str:
            """Join the shop's menu categories, skipping navigation noise."""
            # Menu entries that are navigation/help links, not real categories.
            skip_words = ('首页', '全部', '所有', '问题', '指导', '售后', '撰文')
            categorys = response.xpath(
                '//ul[@class="menu-list"]/li[@class="menu"]/a/text() | //div[@class="abs"]//div[@class="ins abs hdur_2"]/a/text()').getall()
            return ','.join(i for i in categorys if not any(w in i for w in skip_words))
    
    
    # Entry point: run the spider standalone via aioscrapy's Spider.start().
    if __name__ == '__main__':
        JingDongSpider.start()
    

    最后的数据

    (截图:最终抓取的数据示例,原图未能加载)

    本内容仅限用于学习参考,不得用于商业目的。如有版权问题,请联系我们删除,谢谢!
    欢迎一起学习讨论Q540513871

  • 相关阅读:
    背包问题总结——剑指offer二专项101-104
    uniapp 富文本以及移动端富文本的展示问题
    【OpenCV 例程300篇】238. OpenCV 中的 Harris 角点检测
    【多线程案例】定时器
    vulfocus靶场名称: apache-cve_2021_41773/apache-cve_2021_42013
    Decimal.ToString()堆栈溢出异常
    华为云云耀云服务器L实例评测|centos7.9 配置python虚拟环境 运行django
    python实现udp通信代码
    武林新秀(一)`git init` 初始化一个新的Git仓库
    AI赋能写作:AI大模型高效写作一本通
  • 原文地址:https://blog.csdn.net/qq_40279560/article/details/133686914