• python获取抖音号发布数据


    音视频数据获取流程如下
    在这里插入图片描述

    每个抖音账号都有自己的主页地址,且这个地址不用登录。
    人民日报的抖音账号主页地址如下
    https://www.douyin.com/user/MS4wLjABAAAA8U_l6rBzmy7bcy6xOJel4v0RzoR_wfAubGPeJimN__4
    但是视频列表接口在加载时,URL 后面会带上随机且唯一的 key 参数,所以在浏览器 F12 里看到的地址无法直接重放,用 postApi 之类的接口工具直接请求也行不通。
    因此只能使用 selenium 通过 chromedriver 打开真实浏览器访问主页,再从浏览器的 performance 日志中找到视频列表请求并读取它的响应内容。

    import json
    import logging
    import time
    from datetime import datetime

    import pymysql
    from selenium import webdriver
    from selenium.common.exceptions import WebDriverException
    from selenium.webdriver import ActionChains
    from selenium.webdriver.chrome.options import Options
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.ui import WebDriverWait
    
    # File handler: every record is appended to dy.log (UTF-8 so that the
    # Chinese titles and messages are written correctly).
    log_file_handler = logging.FileHandler(filename='dy.log', encoding='utf-8')

    # Record layout: timestamp - logger name - level - message.
    log_file_handler.setFormatter(
        logging.Formatter(
            fmt='%(asctime)s - %(name)s - %(levelname)s - %(message)s'))

    # Script-wide logger; DEBUG is the lowest standard level, so nothing the
    # script logs is filtered out before reaching the file handler.
    logger = logging.getLogger(name='my_logger')
    logger.setLevel(logging.DEBUG)
    logger.addHandler(log_file_handler)
    
    # Browser configuration for the scraping session.
    browser_options = Options()

    # Headless mode: run Chrome without opening a visible window.
    for chrome_flag in ("--headless",):
        browser_options.add_argument(chrome_flag)

    # Turn on the "performance" log so that network events can later be read
    # back through driver.get_log("performance").
    performance_log_prefs = {"performance": "ALL"}
    browser_options.set_capability("goog:loggingPrefs", performance_log_prefs)

    # Start Chrome with the options above.
    driver = webdriver.Chrome(options=browser_options)
    
    # !!! The database connection settings below must be adjusted for the
    # environment the script runs in.
    db_settings = {
        'host': 'localhost',
        'user': 'root',
        'password': '123456',
        'database': 'ry',
        'charset': 'utf8mb4',
    }
    mydatabase = pymysql.connect(**db_settings)
    cursor = mydatabase.cursor()

    # Select the accounts to scrape: id, display name and Douyin profile URL
    # (at most 3 rows, as limited by the query itself).
    account_query = (
        'SELECT id,base_media_name,dy_url FROM `media_account_manager2` where `type_id` = 483 AND `status` = 3 and dy_url is not null LIMIT 3'
    )
    cursor.execute(account_query)

    # Each row is (id, base_media_name, dy_url).
    result = cursor.fetchall()
    # Scrape every selected account: open its profile page, find the
    # video-list responses in Chrome's performance log, and log the videos
    # that were published today (the database insert is currently disabled).
    #
    # The two names below are reported by the error handler, so they are
    # bound up front; otherwise a failure before their first assignment would
    # raise NameError inside the handler and hide the real error.
    dy_name = None
    json_data = None
    try:
        for row in result:
            # Row layout comes from the SELECT: (id, base_media_name, dy_url).
            # m_id is only needed by the disabled insert further down.
            m_id, dy_name, dy_url = row
            # Forget the previous account's payload so an error report never
            # shows data that belongs to a different account.
            json_data = None

            print('***************************************')
            driver.get(dy_url)  # open the Douyin user's profile page
            # Fixed wait to give the page time to issue its video-list request.
            time.sleep(15)

            # Computed once per account instead of once per video.
            now_formatted_date = datetime.now().strftime("%Y-%m-%d")

            for item in driver.get_log("performance"):
                message = json.loads(item['message'])['message']
                if message['method'] != 'Network.responseReceived':
                    continue
                params = message['params']
                # Loose match on the response URL; unrelated URLs that also
                # contain "post" are filtered out by the checks below.
                if 'post' not in params['response']['url']:
                    continue

                request_id = params['requestId']
                print(request_id)
                try:
                    response = driver.execute_cdp_cmd(
                        'Network.getResponseBody', {'requestId': request_id})
                except WebDriverException as cdp_error:
                    # Chrome cannot return a body for every logged request
                    # (e.g. the resource is no longer held); skip that request
                    # instead of aborting the whole run.
                    logger.debug('skip request %s: response body unavailable (%s)',
                                 request_id, cdp_error)
                    continue

                body = response.get('body')
                if not body:
                    continue
                try:
                    payload = json.loads(body)
                except json.JSONDecodeError:
                    # Not JSON, so it cannot be the video-list response.
                    logger.debug('skip request %s: response body is not JSON',
                                 request_id)
                    continue
                if not isinstance(payload, dict):
                    continue
                json_data = payload

                # A missing or null 'aweme_list' is treated as "no videos".
                for jsonObj in json_data.get('aweme_list') or []:
                    # create_time is passed to fromtimestamp(), i.e. it is
                    # treated as epoch seconds and converted to local time,
                    # the same clock datetime.now() uses above.
                    dt_object = datetime.fromtimestamp(jsonObj['create_time'])
                    formatted_date = dt_object.strftime('%Y-%m-%d')
                    title = jsonObj['desc']
                    link = jsonObj['share_info']['share_url']

                    print('***************************************')
                    print(title)
                    print(formatted_date)

                    # Only videos published today are of interest.
                    if formatted_date != now_formatted_date:
                        continue

                    print('数据库插入操作**********************************')
                    print(title)
                    logger.info(title)  # title
                    logger.info(formatted_date)  # publication date
                    logger.info(link)  # video address (share_url)
                    # Disabled database insert, kept for reference:
                    # create_time_str = dt_object.strftime("%Y-%m-%d %H:%M:%S")
                    # text = ''
                    # insert_query = "INSERT INTO `ry`.`media_content`(`title`, `pub_date`, `url`, `content`, `media_id`, `media_name`,`type_id`,`platform`) VALUES (%s,%s,%s,%s,%s,%s,%s,%s)"
                    # cursor.execute(
                    #     insert_query,
                    #     (title, create_time_str, link, text, m_id, dy_name, '483', '抖音自动抓取'))
                    # mydatabase.commit()
    except Exception:
        # Top-level boundary of the script: record the full traceback, then
        # whatever context is available about the account being processed.
        logger.exception('******抖音获取发生错误********')
        if json_data is not None:
            logger.info('%s', json_data)
        # Lazy %-formatting also copes with dy_name being None.
        logger.info('******抖音账号:%s,数据获取异常******', dy_name)
    else:
        logger.info('******抖音数据结束********')
    finally:
        # Always shut Chrome down and close the DB connection, even if one of
        # the two clean-up calls itself fails.
        try:
            driver.quit()
        finally:
            mydatabase.close()
    
  • 相关阅读:
    【随笔】VRRP+MSTP
    Red Teaming Exercises
    【LeetCode】5. Valid Palindrome·有效回文
    解决 PLC QModbusTcpClient 通信自动断开
    代码随想录算法训练营第五十天 | 123.买卖股票的最佳时机III & 188. 买卖股票的最佳时机 IV
    babili-webpack-plugin编译后的代码v+‘‘===1,会被压缩成字符串v1
    C++ Reference: Standard C++ Library reference: C Library: ctime: time
    98.验证二叉搜索树
    面试突击68:为什么 TCP 需要 3 次握手?
    Linux 基础入门
  • 原文地址:https://blog.csdn.net/woshiabc111/article/details/139978033