client.py
- import os
- import base64
- import numpy as np
- import pandas as pd
- import cv2
- import boto3
- # boto3安装: pip3 install opencv-python boto3
-
# Keyword arguments passed verbatim to ``boto3.client('s3', **config)``.
# NOTE(security): credentials should not be committed to source control.
# Every value can be overridden through the environment; the literals below
# are only fallbacks that keep the previous behaviour when nothing is set.
config = {
    "region_name": os.environ.get("S3_REGION_NAME", "us-east-1"),
    "endpoint_url": os.environ.get(
        "S3_ENDPOINT_URL", "https://s3.rapidcompute.com"
    ),
    # "image_bucket": "prod-barwaqt-image",
    "aws_access_key_id": os.environ.get("S3_ACCESS_KEY_ID", "rcus_bd-prod"),
    "aws_secret_access_key": os.environ.get(
        "S3_SECRET_ACCESS_KEY", "OgRKm6h...2HdbKA6s"
    ),
}
class Client:
    """Small wrapper around a boto3 S3 client for fetching images.

    The read methods never raise for download/decode problems; they return a
    ``(status, payload)`` tuple where ``status`` is ``'OK'`` or ``'ERROR'``.
    On error the payload is an error-code string.
    """

    def __init__(self):
        # Uses the module-level ``config`` dict as boto3 keyword arguments.
        self.config = config
        self.client = boto3.client('s3', **self.config)

    def read_image(self, bucket_name, image_key):
        """Download an object and decode it with OpenCV.

        Args:
            bucket_name: Name of the S3 bucket.
            image_key: Key of the image object inside the bucket.

        Returns:
            ``('OK', image)`` with the decoded image array on success,
            ``('ERROR', 'READ_IMAGE_ERROR')`` when the download or the
            decoding fails.
        """
        try:
            response = self.client.get_object(Bucket=bucket_name, Key=image_key)
            raw = np.frombuffer(response['Body'].read(), np.uint8)
            image = cv2.imdecode(raw, cv2.IMREAD_COLOR)
        except Exception:
            # Deliberate catch-all: callers branch on the status string and
            # do not expect exceptions from this method.
            return 'ERROR', 'READ_IMAGE_ERROR'
        if image is None:
            # cv2.imdecode reports undecodable data by returning None instead
            # of raising, so it must not be passed on as an 'OK' result.
            return 'ERROR', 'READ_IMAGE_ERROR'
        return 'OK', image

    def read_image_b64(self, bucket_name, image_key):
        """Download an image and return it re-encoded as base64 JPEG text.

        Args:
            bucket_name: Name of the S3 bucket.
            image_key: Key of the image object inside the bucket.

        Returns:
            ``('OK', b64_string)`` on success; the error tuple from
            ``read_image`` when reading fails; or
            ``('ERROR', 'ENCODE_IMAGE_ERROR')`` when JPEG encoding fails.
        """
        status, image = self.read_image(bucket_name, image_key)
        if status != 'OK':
            return status, image
        encoded_ok, buffer = cv2.imencode('.jpg', image)
        if not encoded_ok:
            return 'ERROR', 'ENCODE_IMAGE_ERROR'
        return status, base64.b64encode(buffer).decode()
def save_img(img_path, file_name='test.jpg'):
    """Download one image from the ``prod-barwaqt-image`` bucket into ``mydir``.

    The local file keeps the last path component of ``img_path`` as its name.
    Nothing is written when the download fails.

    Args:
        img_path: Object key of the image inside the bucket.
        file_name: Unused; kept only so existing callers keep working. The
            output name is always derived from ``img_path``.

    Note:
        Relies on a module-level ``client`` (a ``Client`` instance) that must
        exist before this function is called.
    """
    status, img = client.read_image('prod-barwaqt-image', img_path)
    if status != 'OK':
        return
    # cv2.imwrite does not create missing directories.
    os.makedirs('mydir', exist_ok=True)
    file_name = os.path.join('mydir', img_path.split('/')[-1])
    cv2.imwrite(file_name, img)
    # cv2.imwrite(file_name, img, [cv2.IMWRITE_PNG_COMPRESSION, 8])  # light compression
# Example 1: fetch a single image.
# `client` has to exist at module level because save_img looks it up there.
img_path = 'prod/18/be56/18be564c36b05d730257dbbe87ede614.jpg'
client = Client()
save_img(img_path)
# Example 2: fetch every image listed in the `s3_path` column of the CSV.
client = Client()
df = pd.read_csv('img_path.csv')
for s3_path in df['s3_path']:
    save_img(s3_path)
img_path.csv 如下:
| user_account_id | s3_path |
| --- | --- |
| 210805010001565250 | prod/12/e122/12e122b5328e1b5007b3de5c76e0bf02.jpg |
| 210812010008799851 | prod/26/92b7/2692b7c55bb71581586a6392926c0a24.jpg |
from pandarallel import pandarallel

# Example 3: same batch download, spread over 10 worker processes.
pandarallel.initialize(nb_workers=10, use_memory_fs=False, progress_bar=True)
# `client` must be defined here at module level (not inside save_img):
# save_img reads it as a global.
client = Client()
# The CSV column holding the object keys is `s3_path`; `img_path` does not
# exist in the DataFrame and raised KeyError.
df['s3_path'].parallel_apply(save_img)
def img_batch_zip(input_dir, output_dir, jpeg_quality=95):
    """Resize every readable image in ``input_dir`` and save it as JPEG.

    Each image is passed through ``img_pad`` and written to ``output_dir``
    under its original base name with a ``.jpg`` extension. Entries that
    OpenCV cannot read are skipped; a failure on one file is reported and
    does not stop the batch.

    Args:
        input_dir: Directory containing the source images (with or without a
            trailing path separator).
        output_dir: Directory for the converted images; created if missing.
        jpeg_quality: JPEG quality (0-100) passed to ``cv2.imwrite``. The
            default of 95 is OpenCV's own default, which is what the previous
            code effectively used because the PNG compression flag it passed
            has no effect on ``.jpg`` output.
    """
    os.makedirs(output_dir, exist_ok=True)
    for item in os.listdir(input_dir):
        source_path = os.path.join(input_dir, item)
        img = cv2.imread(source_path)
        if img is None:
            # cv2.imread returns None for anything it cannot decode
            # (sub-directories, non-image files, unreadable files).
            continue
        # splitext keeps names such as "a.b.png" distinct ("a.b.jpg").
        target_file_name = os.path.join(
            output_dir, os.path.splitext(item)[0] + ".jpg"
        )
        try:
            output_image = img_pad(img)
            cv2.imwrite(
                target_file_name,
                output_image,
                [cv2.IMWRITE_JPEG_QUALITY, jpeg_quality],
            )
        except Exception as exc:
            # Best-effort batch: report the file and carry on.
            print('skipped', source_path, exc)
def _even_scaled_side(length, factor):
    """Scale ``length`` down by ``factor`` and round down to an even size >= 2."""
    scaled = int(length / factor)
    if scaled % 2 != 0:
        scaled -= 1
    # Guard against extreme aspect ratios collapsing the side to 0 pixels.
    return max(scaled, 2)


def img_pad(pil_file, fixed_size=1600, pad=False):
    """Scale an image so that its longer side is ``fixed_size`` pixels.

    The aspect ratio is kept; the shorter side is rounded down to an even
    number of pixels.

    Args:
        pil_file: Image as a NumPy array shaped ``(h, w)`` or ``(h, w, c)``
            (despite the parameter name it is an array, not a PIL object).
        fixed_size: Target length of the longer side. Defaults to 1600.
        pad: When true, zero-pad the shorter side on both ends so the result
            is ``fixed_size`` x ``fixed_size``. Defaults to False, which
            matches the previous behaviour (resize only, no padding).

    Returns:
        The resized (and, if requested, padded) image array.
    """
    # shape is (height, width, ...); swapping h and w would distort the image.
    h, w = pil_file.shape[:2]

    if h >= w:
        new_h = fixed_size
        new_w = _even_scaled_side(w, h / float(fixed_size))
    else:
        new_w = fixed_size
        new_h = _even_scaled_side(h, w / float(fixed_size))

    # cv2.resize takes the target size as (width, height).
    resized = cv2.resize(pil_file, (new_w, new_h))
    if not pad:
        return resized

    top = (fixed_size - new_h) // 2
    left = (fixed_size - new_w) // 2
    pad_widths = [
        (top, fixed_size - new_h - top),
        (left, fixed_size - new_w - left),
    ]
    # Leave any channel axis untouched.
    pad_widths += [(0, 0)] * (resized.ndim - 2)
    return np.pad(resized, pad_widths, mode='constant')
读取 test_data.csv 中的数据,根据每行的图片地址字段找到已下载到本地(./mydir)的对应图片,并将图片插入到该行末尾。
输入:test_data.csv

输出:res.xlsx

- # -*- coding: utf-8 -*-
- import os
- import pandas as pd
- import cv2
- import xlsxwriter
- import tqdm
-
# Value used both as the default row height and to derive the column width.
CEIL_HEIGHT = 256

# Create the output workbook and add a single sheet to it.
BOOK = xlsxwriter.Workbook('res.xlsx')
SHEET = BOOK.add_worksheet('sheet1')

# Every row gets the same default height; columns 0-18 share one width.
SHEET.set_default_row(CEIL_HEIGHT)
SHEET.set_column(0, 18, CEIL_HEIGHT / 18)
1. 在一个单元格插入一张图片
def inset_a_img(img_name, target_col, row=None):  # target_col: column to insert at
    """Insert one image from ``./mydir/`` into a cell of ``SHEET``.

    The image is scaled by ``CEIL_HEIGHT * 1.3 / image_height`` on both axes.
    If the file cannot be read, a message is printed and nothing is inserted.

    Args:
        img_name: File name of the image inside ``./mydir/``.
        target_col: Zero-based worksheet column for the image.
        row: Zero-based worksheet row. When omitted, falls back to
            ``line.Index + 1`` of the module-level loop variable ``line``,
            which is how the function located the row before this parameter
            existed.
    """
    # Images are read from the local folder.
    image_path = os.path.join("./mydir/", img_name)
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread returns None for missing or undecodable files.
        print('image not readable, skipped: ', image_path)
        return
    if row is None:
        row = line.Index + 1
    h = img.shape[0]
    scale = CEIL_HEIGHT * 1.3 / h
    SHEET.insert_image(
        row,
        target_col,
        image_path,
        # x_offset shifts the image along the x axis.
        {'x_offset': 100, 'y_offset': 2, 'x_scale': scale, 'y_scale': scale,
         'positioning': 1},
    )
2. 处理一行数据
def insert_image(line):
    """Write one ``itertuples`` row into ``SHEET`` and embed its images.

    Field values go to worksheet columns 0..n-1 of row ``line.Index + 1``
    (row 0 holds the header); missing values are written as empty strings.
    The images named by ``s3_path_1`` and ``s3_path_2`` are inserted at
    columns 3 and 5 respectively; a missing path inserts no image.

    Args:
        line: Named tuple from ``DataFrame.itertuples()`` with ``Index``,
            ``s3_path_1`` and ``s3_path_2`` attributes.
    """
    print('正在操作第几行: ', line.Index)
    print("该行有多少列: ", len(line))

    row = line.Index + 1
    # line[0] is the index, so data fields start at position 1.
    for col, value in enumerate(line[1:]):
        SHEET.write(row, col, '' if pd.isna(value) else value)

    # Image insertion is independent of the cell loop; it used to be keyed on
    # loop indices that did not correspond to the s3_path columns.
    if not pd.isna(line.s3_path_1):
        img_name = line.s3_path_1.split('/')[-1]
        print(img_name)
        inset_a_img(img_name, 3)
    if not pd.isna(line.s3_path_2):
        inset_a_img(line.s3_path_2.split('/')[-1], 5)
-
# Load the input rows; every value is read as a string.
df = pd.read_csv('test_data.csv', dtype=str)
col_list = ['user_account_id', 's3_path_1', 's3_path_2']
df.columns = col_list

# Header: field names go into row 0, one per column.
for i, col_name in enumerate(col_list):
    SHEET.write(0, i, col_name)

# Each tuple has len(col_list) + 1 entries because the index comes first, e.g.
# Pandas(Index=0, user_account_id='21...346', s3_path_1='e4.jpg',
#        s3_path_2='fc.jpg')
# NOTE: keep the loop variable named `line` at module level; inset_a_img
# reads the global `line.Index` when it is not given a row.
rows = df.itertuples()
for line in tqdm.tqdm(rows):  # tqdm shows a progress bar
    insert_image(line)

BOOK.close()