2024-07-07 18:00:37 +08:00
|
|
|
|
# 对象存储文件处理类(示例:假设是 AWS S3)
|
|
|
|
|
import os
|
|
|
|
|
from typing import IO
|
2024-07-13 21:56:42 +08:00
|
|
|
|
from urllib.parse import urlparse, urljoin
|
2024-07-07 18:00:37 +08:00
|
|
|
|
|
|
|
|
|
from botocore.exceptions import ClientError
|
|
|
|
|
from smart_open import open
|
|
|
|
|
import boto3
|
|
|
|
|
from botocore.client import Config
|
|
|
|
|
|
2024-07-13 21:56:42 +08:00
|
|
|
|
from pywxdump.common.config.oss_config import storage_config
|
|
|
|
|
from pywxdump.file.Attachment import Attachment
|
|
|
|
|
from pywxdump.file.ConfigurableAttachment import ConfigurableAttachment
|
2024-07-07 18:00:37 +08:00
|
|
|
|
|
2024-07-13 21:56:42 +08:00
|
|
|
|
|
|
|
|
|
class S3Attachment(ConfigurableAttachment):
|
|
|
|
|
|
|
|
|
|
def __init__(self, s3_config: storage_config):
|
|
|
|
|
# S3 配置
|
|
|
|
|
self.s3_config = s3_config
|
|
|
|
|
# 校验配置
|
|
|
|
|
s3_config.validate_config()
|
2024-07-07 18:00:37 +08:00
|
|
|
|
|
|
|
|
|
# 创建 S3 客户端
|
|
|
|
|
self.s3_client = boto3.client(
|
|
|
|
|
's3',
|
2024-07-13 21:56:42 +08:00
|
|
|
|
endpoint_url=s3_config.endpoint_url,
|
|
|
|
|
aws_access_key_id=s3_config.access_key,
|
|
|
|
|
aws_secret_access_key=s3_config.secret_key,
|
2024-07-07 18:00:37 +08:00
|
|
|
|
config=Config(s3={"addressing_style": "virtual", "signature_version": 's3v4'})
|
|
|
|
|
)
|
|
|
|
|
|
2024-07-13 21:56:42 +08:00
|
|
|
|
@classmethod
|
|
|
|
|
def load_config(cls, config: storage_config) -> Attachment:
|
|
|
|
|
return cls(config)
|
|
|
|
|
|
|
|
|
|
def exists(self, s3_url) -> bool:
|
|
|
|
|
"""
|
|
|
|
|
检查对象是否存在
|
|
|
|
|
|
|
|
|
|
参数:
|
|
|
|
|
s3_url (str): 对象路径
|
|
|
|
|
|
|
|
|
|
返回:
|
|
|
|
|
bool: 是否存在
|
|
|
|
|
"""
|
|
|
|
|
bucket_name, path = self.dealS3Url(s3_url)
|
|
|
|
|
# 尝试列出该路径下的对象
|
|
|
|
|
try:
|
|
|
|
|
response = self.s3_client.list_objects_v2(Bucket=bucket_name, Prefix=path, MaxKeys=1)
|
|
|
|
|
if 'Contents' in response:
|
2024-07-07 18:00:37 +08:00
|
|
|
|
return True
|
2024-07-13 21:56:42 +08:00
|
|
|
|
else:
|
|
|
|
|
return False
|
|
|
|
|
except ClientError as e:
|
|
|
|
|
print(f"Error: {e}")
|
|
|
|
|
return False
|
|
|
|
|
|
|
|
|
|
def makedirs(self, s3_url) -> bool:
|
|
|
|
|
"""
|
|
|
|
|
创建目录
|
|
|
|
|
|
|
|
|
|
参数:
|
|
|
|
|
s3_url (str): 目录路径
|
|
|
|
|
|
|
|
|
|
返回:
|
|
|
|
|
bool: 是否创建成功
|
|
|
|
|
"""
|
|
|
|
|
if not self.exists(s3_url):
|
|
|
|
|
bucket_name, path = self.dealS3Url(s3_url)
|
2024-07-07 18:00:37 +08:00
|
|
|
|
self.s3_client.put_object(Bucket=bucket_name, Key=f'{path}/')
|
|
|
|
|
return True
|
|
|
|
|
|
2024-07-13 21:56:42 +08:00
|
|
|
|
def open(self, s3_url, mode) -> IO:
|
|
|
|
|
"""
|
|
|
|
|
打开文件
|
|
|
|
|
|
|
|
|
|
参数:
|
|
|
|
|
s3_url (str): 文件路径
|
|
|
|
|
mode (str): 打开模式
|
|
|
|
|
|
|
|
|
|
返回:
|
|
|
|
|
IO: 文件对象
|
|
|
|
|
"""
|
|
|
|
|
return open(uri=s3_url, mode=mode, transport_params={'client': self.s3_client})
|
|
|
|
|
|
|
|
|
|
def remove(self, s3_url: str) -> bool:
|
|
|
|
|
"""
|
|
|
|
|
删除文件
|
|
|
|
|
|
|
|
|
|
参数:
|
|
|
|
|
s3_url (str): 文件路径
|
|
|
|
|
|
|
|
|
|
返回:
|
|
|
|
|
bool: 是否删除成功
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
if not self.exists(s3_url):
|
|
|
|
|
raise FileNotFoundError(f"File not found: {s3_url}")
|
|
|
|
|
|
|
|
|
|
if self.isdir(s3_url):
|
|
|
|
|
raise ValueError(f"Path is not a file: {s3_url}")
|
|
|
|
|
|
|
|
|
|
bucket_name, path = self.dealS3Url(s3_url)
|
|
|
|
|
|
|
|
|
|
self.s3_client.delete_object(Bucket=bucket_name, Key=path)
|
|
|
|
|
return True
|
2024-07-07 18:00:37 +08:00
|
|
|
|
|
|
|
|
|
@classmethod
|
2024-07-13 21:56:42 +08:00
|
|
|
|
def join(cls, s3_url: str, *paths: str) -> str:
|
|
|
|
|
"""
|
|
|
|
|
连接路径
|
|
|
|
|
|
|
|
|
|
参数:
|
|
|
|
|
s3_url (str): 路径
|
|
|
|
|
*paths (str): 路径
|
|
|
|
|
|
|
|
|
|
返回:
|
|
|
|
|
str: 连接后的路径
|
|
|
|
|
"""
|
|
|
|
|
# 使用os.path.join连接路径
|
|
|
|
|
path = os.path.join(s3_url, *paths)
|
|
|
|
|
# 将所有反斜杠替换为正斜杠
|
|
|
|
|
return path.replace('\\', '/')
|
2024-07-07 18:00:37 +08:00
|
|
|
|
|
|
|
|
|
@classmethod
|
2024-07-13 21:56:42 +08:00
|
|
|
|
def dirname(cls, s3_url: str) -> str:
|
|
|
|
|
"""
|
|
|
|
|
返回路径的目录部分
|
|
|
|
|
|
|
|
|
|
参数:
|
|
|
|
|
s3_url (str): 路径
|
|
|
|
|
|
|
|
|
|
返回:
|
|
|
|
|
str: 路径的目录部分
|
|
|
|
|
"""
|
|
|
|
|
return os.path.dirname(s3_url)
|
2024-07-07 18:00:37 +08:00
|
|
|
|
|
|
|
|
|
@classmethod
|
2024-07-13 21:56:42 +08:00
|
|
|
|
def basename(cls, s3_url: str) -> str:
|
|
|
|
|
"""
|
|
|
|
|
返回路径的最后一个元素
|
|
|
|
|
|
|
|
|
|
参数:
|
|
|
|
|
s3_url (str): 路径
|
|
|
|
|
|
|
|
|
|
返回:
|
|
|
|
|
str: 路径的最后一个元素
|
|
|
|
|
"""
|
|
|
|
|
return os.path.basename(s3_url)
|
2024-07-07 18:00:37 +08:00
|
|
|
|
|
2024-07-13 21:56:42 +08:00
|
|
|
|
def dealS3Url(self, s3_url: str) -> object:
|
2024-07-07 18:00:37 +08:00
|
|
|
|
"""
|
|
|
|
|
解析 S3 URL 并返回存储桶名称和路径
|
|
|
|
|
|
|
|
|
|
参数:
|
2024-07-13 21:56:42 +08:00
|
|
|
|
s3_url (str): S3 URL
|
2024-07-07 18:00:37 +08:00
|
|
|
|
|
|
|
|
|
返回:
|
|
|
|
|
tuple: 包含存储桶名称和路径的元组
|
|
|
|
|
"""
|
2024-07-13 21:56:42 +08:00
|
|
|
|
parsed_url = urlparse(s3_url)
|
2024-07-07 18:00:37 +08:00
|
|
|
|
|
|
|
|
|
# 确保URL是S3 URL
|
|
|
|
|
if parsed_url.scheme != 's3':
|
|
|
|
|
raise ValueError("URL必须是S3 URL,格式为s3://bucket_name/path")
|
|
|
|
|
|
|
|
|
|
bucket_name = parsed_url.netloc
|
|
|
|
|
s3_path = parsed_url.path.lstrip('/')
|
|
|
|
|
|
|
|
|
|
return bucket_name, s3_path
|
|
|
|
|
|
2024-07-13 21:56:42 +08:00
|
|
|
|
def isdir(self, s3_url: str) -> bool:
|
|
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
判断是否为目录
|
|
|
|
|
|
|
|
|
|
参数:
|
|
|
|
|
s3_url (str): 文件路径
|
|
|
|
|
|
|
|
|
|
返回:
|
|
|
|
|
bool: 是否为目录
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
# 确保目录路径以'/'结尾
|
|
|
|
|
if not s3_url.endswith('/'):
|
|
|
|
|
s3_url += '/'
|
|
|
|
|
|
|
|
|
|
bucket_name, path = self.dealS3Url(s3_url)
|
|
|
|
|
# 列出以该 key 为前缀的对象
|
|
|
|
|
response = self.s3_client.list_objects_v2(Bucket=bucket_name, Prefix=path, MaxKeys=1)
|
|
|
|
|
|
|
|
|
|
if 'Contents' in response:
|
|
|
|
|
# 存在对象,判断是否为目录
|
|
|
|
|
if response['Contents'][0]['Key'] == path or not path.endswith('/'):
|
|
|
|
|
return False
|
|
|
|
|
else:
|
|
|
|
|
return True
|
|
|
|
|
else:
|
|
|
|
|
return False
|
|
|
|
|
|
|
|
|
|
def getsize(self, s3_url) -> int:
|
|
|
|
|
"""
|
|
|
|
|
获取文件大小
|
|
|
|
|
|
|
|
|
|
参数:
|
|
|
|
|
path (str): 文件路径
|
|
|
|
|
|
|
|
|
|
返回:
|
|
|
|
|
int: 文件大小
|
|
|
|
|
"""
|
|
|
|
|
if not self.exists(s3_url):
|
|
|
|
|
raise FileNotFoundError(f"File not found: {s3_url}")
|
|
|
|
|
if self.isdir(s3_url):
|
|
|
|
|
return self._get_size_of_directory(s3_url)
|
|
|
|
|
else:
|
|
|
|
|
bucket_name, path = self.dealS3Url(s3_url)
|
|
|
|
|
response = self.s3_client.head_object(Bucket=bucket_name, Key=path)
|
|
|
|
|
return response['ContentLength']
|
|
|
|
|
|
|
|
|
|
def _get_size_of_directory(self, s3_url):
|
|
|
|
|
"""
|
|
|
|
|
获取目录大小
|
|
|
|
|
|
|
|
|
|
参数:
|
|
|
|
|
s3_url (str): 目录路径
|
|
|
|
|
|
|
|
|
|
返回:
|
|
|
|
|
int: 目录大小
|
|
|
|
|
"""
|
|
|
|
|
bucket_name, path = self.dealS3Url(s3_url)
|
|
|
|
|
total_size = 0
|
|
|
|
|
|
|
|
|
|
# 确保目录路径以'/'结尾
|
|
|
|
|
if not path.endswith('/'):
|
|
|
|
|
path += '/'
|
|
|
|
|
|
|
|
|
|
# 列出指定目录中的对象
|
|
|
|
|
response = self.s3_client.list_objects_v2(Bucket=bucket_name, Prefix=path)
|
|
|
|
|
if 'Contents' in response:
|
|
|
|
|
for obj in response['Contents']:
|
|
|
|
|
total_size += obj['Size']
|
|
|
|
|
|
|
|
|
|
return total_size
|