361 lines
9.8 KiB
Python
361 lines
9.8 KiB
Python
|
|
"""
|
|||
|
|
流式文件读取器 - 内存限制版本
|
|||
|
|
|
|||
|
|
确保大文件对比时不占用超过 256MB 的内存
|
|||
|
|
"""
|
|||
|
|
|
|||
|
|
import os
|
|||
|
|
from pathlib import Path
|
|||
|
|
from typing import Iterator, Optional, Tuple
|
|||
|
|
from django.conf import settings
|
|||
|
|
|
|||
|
|
|
|||
|
|
class ChunkedReadStream:
    """Streaming file reader with a hard memory cap (256 MB).

    Design principles:
      1. A single read never exceeds the chunk size (default 8 KB).
      2. Total buffered bytes are capped at 256 MB.
      3. Supports streaming hash computation.
      4. Supports streaming diff comparison.
      5. Releases buffers automatically on close.
    """

    # Hard memory cap: 256 MB.
    MAX_MEMORY_BYTES = 256 * 1024 * 1024

    # Default chunk size: 8 KB.
    DEFAULT_CHUNK_SIZE = 8192

    # Maximum number of cached lines (used for diff comparison).
    MAX_CACHED_LINES = 100000

    def __init__(
        self,
        file_path: Path,
        chunk_size: int = DEFAULT_CHUNK_SIZE,
        encoding: str = 'utf-8'
    ):
        """Initialize the streaming reader.

        Args:
            file_path: Path of the file to read.
            chunk_size: Chunk size in bytes.
            encoding: Text encoding used to decode the file.
        """
        self.file_path = file_path
        self.chunk_size = chunk_size
        self.encoding = encoding
        self.file_size = file_path.stat().st_size if file_path.exists() else 0

        # File handle state.
        self.file_handle = None
        self.is_open = False

        # Buffer bookkeeping (used to enforce the memory cap).
        self._cached_content = None
        self._cache_size = 0

    def open(self):
        """Open the underlying file for text reading."""
        self.file_handle = open(
            self.file_path,
            'r',
            encoding=self.encoding,
            errors='ignore'
        )
        self.is_open = True

    def close(self):
        """Close the file and release any buffered data."""
        if self.file_handle:
            self.file_handle.close()
            self.file_handle = None
        self.is_open = False
        self.clear_cache()

    def __enter__(self):
        """Context-manager entry: open the file."""
        self.open()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Context-manager exit: close the file."""
        self.close()

    def read_chunk(self) -> Optional[str]:
        """Read one chunk.

        Returns:
            The chunk content, or None at end of file.

        Raises:
            RuntimeError: If the file has not been opened.
        """
        if not self.is_open:
            raise RuntimeError("File not opened")

        chunk = self.file_handle.read(self.chunk_size)
        if not chunk:
            return None

        # Track bytes that passed through and reset the counter once the
        # cap is hit, so the counter itself stays bounded.
        self._cache_size += len(chunk.encode(self.encoding))
        if self._cache_size > self.MAX_MEMORY_BYTES:
            self.clear_cache()

        return chunk

    def read_chunks(self) -> Iterator[str]:
        """Stream every chunk of the file.

        Yields:
            Chunk contents, in file order.

        Raises:
            RuntimeError: If the file has not been opened.
        """
        if not self.is_open:
            raise RuntimeError("File not opened")

        while True:
            chunk = self.read_chunk()
            if chunk is None:
                break
            yield chunk

    def read_all(self, limit_lines: Optional[int] = None) -> str:
        """Read the full content, subject to the memory cap.

        Args:
            limit_lines: Stop after roughly this many lines (None = no limit).

        Returns:
            The file content, with a truncation marker appended when the
            256 MB cap is exceeded.

        Raises:
            RuntimeError: If the file has not been opened.
        """
        if not self.is_open:
            raise RuntimeError("File not opened")

        content_parts = []
        line_count = 0
        total_bytes = 0  # running size; avoids re-encoding all parts each loop

        for chunk in self.read_chunks():
            content_parts.append(chunk)

            # Line-count limit.
            if limit_lines is not None:
                line_count += chunk.count('\n')
                if line_count >= limit_lines:
                    break

            # Memory cap. PERF FIX: keep a running byte total instead of
            # re-encoding every buffered part on every iteration (was O(n^2)).
            total_bytes += len(chunk.encode(self.encoding))
            if total_bytes > self.MAX_MEMORY_BYTES:
                # Over the cap: truncate the buffered chunks.
                # NOTE(review): limit_lines is used here as a *chunk* index,
                # mirroring the original heuristic — confirm intent.
                content_parts = content_parts[:limit_lines // 2] if limit_lines else content_parts[:1000]
                content_parts.append(f"\n... (内容已截断,超过 {self.MAX_MEMORY_BYTES // (1024*1024)}MB 限制) ...")
                break

        return ''.join(content_parts)

    def read_lines(self, max_lines: int = 1000) -> list:
        """Read the file as lines, capped at ``max_lines`` (for diffing).

        BUG FIX: lines spanning a chunk boundary are now reassembled before
        splitting; previously each chunk was split on '\\n' independently,
        so a line crossing the boundary was broken into two lines.

        Args:
            max_lines: Maximum number of lines to keep.

        Returns:
            List of lines; for large files only the head and tail are kept,
            separated by an elision marker.

        Raises:
            RuntimeError: If the file has not been opened.
        """
        if not self.is_open:
            raise RuntimeError("File not opened")

        lines = []
        pending = ''  # partial last line carried across chunk boundaries
        truncated = False
        for chunk in self.read_chunks():
            pending += chunk
            parts = pending.split('\n')
            pending = parts.pop()  # may be an incomplete line
            lines.extend(parts)

            # Line-count limit: keep half from the head and half from the tail.
            if len(lines) > max_lines:
                head = lines[:max_lines // 2]
                tail = lines[-max_lines // 2:]
                lines = head + [f"... (中间省略 {len(lines) - max_lines} 行) ..."] + tail
                truncated = True
                break

        if not truncated and (pending or lines):
            # Flush the final (possibly empty) line, matching str.split('\n')
            # semantics on the whole content.
            lines.append(pending)

        return lines

    def compute_hash(self) -> str:
        """Compute the file hash in a streaming fashion (constant memory).

        Returns:
            SHA-256 hex digest of the raw file bytes.

        Raises:
            RuntimeError: If the file has not been opened.
        """
        import hashlib

        if not self.is_open:
            raise RuntimeError("File not opened")

        hash_obj = hashlib.sha256()

        # Re-open in binary mode so hashing sees raw bytes, not decoded text.
        with open(self.file_path, 'rb') as f:
            while True:
                chunk = f.read(self.chunk_size)
                if not chunk:
                    break
                hash_obj.update(chunk)

        return hash_obj.hexdigest()

    def get_file_info(self) -> dict:
        """Return basic metadata about the file.

        Returns:
            Dict with path, size (bytes and MB), chunk size and memory cap.
        """
        return {
            'path': str(self.file_path),
            'size': self.file_size,
            'size_mb': round(self.file_size / (1024 * 1024), 2),
            'chunk_size': self.chunk_size,
            'max_memory_mb': self.MAX_MEMORY_BYTES // (1024 * 1024),
        }

    def clear_cache(self):
        """Reset the buffer reference and its size counter."""
        self._cached_content = None
        self._cache_size = 0
|
|||
|
|
|
|||
|
|
|
|||
|
|
class SmartDiffComparator:
    """Smart diff comparator with a memory cap.

    Design principles:
      1. For large files only the head and tail are compared line-by-line.
      2. Equality is checked via content hashes first.
      3. Memory usage is intended to stay under 256 MB.
    """

    def __init__(self, max_memory_mb: int = 256):
        """Args:
            max_memory_mb: Soft memory cap in megabytes.
        """
        self.max_memory_bytes = max_memory_mb * 1024 * 1024
        self.chunk_size = 8192

    def compare_files(
        self,
        file_a: Path,
        file_b: Path,
        max_lines: int = 1000
    ) -> dict:
        """Compare two files (memory-capped version).

        Args:
            file_a: Path of file A.
            file_b: Path of file B.
            max_lines: Maximum number of lines to include per side.

        Returns:
            Dict with diff info: has_diff, is_truncated, lines_changed,
            hash_a/hash_b, and (when different) lines_a/lines_b.
        """
        # Hash first: identical hashes mean no diff and no content read.
        hash_a = self._compute_file_hash(file_a)
        hash_b = self._compute_file_hash(file_b)

        if hash_a == hash_b:
            return {
                'has_diff': False,
                'is_truncated': False,
                'lines_changed': 0,
                'hash_a': hash_a,
                'hash_b': hash_b,
            }

        # Hashes differ: load (possibly truncated) line views for display.
        with ChunkedReadStream(file_a, self.chunk_size) as reader_a, \
                ChunkedReadStream(file_b, self.chunk_size) as reader_b:

            lines_a = reader_a.read_lines(max_lines)
            lines_b = reader_b.read_lines(max_lines)

        # Flag results as truncated for files over 1 MB.
        is_truncated = (
            file_a.stat().st_size > 1024 * 1024 or
            file_b.stat().st_size > 1024 * 1024
        )

        # Count changed lines.
        # NOTE(review): this reads both files fully into memory, which can
        # exceed the advertised 256 MB cap for very large files — confirm.
        lines_changed = self._calculate_lines_changed(
            self._read_full_content(file_a),
            self._read_full_content(file_b)
        )

        return {
            'has_diff': True,
            'is_truncated': is_truncated,
            'lines_a': lines_a,
            'lines_b': lines_b,
            'lines_changed': lines_changed,
            'hash_a': hash_a,
            'hash_b': hash_b,
        }

    def _compute_file_hash(self, file_path: Path) -> str:
        """Return the SHA-256 hex digest of the file, read in chunks."""
        import hashlib
        hash_obj = hashlib.sha256()
        with open(file_path, 'rb') as f:
            while True:
                chunk = f.read(self.chunk_size)
                if not chunk:
                    break
                hash_obj.update(chunk)
        return hash_obj.hexdigest()

    def _read_full_content(self, file_path: Path) -> str:
        """Read the entire file content using chunked reads."""
        content_parts = []
        with ChunkedReadStream(file_path, self.chunk_size) as reader:
            for chunk in reader.read_chunks():
                content_parts.append(chunk)
        return ''.join(content_parts)

    def _calculate_lines_changed(self, old_content: str, new_content: str) -> int:
        """Count changed lines between two texts.

        Set-based, so duplicated or reordered lines are counted once.

        Returns:
            Number of added plus removed distinct lines (always >= 0).
        """
        old_lines = old_content.split('\n') if old_content else []
        new_lines = new_content.split('\n') if new_content else []

        old_set = set(old_lines)
        new_set = set(new_lines)

        added = len(new_set - old_set)
        removed = len(old_set - new_set)

        # BUG FIX: a change count is additions plus removals; the previous
        # `added - removed` could go negative and undercounted changes.
        return added + removed
|
|||
|
|
|
|||
|
|
|
|||
|
|
class MemoryMonitor:
    """Process-memory monitor.

    Helpers for observing and bounding memory consumption.
    """

    @staticmethod
    def get_current_memory_mb() -> float:
        """Return the current process RSS in megabytes.

        Returns 0.0 when ``psutil`` is not installed.
        """
        try:
            import psutil
        except ImportError:
            # psutil is optional; report zero rather than fail.
            return 0.0
        rss_bytes = psutil.Process(os.getpid()).memory_info().rss
        return rss_bytes / (1024 * 1024)

    @staticmethod
    def check_memory_limit(max_memory_mb: int) -> bool:
        """Return True when current usage exceeds ``max_memory_mb`` MB."""
        return MemoryMonitor.get_current_memory_mb() > max_memory_mb
|