""" 流式文件读取器 - 内存限制版本 确保大文件对比时不占用超过 256MB 的内存 """ import os from pathlib import Path from typing import Iterator, Optional, Tuple from django.conf import settings class ChunkedReadStream: """ 流式文件读取器(内存限制 256MB) 设计原则: 1. 单次读取不超过 8KB 2. 缓存大小限制 256MB 3. 支持流式哈希计算 4. 支持流式差异对比 5. 自动内存清理 """ # 内存限制:256MB MAX_MEMORY_BYTES = 256 * 1024 * 1024 # 默认分块大小:8KB DEFAULT_CHUNK_SIZE = 8192 # 最大缓存行数(用于差异对比) MAX_CACHED_LINES = 100000 def __init__( self, file_path: Path, chunk_size: int = DEFAULT_CHUNK_SIZE, encoding: str = 'utf-8' ): """ 初始化流式读取器 Args: file_path: 文件路径 chunk_size: 分块大小(字节) encoding: 文件编码 """ self.file_path = file_path self.chunk_size = chunk_size self.encoding = encoding self.file_size = file_path.stat().st_size if file_path.exists() else 0 # 文件句柄 self.file_handle = None self.is_open = False # 缓存(用于差异对比) self._cached_content = None self._cache_size = 0 def open(self): """打开文件""" self.file_handle = open( self.file_path, 'r', encoding=self.encoding, errors='ignore' ) self.is_open = True def close(self): """关闭文件并清理缓存""" if self.file_handle: self.file_handle.close() self.file_handle = None self.is_open = False self.clear_cache() def __enter__(self): """上下文管理器入口""" self.open() return self def __exit__(self, exc_type, exc_val, exc_tb): """上下文管理器出口""" self.close() def read_chunk(self) -> Optional[str]: """ 读取一个分块 Returns: 文件块内容,如果到达文件末尾则返回 None """ if not self.is_open: raise RuntimeError("File not opened") chunk = self.file_handle.read(self.chunk_size) if not chunk: return None # 检查内存限制 self._cache_size += len(chunk.encode(self.encoding)) if self._cache_size > self.MAX_MEMORY_BYTES: self.clear_cache() return chunk def read_chunks(self) -> Iterator[str]: """ 流式读取所有分块 Yields: 文件块内容 """ if not self.is_open: raise RuntimeError("File not opened") while True: chunk = self.read_chunk() if chunk is None: break yield chunk def read_all(self, limit_lines: Optional[int] = None) -> str: """ 读取完整内容(带内存限制) Args: limit_lines: 限制读取的行数(None 表示不限制) Returns: 文件内容 """ if not self.is_open: raise RuntimeError("File not opened") content_parts = [] line_count = 0 for chunk in self.read_chunks(): content_parts.append(chunk) # 检查行数限制 if limit_lines is not None: line_count += chunk.count('\n') if line_count >= limit_lines: break # 检查内存限制 current_size = sum(len(part.encode(self.encoding)) for part in content_parts) if current_size > self.MAX_MEMORY_BYTES: # 内存超限,截断内容 content_parts = content_parts[:limit_lines // 2] if limit_lines else content_parts[:1000] content_parts.append(f"\n... (内容已截断,超过 {self.MAX_MEMORY_BYTES // (1024*1024)}MB 限制) ...") break return ''.join(content_parts) def read_lines(self, max_lines: int = 1000) -> list: """ 读取文件行(限制行数,用于差异对比) Args: max_lines: 最大行数 Returns: 行列表(大文件只返回头尾) """ if not self.is_open: raise RuntimeError("File not opened") lines = [] for chunk in self.read_chunks(): chunk_lines = chunk.split('\n') lines.extend(chunk_lines) # 检查行数限制 if len(lines) > max_lines: # 保留头尾各一半 head = lines[:max_lines // 2] tail = lines[-max_lines // 2:] lines = head + [f"... (中间省略 {len(lines) - max_lines} 行) ..."] + tail break return lines def compute_hash(self) -> str: """ 流式计算文件哈希(不占用额外内存) Returns: SHA256 哈希值 """ import hashlib if not self.is_open: raise RuntimeError("File not opened") hash_obj = hashlib.sha256() # 重新打开文件(二进制模式) with open(self.file_path, 'rb') as f: while True: chunk = f.read(self.chunk_size) if not chunk: break hash_obj.update(chunk) return hash_obj.hexdigest() def get_file_info(self) -> dict: """ 获取文件信息 Returns: 文件信息字典 """ return { 'path': str(self.file_path), 'size': self.file_size, 'size_mb': round(self.file_size / (1024 * 1024), 2), 'chunk_size': self.chunk_size, 'max_memory_mb': self.MAX_MEMORY_BYTES // (1024 * 1024), } def clear_cache(self): """清理缓存""" self._cached_content = None self._cache_size = 0 class SmartDiffComparator: """ 智能差异对比器(内存限制版本) 设计原则: 1. 大文件只对比头尾 2. 中间部分计算哈希 3. 内存占用不超过 256MB """ def __init__(self, max_memory_mb: int = 256): self.max_memory_bytes = max_memory_mb * 1024 * 1024 self.chunk_size = 8192 def compare_files( self, file_a: Path, file_b: Path, max_lines: int = 1000 ) -> dict: """ 对比两个文件(内存限制版本) Args: file_a: 文件 A 路径 file_b: 文件 B 路径 max_lines: 最大显示行数 Returns: 差异信息 """ # 首先计算哈希 hash_a = self._compute_file_hash(file_a) hash_b = self._compute_file_hash(file_b) if hash_a == hash_b: return { 'has_diff': False, 'is_truncated': False, 'lines_changed': 0, 'hash_a': hash_a, 'hash_b': hash_b, } # 哈希不同,需要对比内容 with ChunkedReadStream(file_a, self.chunk_size) as reader_a, \ ChunkedReadStream(file_b, self.chunk_size) as reader_b: lines_a = reader_a.read_lines(max_lines) lines_b = reader_b.read_lines(max_lines) # 检查是否被截断 is_truncated = ( file_a.stat().st_size > 1024 * 1024 or # > 1MB file_b.stat().st_size > 1024 * 1024 ) # 计算变动行数 lines_changed = self._calculate_lines_changed( self._read_full_content(file_a), self._read_full_content(file_b) ) return { 'has_diff': True, 'is_truncated': is_truncated, 'lines_a': lines_a, 'lines_b': lines_b, 'lines_changed': lines_changed, 'hash_a': hash_a, 'hash_b': hash_b, } def _compute_file_hash(self, file_path: Path) -> str: """计算文件哈希""" import hashlib hash_obj = hashlib.sha256() with open(file_path, 'rb') as f: while True: chunk = f.read(self.chunk_size) if not chunk: break hash_obj.update(chunk) return hash_obj.hexdigest() def _read_full_content(self, file_path: Path) -> str: """读取完整文件内容(使用分块读取)""" content_parts = [] with ChunkedReadStream(file_path, self.chunk_size) as reader: for chunk in reader.read_chunks(): content_parts.append(chunk) return ''.join(content_parts) def _calculate_lines_changed(self, old_content: str, new_content: str) -> int: """计算变动行数""" old_lines = old_content.split('\n') if old_content else [] new_lines = new_content.split('\n') if new_content else [] old_set = set(old_lines) new_set = set(new_lines) added = len(new_set - old_set) removed = len(old_set - new_set) return added - removed class MemoryMonitor: """ 内存监控器 用于监控和限制内存使用 """ @staticmethod def get_current_memory_mb() -> float: """获取当前进程内存使用(MB)""" try: import psutil process = psutil.Process(os.getpid()) return process.memory_info().rss / (1024 * 1024) except ImportError: return 0.0 @staticmethod def check_memory_limit(max_memory_mb: int) -> bool: """检查是否超过内存限制""" current_memory = MemoryMonitor.get_current_memory_mb() return current_memory > max_memory_mb