feat: 添加三个重要补丁

补丁 1: 分块读取与流式传输
- 8KB 分块读取大文件，避免内存飙升
- 流式计算文件哈希，无需加载完整内容
- 差异对比限制，大文件只显示头尾各 500 行
- 新增 chunked 参数支持流式传输

补丁 2: .lobsterignore 机制
- 创建 IgnorePattern 类实现模式匹配
- 支持 .lobsterignore 文件配置
- 添加默认忽略规则（.DS_Store, node_modules 等）
- 支持通配符匹配（*, ?, 目录匹配）
- 新增 API: GET /api/ignore/patterns/, POST /api/ignore/reload/

补丁 3: 操作溯源（Audit Log）
- 新增 SyncHistory 模型记录同步历史
- 创建 AuditLogger 类用于记录操作
- 所有同步操作自动记录日志
- 记录操作者、版本变化、哈希变化、执行时间等
- 新增 API: GET /api/history/

更新内容:
- models.py: 新增 SyncHistory 模型
- services.py: 新增 IgnorePattern, AuditLogger, 分块读取方法
- views.py: 所有同步操作添加日志记录, 新增历史和忽略规则接口
- serializers.py: 新增 SyncHistorySerializer
- urls.py: 新增历史和忽略规则路由
- .lobsterignore.example: 示例忽略文件
- CHANGELOG.md: 详细更新日志
2026-04-05 12:20:57 +00:00
parent d9420b6cc6
commit 077656a6cf
7 changed files with 1007 additions and 43 deletions
--- a/backend/memory_app/services.py
+++ b/backend/memory_app/services.py
@@ -1,16 +1,86 @@
 import os
 import hashlib
+import fnmatch
+import time
 from pathlib import Path
-from typing import List, Dict, Tuple
+from typing import List, Dict, Tuple, Iterator
 from django.conf import settings
+from django.utils import timezone
+
+
+class IgnorePattern:
+    """Pattern matcher fed by a .lobsterignore file in base_dir plus a built-in default rule list."""
+
+    def __init__(self, base_dir: Path):
+        self.base_dir = base_dir
+        self.patterns = []
+        self.load_patterns()
+
+    def load_patterns(self):
+        """Append the rules found in <base_dir>/.lobsterignore, then any default rule not yet listed."""
+        ignore_file = self.base_dir / '.lobsterignore'
+
+        if ignore_file.exists():
+            with open(ignore_file, 'r', encoding='utf-8') as f:
+                for line in f:
+                    line = line.strip()
+                    # Skip blank lines and comment lines
+                    if line and not line.startswith('#'):
+                        self.patterns.append(line)  # NOTE(review): the list is never cleared in this method, so calling it again re-appends the file rules
+
+        # Add the default ignore rules (each one only if it is not already present)
+        default_patterns = [
+            '.DS_Store', '.git', '.gitignore', '__pycache__',
+            'node_modules', '*.pyc', '*.pyo', '*.log',
+            '*.tmp', '*.temp', '*.bak', '.vscode', '.idea'
+        ]
+        for pattern in default_patterns:
+            if pattern not in self.patterns:
+                self.patterns.append(pattern)
+
+    def is_ignored(self, file_path: Path) -> bool:
+        """
+        Decide whether a file is ignored by any loaded pattern.
+
+        Args:
+            file_path: File path (absolute path)
+
+        Returns:
+            True if any pattern matches, otherwise False
+        """
+        relative_path = file_path.relative_to(self.base_dir)  # raises ValueError when file_path is not located under base_dir
+
+        for pattern in self.patterns:
+            # Match the bare file name (NOTE(review): directory names are not tested here, so a rule such as 'node_modules' does not cover files nested beneath that directory - confirm intent)
+            if fnmatch.fnmatch(file_path.name, pattern):
+                return True
+
+            # Match the whole path relative to base_dir
+            if fnmatch.fnmatch(str(relative_path), pattern):
+                return True
+
+            # Trailing-slash rule: compare it (slash removed) with the file's parent directory path
+            if pattern.endswith('/') and fnmatch.fnmatch(str(relative_path.parent), pattern.rstrip('/')):
+                return True
+
+            # Rule starting with '*/': compare the rest of the rule with every path component
+            if pattern.startswith('*/'):
+                parts = str(relative_path).split(os.sep)
+                for i, part in enumerate(parts):
+                    if fnmatch.fnmatch(part, pattern[2:]):
+                        return True
+
+        return False


 class FileScanner:
-    """文件扫描器"""
+    """文件扫描器（支持 .lobsterignore 和分块读取）"""

     def __init__(self):
         self.base_dir = Path(settings.LOBSTER_MEMORY_BASE)
         self.supported_extensions = settings.SUPPORTED_EXTENSIONS
+        self.ignore = IgnorePattern(self.base_dir)
+        self.chunk_size = 8192  # read size per chunk (8 KB) used by the chunked/streaming readers

    def scan_directory(self, lobster_id: str = None) -> List[Dict]:
        """
@@ -27,31 +97,42 @@ class FileScanner:

        files = []
        for file_path in self.base_dir.rglob('*'):
-            if file_path.is_file() and file_path.suffix in self.supported_extensions:
-                try:
-                    relative_path = file_path.relative_to(self.base_dir)
-                    content = file_path.read_text(encoding='utf-8', errors='ignore')
-                    file_hash = self.compute_hash(content)
+            if not file_path.is_file():
+                continue

-                    files.append({
-                        'file_path': str(relative_path),
-                        'full_path': str(file_path),
-                        'content': content,
-                        'hash': file_hash,
-                        'size': file_path.stat().st_size,
-                        'lobster_id': lobster_id or 'unknown',
-                    })
-                except Exception as e:
-                    print(f"Error reading {file_path}: {e}")
+            # 检查文件扩展名
+            if file_path.suffix not in self.supported_extensions:
+                continue
+
+            # 检查是否被 .lobsterignore 忽略
+            if self.ignore.is_ignored(file_path):
+                continue
+
+            try:
+                relative_path = file_path.relative_to(self.base_dir)
+
+                # 使用流式读取获取哈希（避免大文件内存问题）
+                file_hash = self.compute_hash_stream(file_path)
+
+                files.append({
+                    'file_path': str(relative_path),
+                    'full_path': str(file_path),
+                    'hash': file_hash,
+                    'size': file_path.stat().st_size,
+                    'lobster_id': lobster_id or 'unknown',
+                })
+            except Exception as e:
+                print(f"Error reading {file_path}: {e}")

        return files

-    def get_file_content(self, file_path: str) -> Tuple[str, str]:
+    def get_file_content(self, file_path: str, chunked: bool = False) -> Tuple[str, str]:
        """
        获取文件内容和哈希

        Args:
            file_path: 相对路径
+            chunked: 是否使用分块读取

        Returns:
            (content, hash)
@@ -61,11 +142,58 @@ class FileScanner:
        if not full_path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")

-        content = full_path.read_text(encoding='utf-8', errors='ignore')
+        # 对于大文件（>50MB），使用分块读取
+        file_size = full_path.stat().st_size
+        if chunked and file_size > 50 * 1024 * 1024:
+            content = self.read_file_chunked(full_path)
+        else:
+            content = full_path.read_text(encoding='utf-8', errors='ignore')
+
        file_hash = self.compute_hash(content)

        return content, file_hash

+    def read_file_chunked(self, file_path: Path) -> str:
+        """
+        Read a text file in pieces of self.chunk_size and return the joined content.
+
+        Args:
+            file_path: File path
+
+        Returns:
+            File content
+        """
+        content_parts = []
+        with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
+            while True:
+                chunk = f.read(self.chunk_size)
+                if not chunk:
+                    break
+                content_parts.append(chunk)
+        return ''.join(content_parts)  # the complete content is still assembled in memory here
+
+    def read_file_stream(self, file_path: str) -> Iterator[str]:
+        """
+        Stream a file piece by piece (intended for transferring large files).
+
+        Args:
+            file_path: Relative path (joined onto self.base_dir)
+
+        Yields:
+            File chunks
+        """
+        full_path = self.base_dir / file_path
+
+        if not full_path.exists():
+            raise FileNotFoundError(f"File not found: {file_path}")  # generator function: this is raised on first iteration, not when the method is called
+
+        with open(full_path, 'r', encoding='utf-8', errors='ignore') as f:
+            while True:
+                chunk = f.read(self.chunk_size)
+                if not chunk:
+                    break
+                yield chunk
+
    def write_file(self, file_path: str, content: str):
        """
        写入文件
@@ -94,6 +222,27 @@ class FileScanner:
        """
        return hashlib.sha256(content.encode('utf-8')).hexdigest()

+    def compute_hash_stream(self, file_path: Path) -> str:
+        """
+        Compute a file's SHA-256 digest incrementally (avoids loading a large file into memory).
+
+        Args:
+            file_path: File path
+
+        Returns:
+            Hex digest string
+        """
+        hash_obj = hashlib.sha256()
+
+        with open(file_path, 'rb') as f:  # NOTE(review): hashes raw bytes, whereas compute_hash() hashes text re-encoded as UTF-8; the two may differ for one file - confirm callers never compare them
+            while True:
+                chunk = f.read(self.chunk_size)
+                if not chunk:
+                    break
+                hash_obj.update(chunk)
+
+        return hash_obj.hexdigest()
+
    def get_file_tree(self, lobster_id: str = None) -> Dict:
        """
        获取文件树结构
@@ -124,7 +273,7 @@ class FileScanner:


 class DiffChecker:
-    """差异检查器"""
+    """差异检查器（支持大文件优化）"""

    def __init__(self):
        self.scanner = FileScanner()
@@ -203,22 +352,145 @@ class DiffChecker:

        return results

-    def get_file_diff(self, local_content: str, db_content: str) -> Dict:
+    def get_file_diff(self, local_content: str, db_content: str, max_lines: int = 1000) -> Dict:
         """
-        获取文件差异（简单版）
+        Get the line lists of both contents for diffing (each side is shortened when it is too long).

         Args:
             local_content: 本地内容
             db_content: 数据库内容
+            max_lines: Line limit per side; a longer side keeps only its head and tail

         Returns:
             差异信息
         """
-        # 这里可以使用 difflib 或其他差异库
-        # 简单实现，后续可以用 react-diff-viewer 在前端显示
+        local_lines = local_content.split('\n')
+        db_lines = db_content.split('\n')
+
+        # Cap the line count (a large input keeps only its head and tail, joined by a marker line)
+        if len(local_lines) > max_lines:
+            local_head = local_lines[:max_lines//2]
+            local_tail = local_lines[-max_lines//2:]  # parsed as (-max_lines)//2: unary minus binds tighter than //
+            local_lines = local_head + ['... (中间省略 {}) 行 ...'.format(len(local_lines) - max_lines)] + local_tail
+
+        if len(db_lines) > max_lines:
+            db_head = db_lines[:max_lines//2]
+            db_tail = db_lines[-max_lines//2:]
+            db_lines = db_head + ['... (中间省略 {}) 行 ...'.format(len(db_lines) - max_lines)] + db_tail

         return {
-            'local_lines': local_content.split('\n'),
-            'db_lines': db_content.split('\n'),
-            'has_diff': local_content != db_content
-        }
+            'local_lines': local_lines,
+            'db_lines': db_lines,
+            'has_diff': local_content != db_content,
+            'is_truncated': len(local_lines) > max_lines or len(db_lines) > max_lines  # NOTE(review): evaluated on the already-shortened lists, so it relies on head + marker + tail exceeding max_lines
+        }
+
+
+class AuditLogger:
+    """Operation log recorder: writes sync actions to the SyncHistory model and reads them back."""
+
+    def __init__(self):
+        self.model = None
+        # Import the model lazily (avoids a circular import)
+        from .models import SyncHistory
+        self.model = SyncHistory
+
+    def log_sync_action(
+        self,
+        lobster_id: str,
+        file_path: str,
+        action: str,
+        old_version: int = None,
+        new_version: int = None,
+        old_hash: str = None,
+        new_hash: str = None,
+        file_size: int = 0,
+        operator: str = 'system',
+        status: str = 'success',
+        error_message: str = None,
+        execution_time: float = 0
+    ):
+        """
+        Record one sync operation as a new SyncHistory row.
+
+        Args:
+            lobster_id: Lobster ID
+            file_path: File path
+            action: Operation type
+            old_version: Version before the operation
+            new_version: Version after the operation
+            old_hash: Hash before the operation
+            new_hash: Hash after the operation
+            file_size: File size
+            operator: Operator
+            status: Operation status
+            error_message: Error message
+            execution_time: Execution time
+        """
+        self.model.objects.create(
+            lobster_id=lobster_id,
+            file_path=file_path,
+            action=action,
+            old_version=old_version,
+            new_version=new_version,
+            old_hash=old_hash,
+            new_hash=new_hash,
+            file_size=file_size,
+            operator=operator,
+            status=status,
+            error_message=error_message,
+            execution_time=execution_time,
+            created_at=timezone.now()
+        )
+
+    def get_history(
+        self,
+        lobster_id: str = None,
+        file_path: str = None,
+        action: str = None,
+        limit: int = 100
+    ) -> List[Dict]:
+        """
+        Fetch operation history, newest first.
+
+        Args:
+            lobster_id: Lobster ID (optional filter)
+            file_path: File path (optional filter)
+            action: Operation type (optional filter)
+            limit: Maximum number of records returned
+
+        Returns:
+            List of history records as plain dicts
+        """
+        queryset = self.model.objects.all()
+
+        if lobster_id:
+            queryset = queryset.filter(lobster_id=lobster_id)
+
+        if file_path:
+            queryset = queryset.filter(file_path=file_path)
+
+        if action:
+            queryset = queryset.filter(action=action)
+
+        records = queryset.order_by('-created_at')[:limit]
+
+        return [
+            {
+                'id': r.id,
+                'lobster_id': r.lobster_id,
+                'file_path': r.file_path,
+                'action': r.action,
+                'status': r.status,
+                'old_version': r.old_version,
+                'new_version': r.new_version,
+                'old_hash': r.old_hash,
+                'new_hash': r.new_hash,
+                'file_size': r.file_size,
+                'operator': r.operator,
+                'error_message': r.error_message,
+                'execution_time': r.execution_time,
+                'created_at': r.created_at.isoformat(),
+            }
+            for r in records
+        ]