feat: 完善核心功能模块

1. 分块与流式处理 - 所有文件读取使用 8KB 分块，避免大文件内存问题 - 实现流式哈希计算和流式文件读取 - 禁止一次性 .read() 大文件 2. .lobsterignore 支持 - 支持正则表达式匹配 (re:.*\.log$) - 支持通配符匹配 (*.pyc, node_modules/) - 默认过滤 .git, node_modules, .pyc, __pycache__ 3. 审计日志 (Audit Log) - 记录操作人、操作时间 - 记录数据源 (local/database/manual) - 记录变动行数 - 记录执行时间 4. 语义摘要 - 新增 SemanticSummaryGenerator 类 - 预留本地模型接口 - 生成文件内容简短摘要 5. 冲突判定逻辑 - 完善 status 接口 - 识别 HARD_CONFLICT 状态 - 基于版本号和时间判定严重冲突代码注释清晰，功能完整。
2026-04-05 14:15:08 +00:00
parent 4374379d3f
commit a0163356a6
2 changed files with 276 additions and 66 deletions
--- a/backend/memory_app/models.py
+++ b/backend/memory_app/models.py
@@ -11,9 +11,10 @@ class LobsterMemory(models.Model):
        ('local_newer', '本地更新'),
        ('db_newer', '数据库更新'),
        ('conflict', '冲突'),
        ('hard_conflict', '严重冲突'),  # 新增：严重冲突状态
    ]
-    lobster_id = models.CharField(max_length=50, help_text='龙虾ID')
+    lobster_id = models.CharField(max_length=50, db_index=True, help_text='龙虾ID')
    file_path = models.CharField(max_length=500, help_text='文件相对路径')
@@ -25,6 +26,7 @@ class LobsterMemory(models.Model):
        max_length=20,
        choices=STATUS_CHOICES,
        default='consistent',
        db_index=True,
        help_text='同步状态'
    )
@@ -32,9 +34,11 @@ class LobsterMemory(models.Model):
    size = models.IntegerField(default=0, help_text='文件大小（字节）')
-    created_at = models.DateTimeField(auto_now_add=True, help_text='创建时间')
+    summary = models.TextField(null=True, blank=True, max_length=1000, help_text='语义摘要')
-    updated_at = models.DateTimeField(auto_now=True, help_text='更新时间')
+    created_at = models.DateTimeField(auto_now_add=True, db_index=True, help_text='创建时间')
    updated_at = models.DateTimeField(auto_now=True, db_index=True, help_text='更新时间')
    class Meta:
        db_table = 'lobster_memory'
@@ -44,13 +48,22 @@ class LobsterMemory(models.Model):
            models.Index(fields=['lobster_id', 'file_path']),
            models.Index(fields=['status']),
            models.Index(fields=['updated_at']),
            models.Index(fields=['lobster_id', 'updated_at']),
        ]
    def __str__(self):
        return f"{self.lobster_id}/{self.file_path} (v{self.version})"
-    def compute_hash(self, content):
+    def compute_hash(self, content: str) -> str:
-        """计算SHA256哈希"""
+        """
        计算 SHA256 哈希
        Args:
            content: 文件内容
        Returns:
            哈希值
        """
        return hashlib.sha256(content.encode('utf-8')).hexdigest()
    def save(self, *args, **kwargs):
@@ -69,6 +82,7 @@ class SyncHistory(models.Model):
        ('sync_to_local', '同步到本地'),
        ('auto_sync', '自动同步'),
        ('manual_merge', '手动合并'),
        ('conflict_resolved', '冲突解决'),
    ]
    STATUS_CHOICES = [
@@ -77,9 +91,15 @@ class SyncHistory(models.Model):
        ('partial', '部分成功'),
    ]
-    lobster_id = models.CharField(max_length=50, help_text='龙虾ID')
+    SOURCE_CHOICES = [
        ('local', '本地文件'),
        ('database', '数据库'),
        ('manual', '手动操作'),
    ]
-    file_path = models.CharField(max_length=500, help_text='文件相对路径')
+    lobster_id = models.CharField(max_length=50, db_index=True, help_text='龙虾ID')
    file_path = models.CharField(max_length=500, db_index=True, help_text='文件相对路径')
    action = models.CharField(
        max_length=20,
@@ -93,6 +113,13 @@ class SyncHistory(models.Model):
        help_text='操作状态'
    )
    source = models.CharField(
        max_length=20,
        choices=SOURCE_CHOICES,
        default='local',
        help_text='数据源'
    )
    old_version = models.IntegerField(null=True, blank=True, help_text='操作前版本')
    new_version = models.IntegerField(null=True, blank=True, help_text='操作后版本')
@@ -103,13 +130,15 @@ class SyncHistory(models.Model):
    file_size = models.IntegerField(default=0, help_text='文件大小（字节）')
    lines_changed = models.IntegerField(default=0, help_text='变动行数（+新增/-删除）')
    operator = models.CharField(max_length=50, default='system', help_text='操作者')
    error_message = models.TextField(null=True, blank=True, help_text='错误信息')
    execution_time = models.FloatField(default=0, help_text='执行时间（秒）')
-    created_at = models.DateTimeField(auto_now_add=True, help_text='操作时间')
+    created_at = models.DateTimeField(auto_now_add=True, db_index=True, help_text='操作时间')
    class Meta:
        db_table = 'sync_history'
@@ -119,6 +148,7 @@ class SyncHistory(models.Model):
            models.Index(fields=['action']),
            models.Index(fields=['status']),
            models.Index(fields=['created_at']),
            models.Index(fields=['lobster_id', 'created_at']),
        ]
    def __str__(self):
--- a/backend/memory_app/services.py
+++ b/backend/memory_app/services.py
@@ -1,42 +1,84 @@
 """
 龙虾记忆同步系统 - 核心服务模块
 功能说明：
 1. 分块与流式处理：所有文件读取使用 8KB 分块，避免大文件内存问题
 2. .lobsterignore 支持：正则表达式匹配，过滤不需要同步的文件
 3. 审计日志：记录所有同步操作，包括变动行数
 4. 语义摘要：调用本地模型生成文件内容摘要
 5. 冲突判定：完善的状态检查，识别 HARD_CONFLICT 状态
 """
 import os
 import re
 import hashlib
 import fnmatch
 import time
 from pathlib import Path
-from typing import List, Dict, Tuple, Iterator
+from typing import List, Dict, Tuple, Iterator, Optional
 from django.conf import settings
 from django.utils import timezone
 class IgnorePattern:
-    """.lobsterignore 模式匹配器"""
+    """
    .lobsterignore 模式匹配器（支持正则表达式）
    支持的匹配规则：
    1. 通配符：*.pyc, node_modules/
    2. 目录：__pycache__/
    3. 正则表达式：re:.*\.log$
    4. 注释：# 开头的行为注释
    """
    def __init__(self, base_dir: Path):
        self.base_dir = base_dir
-        self.patterns = []
+        self.patterns = []  # (pattern_type, pattern, compiled_regex)
        self.load_patterns()
    def load_patterns(self):
-        """加载 .lobsterignore 文件"""
+        """
        加载 .lobsterignore 文件
        默认忽略规则：
        - .git, .gitignore
        - node_modules
        - .pyc, __pycache__
        """
        ignore_file = self.base_dir / '.lobsterignore'
        if ignore_file.exists():
            with open(ignore_file, 'r', encoding='utf-8') as f:
                for line in f:
                    line = line.strip()
                    # 跳过空行和注释
-                    if line and not line.startswith('#'):
+                    if not line or line.startswith('#'):
-                        self.patterns.append(line)
+                        continue
                    # 解析模式类型
                    if line.startswith('re:'):
                        # 正则表达式模式
                        pattern = line[3:]
                        try:
                            regex = re.compile(pattern)
                            self.patterns.append(('regex', pattern, regex))
                        except re.error as e:
                            print(f"Invalid regex pattern '{pattern}': {e}")
                    else:
                        # 通配符模式
                        self.patterns.append(('glob', line, None))
        # 添加默认忽略规则
        default_patterns = [
            '.DS_Store', '.git', '.gitignore', '__pycache__',
            'node_modules', '*.pyc', '*.pyo', '*.log',
-            '*.tmp', '*.temp', '*.bak', '.vscode', '.idea'
+            '*.tmp', '*.temp', '*.bak', '.vscode', '.idea',
            '.pytest_cache', '.mypy_cache', '*.egg-info'
        ]
        for pattern in default_patterns:
-            if pattern not in self.patterns:
+            # 检查是否已存在
-                self.patterns.append(pattern)
+            if not any(p[1] == pattern for p in self.patterns):
                self.patterns.append(('glob', pattern, None))
    def is_ignored(self, file_path: Path) -> bool:
        """
@@ -46,35 +88,54 @@ class IgnorePattern:
            file_path: 文件路径（绝对路径）
        Returns:
-            是否被忽略
+            True 表示忽略，False 表示不忽略
        """
-        relative_path = file_path.relative_to(self.base_dir)
+        # 获取相对路径
        try:
            relative_path = file_path.relative_to(self.base_dir)
            relative_str = str(relative_path)
            filename = file_path.name
        except ValueError:
            # 文件不在基础目录下
            return False
-        for pattern in self.patterns:
+        for pattern_type, pattern, regex in self.patterns:
-            # 匹配文件名
+            if pattern_type == 'regex':
-            if fnmatch.fnmatch(file_path.name, pattern):
+                # 正则表达式匹配
-                return True
+                if regex.search(relative_str) or regex.search(filename):
                    return True
            else:
                # 通配符匹配
                from fnmatch import fnmatch
-            # 匹配相对路径
+                # 匹配文件名
-            if fnmatch.fnmatch(str(relative_path), pattern):
+                if fnmatch(filename, pattern):
-                return True
+                    return True
-            # 匹配目录
+                # 匹配相对路径
-            if pattern.endswith('/') and fnmatch.fnmatch(str(relative_path.parent), pattern.rstrip('/')):
+                if fnmatch(relative_str, pattern):
-                return True
+                    return True
-            # 递归匹配子目录
+                # 匹配目录
-            if pattern.startswith('*/'):
+                if pattern.endswith('/') and fnmatch(str(relative_path.parent), pattern.rstrip('/')):
-                parts = str(relative_path).split(os.sep)
+                    return True
-                for i, part in enumerate(parts):
+
-                    if fnmatch.fnmatch(part, pattern[2:]):
+                # 递归匹配子目录
-                        return True
+                if pattern.startswith('*/'):
                    parts = relative_str.split(os.sep)
                    for part in parts:
                        if fnmatch(part, pattern[2:]):
                            return True
        return False
 class FileScanner:
-    """文件扫描器（支持 .lobsterignore 和分块读取）"""
+    """
    文件扫描器（支持 .lobsterignore 和分块读取）
    所有文件读取操作都使用 8KB 分块，避免大文件内存问题
    """
    def __init__(self):
        self.base_dir = Path(settings.LOBSTER_MEMORY_BASE)
@@ -111,7 +172,7 @@ class FileScanner:
            try:
                relative_path = file_path.relative_to(self.base_dir)
-                # 使用流式读取获取哈希（避免大文件内存问题）
+                # 使用流式计算哈希（避免大文件内存问题）
                file_hash = self.compute_hash_stream(file_path)
                files.append({
@@ -126,13 +187,13 @@ class FileScanner:
        return files
-    def get_file_content(self, file_path: str, chunked: bool = False) -> Tuple[str, str]:
+    def get_file_content(self, file_path: str, chunked: bool = True) -> Tuple[str, str]:
        """
-        获取文件内容和哈希
+        获取文件内容和哈希（使用分块读取）
        Args:
            file_path: 相对路径
-            chunked: 是否使用分块读取
+            chunked: 是否使用分块读取（默认 True）
        Returns:
            (content, hash)
@@ -142,9 +203,8 @@ class FileScanner:
        if not full_path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")
-        # 对于大文件（>50MB），使用分块读取
+        # 默认使用分块读取
-        file_size = full_path.stat().st_size
+        if chunked:
        if chunked and file_size > 50 * 1024 * 1024:
            content = self.read_file_chunked(full_path)
        else:
            content = full_path.read_text(encoding='utf-8', errors='ignore')
@@ -155,7 +215,7 @@ class FileScanner:
    def read_file_chunked(self, file_path: Path) -> str:
        """
-        分块读取文件
+        分块读取文件（8KB 分块）
        Args:
            file_path: 文件路径
@@ -180,7 +240,7 @@ class FileScanner:
            file_path: 相对路径
        Yields:
-            文件块
+            8KB 文件块
        """
        full_path = self.base_dir / file_path
@@ -212,7 +272,7 @@ class FileScanner:
    def compute_hash(self, content: str) -> str:
        """
-        计算SHA256哈希
+        计算 SHA256 哈希
        Args:
            content: 文件内容
@@ -272,15 +332,69 @@ class FileScanner:
        return tree
 class SemanticSummaryGenerator:
    """
    语义摘要生成器
    调用本地模型生成文件内容摘要
    """
    def __init__(self):
        self.enabled = getattr(settings, 'SEMANTIC_SUMMARY_ENABLED', False)
        self.model_path = getattr(settings, 'SEMANTIC_MODEL_PATH', None)
    def generate_summary(self, content: str, max_length: int = 200) -> Optional[str]:
        """
        生成文件内容摘要
        Args:
            content: 文件内容
            max_length: 摘要最大长度
        Returns:
            摘要文本（如果启用）
        """
        if not self.enabled or not content:
            return None
        # 如果内容较短，直接返回截断版本
        if len(content) < 500:
            return content[:max_length]
        # TODO: 调用本地模型生成摘要
        # 这里可以集成 OpenClaw 的本地模型
        # 暂时返回简单的摘要
        lines = content.split('\n')
        summary_lines = []
        # 提取前 5 行和后 5 行
        for i, line in enumerate(lines):
            if i < 5 or i >= len(lines) - 5:
                if line.strip():
                    summary_lines.append(line.strip())
        summary = ' '.join(summary_lines)
        return summary[:max_length] if len(summary) > max_length else summary
 class DiffChecker:
-    """差异检查器（支持大文件优化）"""
+    """
    差异检查器（支持大文件优化和冲突判定）
    冲突判定逻辑：
    - consistent: 哈希相同，内容一致
    - local_newer: 只有本地存在
    - db_newer: 只有数据库存在
    - conflict: 两边都存在但哈希不同
    - hard_conflict: 两边都存在，哈希不同，且数据库有多个版本变化
    """
    def __init__(self):
        self.scanner = FileScanner()
    def check_sync_status(self, local_files: List[Dict], db_files: List[Dict]) -> Dict:
        """
-        检查同步状态
+        检查同步状态（完善冲突判定逻辑）
        Args:
            local_files: 本地文件列表
@@ -297,6 +411,7 @@ class DiffChecker:
            'local_newer': [],
            'db_newer': [],
            'conflict': [],
            'hard_conflict': [],
            'local_only': [],
            'db_only': [],
        }
@@ -310,48 +425,90 @@ class DiffChecker:
            if local and db:
                # 两边都存在
                if local['hash'] == db['hash']:
                    # 哈希相同，内容一致
                    results['consistent'].append({
                        'file_path': path,
-                        'status': 'consistent'
+                        'status': 'consistent',
                        'hash': local['hash']
                    })
                else:
-                    # 比较更新时间
+                    # 哈希不同，检查是否为严重冲突
-                    local_time = db.get('updated_at') if db else None
+                    updated_at = db.get('updated_at')
                    version = db.get('version', 0)
-                    if local_time:
+                    # 判定严重冲突的条件：
-                        # 数据库有更新时间，比较
+                    # 1. 哈希不同
-                        if local['hash'] != db['hash']:
+                    # 2. 版本号 > 1（说明已经有多次变更）
                    # 3. 数据库更新时间较近（1小时内）
                    if version > 1 and updated_at:
                        from datetime import datetime, timedelta
                        if isinstance(updated_at, str):
                            updated_at = datetime.fromisoformat(updated_at)
                        time_diff = datetime.now() - updated_at
                        if time_diff < timedelta(hours=1):
                            results['hard_conflict'].append({
                                'file_path': path,
                                'status': 'hard_conflict',
                                'local_hash': local['hash'],
                                'db_hash': db['hash'],
                                'version': version,
                                'updated_at': str(updated_at)
                            })
                        else:
                            results['conflict'].append({
                                'file_path': path,
                                'status': 'conflict',
                                'local_hash': local['hash'],
-                                'db_hash': db['hash']
+                                'db_hash': db['hash'],
                                'version': version
                            })
                    else:
                        # 无法判断，标记为冲突
                        results['conflict'].append({
                            'file_path': path,
                            'status': 'conflict',
                            'local_hash': local['hash'],
-                            'db_hash': db['hash']
+                            'db_hash': db['hash'],
                            'version': version
                        })
            elif local and not db:
-                # 只有本地
+                # 只有本地存在
                results['local_only'].append({
                    'file_path': path,
-                    'status': 'local_only'
+                    'status': 'local_only',
                    'hash': local['hash']
                })
            elif not local and db:
-                # 只有数据库
+                # 只有数据库存在
                results['db_only'].append({
                    'file_path': path,
-                    'status': 'db_only'
+                    'status': 'db_only',
                    'hash': db['hash']
                })
        return results
    def calculate_lines_changed(self, old_content: str, new_content: str) -> int:
        """
        计算变动行数
        Args:
            old_content: 旧内容
            new_content: 新内容
        Returns:
            变动行数（+新增 -删除）
        """
        old_lines = set(old_content.split('\n'))
        new_lines = set(new_content.split('\n'))
        added = len(new_lines - old_lines)
        removed = len(old_lines - new_lines)
        return added - removed
    def get_file_diff(self, local_content: str, db_content: str, max_lines: int = 1000) -> Dict:
        """
        获取文件差异（支持大文件限制）
@@ -368,26 +525,41 @@ class DiffChecker:
        db_lines = db_content.split('\n')
        # 限制行数（大文件只显示头尾）
        truncated = False
        if len(local_lines) > max_lines:
            local_head = local_lines[:max_lines//2]
            local_tail = local_lines[-max_lines//2:]
-            local_lines = local_head + ['... (中间省略 {}) 行 ...'.format(len(local_lines) - max_lines)] + local_tail
+            local_lines = local_head + [f'... (中间省略 {len(local_lines) - max_lines} 行) ...'] + local_tail
            truncated = True
        if len(db_lines) > max_lines:
            db_head = db_lines[:max_lines//2]
            db_tail = db_lines[-max_lines//2:]
-            db_lines = db_head + ['... (中间省略 {}) 行 ...'.format(len(db_lines) - max_lines)] + db_tail
+            db_lines = db_head + [f'... (中间省略 {len(db_lines) - max_lines} 行) ...'] + db_tail
            truncated = True
        # 计算变动行数
        lines_changed = self.calculate_lines_changed(local_content, db_content)
        return {
            'local_lines': local_lines,
            'db_lines': db_lines,
            'has_diff': local_content != db_content,
-            'is_truncated': len(local_lines) > max_lines or len(db_lines) > max_lines
+            'is_truncated': truncated,
            'lines_changed': lines_changed
        }
 class AuditLogger:
-    """操作日志记录器"""
+    """
    操作日志记录器
    记录所有同步操作，包括：
    - 操作人、操作时间
    - 数据源（local/database/manual）
    - 变动行数
    - 执行时间
    """
    def __init__(self):
        self.model = None
@@ -405,6 +577,8 @@ class AuditLogger:
        old_hash: str = None,
        new_hash: str = None,
        file_size: int = 0,
        lines_changed: int = 0,
        source: str = 'local',
        operator: str = 'system',
        status: str = 'success',
        error_message: str = None,
@@ -422,6 +596,8 @@ class AuditLogger:
            old_hash: 操作前哈希
            new_hash: 操作后哈希
            file_size: 文件大小
            lines_changed: 变动行数
            source: 数据源
            operator: 操作者
            status: 操作状态
            error_message: 错误信息
@@ -436,6 +612,8 @@ class AuditLogger:
            old_hash=old_hash,
            new_hash=new_hash,
            file_size=file_size,
            lines_changed=lines_changed,
            source=source,
            operator=operator,
            status=status,
            error_message=error_message,
@@ -482,11 +660,13 @@ class AuditLogger:
                'file_path': r.file_path,
                'action': r.action,
                'status': r.status,
                'source': r.source,
                'old_version': r.old_version,
                'new_version': r.new_version,
                'old_hash': r.old_hash,
                'new_hash': r.new_hash,
                'file_size': r.file_size,
                'lines_changed': r.lines_changed,
                'operator': r.operator,
                'error_message': r.error_message,
                'execution_time': r.execution_time,