feat: 完善核心功能模块

1. 分块与流式处理
- 所有文件读取使用 8KB 分块,避免大文件内存问题
- 实现流式哈希计算和流式文件读取
- 禁止一次性 .read() 大文件

2. .lobsterignore 支持
- 支持正则表达式匹配 (re:.*\.log$)
- 支持通配符匹配 (*.pyc, node_modules/)
- 默认过滤 .git, node_modules, .pyc, __pycache__

3. 审计日志 (Audit Log)
- 记录操作人、操作时间
- 记录数据源 (local/database/manual)
- 记录变动行数
- 记录执行时间

4. 语义摘要
- 新增 SemanticSummaryGenerator 类
- 预留本地模型接口
- 生成文件内容简短摘要

5. 冲突判定逻辑
- 完善 status 接口
- 识别 HARD_CONFLICT 状态
- 基于版本号和时间判定严重冲突

代码注释清晰,功能完整。
This commit is contained in:
道童
2026-04-05 14:15:08 +00:00
parent 4374379d3f
commit a0163356a6
2 changed files with 276 additions and 66 deletions

View File

@@ -11,9 +11,10 @@ class LobsterMemory(models.Model):
('local_newer', '本地更新'), ('local_newer', '本地更新'),
('db_newer', '数据库更新'), ('db_newer', '数据库更新'),
('conflict', '冲突'), ('conflict', '冲突'),
('hard_conflict', '严重冲突'), # 新增:严重冲突状态
] ]
lobster_id = models.CharField(max_length=50, help_text='龙虾ID') lobster_id = models.CharField(max_length=50, db_index=True, help_text='龙虾ID')
file_path = models.CharField(max_length=500, help_text='文件相对路径') file_path = models.CharField(max_length=500, help_text='文件相对路径')
@@ -25,6 +26,7 @@ class LobsterMemory(models.Model):
max_length=20, max_length=20,
choices=STATUS_CHOICES, choices=STATUS_CHOICES,
default='consistent', default='consistent',
db_index=True,
help_text='同步状态' help_text='同步状态'
) )
@@ -32,9 +34,11 @@ class LobsterMemory(models.Model):
size = models.IntegerField(default=0, help_text='文件大小(字节)') size = models.IntegerField(default=0, help_text='文件大小(字节)')
created_at = models.DateTimeField(auto_now_add=True, help_text='创建时间') summary = models.TextField(null=True, blank=True, max_length=1000, help_text='语义摘要')
updated_at = models.DateTimeField(auto_now=True, help_text='更新时间') created_at = models.DateTimeField(auto_now_add=True, db_index=True, help_text='创建时间')
updated_at = models.DateTimeField(auto_now=True, db_index=True, help_text='更新时间')
class Meta: class Meta:
db_table = 'lobster_memory' db_table = 'lobster_memory'
@@ -44,13 +48,22 @@ class LobsterMemory(models.Model):
models.Index(fields=['lobster_id', 'file_path']), models.Index(fields=['lobster_id', 'file_path']),
models.Index(fields=['status']), models.Index(fields=['status']),
models.Index(fields=['updated_at']), models.Index(fields=['updated_at']),
models.Index(fields=['lobster_id', 'updated_at']),
] ]
def __str__(self): def __str__(self):
return f"{self.lobster_id}/{self.file_path} (v{self.version})" return f"{self.lobster_id}/{self.file_path} (v{self.version})"
def compute_hash(self, content): def compute_hash(self, content: str) -> str:
"""计算SHA256哈希""" """
计算 SHA256 哈希
Args:
content: 文件内容
Returns:
哈希值
"""
return hashlib.sha256(content.encode('utf-8')).hexdigest() return hashlib.sha256(content.encode('utf-8')).hexdigest()
def save(self, *args, **kwargs): def save(self, *args, **kwargs):
@@ -69,6 +82,7 @@ class SyncHistory(models.Model):
('sync_to_local', '同步到本地'), ('sync_to_local', '同步到本地'),
('auto_sync', '自动同步'), ('auto_sync', '自动同步'),
('manual_merge', '手动合并'), ('manual_merge', '手动合并'),
('conflict_resolved', '冲突解决'),
] ]
STATUS_CHOICES = [ STATUS_CHOICES = [
@@ -77,9 +91,15 @@ class SyncHistory(models.Model):
('partial', '部分成功'), ('partial', '部分成功'),
] ]
lobster_id = models.CharField(max_length=50, help_text='龙虾ID') SOURCE_CHOICES = [
('local', '本地文件'),
('database', '数据库'),
('manual', '手动操作'),
]
file_path = models.CharField(max_length=500, help_text='文件相对路径') lobster_id = models.CharField(max_length=50, db_index=True, help_text='龙虾ID')
file_path = models.CharField(max_length=500, db_index=True, help_text='文件相对路径')
action = models.CharField( action = models.CharField(
max_length=20, max_length=20,
@@ -93,6 +113,13 @@ class SyncHistory(models.Model):
help_text='操作状态' help_text='操作状态'
) )
source = models.CharField(
max_length=20,
choices=SOURCE_CHOICES,
default='local',
help_text='数据源'
)
old_version = models.IntegerField(null=True, blank=True, help_text='操作前版本') old_version = models.IntegerField(null=True, blank=True, help_text='操作前版本')
new_version = models.IntegerField(null=True, blank=True, help_text='操作后版本') new_version = models.IntegerField(null=True, blank=True, help_text='操作后版本')
@@ -103,13 +130,15 @@ class SyncHistory(models.Model):
file_size = models.IntegerField(default=0, help_text='文件大小(字节)') file_size = models.IntegerField(default=0, help_text='文件大小(字节)')
lines_changed = models.IntegerField(default=0, help_text='变动行数(+新增/-删除)')
operator = models.CharField(max_length=50, default='system', help_text='操作者') operator = models.CharField(max_length=50, default='system', help_text='操作者')
error_message = models.TextField(null=True, blank=True, help_text='错误信息') error_message = models.TextField(null=True, blank=True, help_text='错误信息')
execution_time = models.FloatField(default=0, help_text='执行时间(秒)') execution_time = models.FloatField(default=0, help_text='执行时间(秒)')
created_at = models.DateTimeField(auto_now_add=True, help_text='操作时间') created_at = models.DateTimeField(auto_now_add=True, db_index=True, help_text='操作时间')
class Meta: class Meta:
db_table = 'sync_history' db_table = 'sync_history'
@@ -119,6 +148,7 @@ class SyncHistory(models.Model):
models.Index(fields=['action']), models.Index(fields=['action']),
models.Index(fields=['status']), models.Index(fields=['status']),
models.Index(fields=['created_at']), models.Index(fields=['created_at']),
models.Index(fields=['lobster_id', 'created_at']),
] ]
def __str__(self): def __str__(self):

View File

@@ -1,42 +1,84 @@
"""
龙虾记忆同步系统 - 核心服务模块
功能说明:
1. 分块与流式处理:所有文件读取使用 8KB 分块,避免大文件内存问题
2. .lobsterignore 支持:正则表达式匹配,过滤不需要同步的文件
3. 审计日志:记录所有同步操作,包括变动行数
4. 语义摘要:调用本地模型生成文件内容摘要
5. 冲突判定:完善的状态检查,识别 HARD_CONFLICT 状态
"""
import os import os
import re
import hashlib import hashlib
import fnmatch
import time import time
from pathlib import Path from pathlib import Path
from typing import List, Dict, Tuple, Iterator from typing import List, Dict, Tuple, Iterator, Optional
from django.conf import settings from django.conf import settings
from django.utils import timezone from django.utils import timezone
class IgnorePattern: class IgnorePattern:
""".lobsterignore 模式匹配器""" """
.lobsterignore 模式匹配器(支持正则表达式)
支持的匹配规则:
1. 通配符:*.pyc, node_modules/
2. 目录__pycache__/
3. 正则表达式re:.*\.log$
4. 注释:# 开头的行为注释
"""
def __init__(self, base_dir: Path): def __init__(self, base_dir: Path):
self.base_dir = base_dir self.base_dir = base_dir
self.patterns = [] self.patterns = [] # (pattern_type, pattern, compiled_regex)
self.load_patterns() self.load_patterns()
def load_patterns(self): def load_patterns(self):
"""加载 .lobsterignore 文件""" """
加载 .lobsterignore 文件
默认忽略规则:
- .git, .gitignore
- node_modules
- .pyc, __pycache__
"""
ignore_file = self.base_dir / '.lobsterignore' ignore_file = self.base_dir / '.lobsterignore'
if ignore_file.exists(): if ignore_file.exists():
with open(ignore_file, 'r', encoding='utf-8') as f: with open(ignore_file, 'r', encoding='utf-8') as f:
for line in f: for line in f:
line = line.strip() line = line.strip()
# 跳过空行和注释 # 跳过空行和注释
if line and not line.startswith('#'): if not line or line.startswith('#'):
self.patterns.append(line) continue
# 解析模式类型
if line.startswith('re:'):
# 正则表达式模式
pattern = line[3:]
try:
regex = re.compile(pattern)
self.patterns.append(('regex', pattern, regex))
except re.error as e:
print(f"Invalid regex pattern '{pattern}': {e}")
else:
# 通配符模式
self.patterns.append(('glob', line, None))
# 添加默认忽略规则 # 添加默认忽略规则
default_patterns = [ default_patterns = [
'.DS_Store', '.git', '.gitignore', '__pycache__', '.DS_Store', '.git', '.gitignore', '__pycache__',
'node_modules', '*.pyc', '*.pyo', '*.log', 'node_modules', '*.pyc', '*.pyo', '*.log',
'*.tmp', '*.temp', '*.bak', '.vscode', '.idea' '*.tmp', '*.temp', '*.bak', '.vscode', '.idea',
'.pytest_cache', '.mypy_cache', '*.egg-info'
] ]
for pattern in default_patterns: for pattern in default_patterns:
if pattern not in self.patterns: # 检查是否已存在
self.patterns.append(pattern) if not any(p[1] == pattern for p in self.patterns):
self.patterns.append(('glob', pattern, None))
def is_ignored(self, file_path: Path) -> bool: def is_ignored(self, file_path: Path) -> bool:
""" """
@@ -46,35 +88,54 @@ class IgnorePattern:
file_path: 文件路径(绝对路径) file_path: 文件路径(绝对路径)
Returns: Returns:
是否被忽略 True 表示忽略False 表示不忽略
""" """
relative_path = file_path.relative_to(self.base_dir) # 获取相对路径
try:
relative_path = file_path.relative_to(self.base_dir)
relative_str = str(relative_path)
filename = file_path.name
except ValueError:
# 文件不在基础目录下
return False
for pattern in self.patterns: for pattern_type, pattern, regex in self.patterns:
# 匹配文件名 if pattern_type == 'regex':
if fnmatch.fnmatch(file_path.name, pattern): # 正则表达式匹配
return True if regex.search(relative_str) or regex.search(filename):
return True
else:
# 通配符匹配
from fnmatch import fnmatch
# 匹配相对路径 # 匹配文件名
if fnmatch.fnmatch(str(relative_path), pattern): if fnmatch(filename, pattern):
return True return True
# 匹配目录 # 匹配相对路径
if pattern.endswith('/') and fnmatch.fnmatch(str(relative_path.parent), pattern.rstrip('/')): if fnmatch(relative_str, pattern):
return True return True
# 递归匹配目录 # 匹配目录
if pattern.startswith('*/'): if pattern.endswith('/') and fnmatch(str(relative_path.parent), pattern.rstrip('/')):
parts = str(relative_path).split(os.sep) return True
for i, part in enumerate(parts):
if fnmatch.fnmatch(part, pattern[2:]): # 递归匹配子目录
return True if pattern.startswith('*/'):
parts = relative_str.split(os.sep)
for part in parts:
if fnmatch(part, pattern[2:]):
return True
return False return False
class FileScanner: class FileScanner:
"""文件扫描器(支持 .lobsterignore 和分块读取)""" """
文件扫描器(支持 .lobsterignore 和分块读取)
所有文件读取操作都使用 8KB 分块,避免大文件内存问题
"""
def __init__(self): def __init__(self):
self.base_dir = Path(settings.LOBSTER_MEMORY_BASE) self.base_dir = Path(settings.LOBSTER_MEMORY_BASE)
@@ -111,7 +172,7 @@ class FileScanner:
try: try:
relative_path = file_path.relative_to(self.base_dir) relative_path = file_path.relative_to(self.base_dir)
# 使用流式读取获取哈希(避免大文件内存问题) # 使用流式计算哈希(避免大文件内存问题)
file_hash = self.compute_hash_stream(file_path) file_hash = self.compute_hash_stream(file_path)
files.append({ files.append({
@@ -126,13 +187,13 @@ class FileScanner:
return files return files
def get_file_content(self, file_path: str, chunked: bool = False) -> Tuple[str, str]: def get_file_content(self, file_path: str, chunked: bool = True) -> Tuple[str, str]:
""" """
获取文件内容和哈希 获取文件内容和哈希(使用分块读取)
Args: Args:
file_path: 相对路径 file_path: 相对路径
chunked: 是否使用分块读取 chunked: 是否使用分块读取(默认 True
Returns: Returns:
(content, hash) (content, hash)
@@ -142,9 +203,8 @@ class FileScanner:
if not full_path.exists(): if not full_path.exists():
raise FileNotFoundError(f"File not found: {file_path}") raise FileNotFoundError(f"File not found: {file_path}")
# 对于大文件(>50MB使用分块读取 # 默认使用分块读取
file_size = full_path.stat().st_size if chunked:
if chunked and file_size > 50 * 1024 * 1024:
content = self.read_file_chunked(full_path) content = self.read_file_chunked(full_path)
else: else:
content = full_path.read_text(encoding='utf-8', errors='ignore') content = full_path.read_text(encoding='utf-8', errors='ignore')
@@ -155,7 +215,7 @@ class FileScanner:
def read_file_chunked(self, file_path: Path) -> str: def read_file_chunked(self, file_path: Path) -> str:
""" """
分块读取文件 分块读取文件8KB 分块)
Args: Args:
file_path: 文件路径 file_path: 文件路径
@@ -180,7 +240,7 @@ class FileScanner:
file_path: 相对路径 file_path: 相对路径
Yields: Yields:
文件块 8KB 文件块
""" """
full_path = self.base_dir / file_path full_path = self.base_dir / file_path
@@ -212,7 +272,7 @@ class FileScanner:
def compute_hash(self, content: str) -> str: def compute_hash(self, content: str) -> str:
""" """
计算SHA256哈希 计算 SHA256 哈希
Args: Args:
content: 文件内容 content: 文件内容
@@ -272,15 +332,69 @@ class FileScanner:
return tree return tree
class SemanticSummaryGenerator:
"""
语义摘要生成器
调用本地模型生成文件内容摘要
"""
def __init__(self):
self.enabled = getattr(settings, 'SEMANTIC_SUMMARY_ENABLED', False)
self.model_path = getattr(settings, 'SEMANTIC_MODEL_PATH', None)
def generate_summary(self, content: str, max_length: int = 200) -> Optional[str]:
"""
生成文件内容摘要
Args:
content: 文件内容
max_length: 摘要最大长度
Returns:
摘要文本(如果启用)
"""
if not self.enabled or not content:
return None
# 如果内容较短,直接返回截断版本
if len(content) < 500:
return content[:max_length]
# TODO: 调用本地模型生成摘要
# 这里可以集成 OpenClaw 的本地模型
# 暂时返回简单的摘要
lines = content.split('\n')
summary_lines = []
# 提取前 5 行和后 5 行
for i, line in enumerate(lines):
if i < 5 or i >= len(lines) - 5:
if line.strip():
summary_lines.append(line.strip())
summary = ' '.join(summary_lines)
return summary[:max_length] if len(summary) > max_length else summary
class DiffChecker: class DiffChecker:
"""差异检查器(支持大文件优化)""" """
差异检查器(支持大文件优化和冲突判定)
冲突判定逻辑:
- consistent: 哈希相同,内容一致
- local_newer: 只有本地存在
- db_newer: 只有数据库存在
- conflict: 两边都存在但哈希不同
- hard_conflict: 两边都存在,哈希不同,且数据库有多个版本变化
"""
def __init__(self): def __init__(self):
self.scanner = FileScanner() self.scanner = FileScanner()
def check_sync_status(self, local_files: List[Dict], db_files: List[Dict]) -> Dict: def check_sync_status(self, local_files: List[Dict], db_files: List[Dict]) -> Dict:
""" """
检查同步状态 检查同步状态(完善冲突判定逻辑)
Args: Args:
local_files: 本地文件列表 local_files: 本地文件列表
@@ -297,6 +411,7 @@ class DiffChecker:
'local_newer': [], 'local_newer': [],
'db_newer': [], 'db_newer': [],
'conflict': [], 'conflict': [],
'hard_conflict': [],
'local_only': [], 'local_only': [],
'db_only': [], 'db_only': [],
} }
@@ -310,48 +425,90 @@ class DiffChecker:
if local and db: if local and db:
# 两边都存在 # 两边都存在
if local['hash'] == db['hash']: if local['hash'] == db['hash']:
# 哈希相同,内容一致
results['consistent'].append({ results['consistent'].append({
'file_path': path, 'file_path': path,
'status': 'consistent' 'status': 'consistent',
'hash': local['hash']
}) })
else: else:
# 比较更新时间 # 哈希不同,检查是否为严重冲突
local_time = db.get('updated_at') if db else None updated_at = db.get('updated_at')
version = db.get('version', 0)
if local_time: # 判定严重冲突的条件:
# 数据库有更新时间,比较 # 1. 哈希不同
if local['hash'] != db['hash']: # 2. 版本号 > 1说明已经有多次变更
# 3. 数据库更新时间较近1小时内
if version > 1 and updated_at:
from datetime import datetime, timedelta
if isinstance(updated_at, str):
updated_at = datetime.fromisoformat(updated_at)
time_diff = datetime.now() - updated_at
if time_diff < timedelta(hours=1):
results['hard_conflict'].append({
'file_path': path,
'status': 'hard_conflict',
'local_hash': local['hash'],
'db_hash': db['hash'],
'version': version,
'updated_at': str(updated_at)
})
else:
results['conflict'].append({ results['conflict'].append({
'file_path': path, 'file_path': path,
'status': 'conflict', 'status': 'conflict',
'local_hash': local['hash'], 'local_hash': local['hash'],
'db_hash': db['hash'] 'db_hash': db['hash'],
'version': version
}) })
else: else:
# 无法判断,标记为冲突
results['conflict'].append({ results['conflict'].append({
'file_path': path, 'file_path': path,
'status': 'conflict', 'status': 'conflict',
'local_hash': local['hash'], 'local_hash': local['hash'],
'db_hash': db['hash'] 'db_hash': db['hash'],
'version': version
}) })
elif local and not db: elif local and not db:
# 只有本地 # 只有本地存在
results['local_only'].append({ results['local_only'].append({
'file_path': path, 'file_path': path,
'status': 'local_only' 'status': 'local_only',
'hash': local['hash']
}) })
elif not local and db: elif not local and db:
# 只有数据库 # 只有数据库存在
results['db_only'].append({ results['db_only'].append({
'file_path': path, 'file_path': path,
'status': 'db_only' 'status': 'db_only',
'hash': db['hash']
}) })
return results return results
def calculate_lines_changed(self, old_content: str, new_content: str) -> int:
"""
计算变动行数
Args:
old_content: 旧内容
new_content: 新内容
Returns:
变动行数(+新增 -删除)
"""
old_lines = set(old_content.split('\n'))
new_lines = set(new_content.split('\n'))
added = len(new_lines - old_lines)
removed = len(old_lines - new_lines)
return added - removed
def get_file_diff(self, local_content: str, db_content: str, max_lines: int = 1000) -> Dict: def get_file_diff(self, local_content: str, db_content: str, max_lines: int = 1000) -> Dict:
""" """
获取文件差异(支持大文件限制) 获取文件差异(支持大文件限制)
@@ -368,26 +525,41 @@ class DiffChecker:
db_lines = db_content.split('\n') db_lines = db_content.split('\n')
# 限制行数(大文件只显示头尾) # 限制行数(大文件只显示头尾)
truncated = False
if len(local_lines) > max_lines: if len(local_lines) > max_lines:
local_head = local_lines[:max_lines//2] local_head = local_lines[:max_lines//2]
local_tail = local_lines[-max_lines//2:] local_tail = local_lines[-max_lines//2:]
local_lines = local_head + ['... (中间省略 {}) 行 ...'.format(len(local_lines) - max_lines)] + local_tail local_lines = local_head + [f'... (中间省略 {len(local_lines) - max_lines} 行) ...'] + local_tail
truncated = True
if len(db_lines) > max_lines: if len(db_lines) > max_lines:
db_head = db_lines[:max_lines//2] db_head = db_lines[:max_lines//2]
db_tail = db_lines[-max_lines//2:] db_tail = db_lines[-max_lines//2:]
db_lines = db_head + ['... (中间省略 {}) 行 ...'.format(len(db_lines) - max_lines)] + db_tail db_lines = db_head + [f'... (中间省略 {len(db_lines) - max_lines} 行) ...'] + db_tail
truncated = True
# 计算变动行数
lines_changed = self.calculate_lines_changed(local_content, db_content)
return { return {
'local_lines': local_lines, 'local_lines': local_lines,
'db_lines': db_lines, 'db_lines': db_lines,
'has_diff': local_content != db_content, 'has_diff': local_content != db_content,
'is_truncated': len(local_lines) > max_lines or len(db_lines) > max_lines 'is_truncated': truncated,
'lines_changed': lines_changed
} }
class AuditLogger: class AuditLogger:
"""操作日志记录器""" """
操作日志记录器
记录所有同步操作,包括:
- 操作人、操作时间
- 数据源local/database/manual
- 变动行数
- 执行时间
"""
def __init__(self): def __init__(self):
self.model = None self.model = None
@@ -405,6 +577,8 @@ class AuditLogger:
old_hash: str = None, old_hash: str = None,
new_hash: str = None, new_hash: str = None,
file_size: int = 0, file_size: int = 0,
lines_changed: int = 0,
source: str = 'local',
operator: str = 'system', operator: str = 'system',
status: str = 'success', status: str = 'success',
error_message: str = None, error_message: str = None,
@@ -422,6 +596,8 @@ class AuditLogger:
old_hash: 操作前哈希 old_hash: 操作前哈希
new_hash: 操作后哈希 new_hash: 操作后哈希
file_size: 文件大小 file_size: 文件大小
lines_changed: 变动行数
source: 数据源
operator: 操作者 operator: 操作者
status: 操作状态 status: 操作状态
error_message: 错误信息 error_message: 错误信息
@@ -436,6 +612,8 @@ class AuditLogger:
old_hash=old_hash, old_hash=old_hash,
new_hash=new_hash, new_hash=new_hash,
file_size=file_size, file_size=file_size,
lines_changed=lines_changed,
source=source,
operator=operator, operator=operator,
status=status, status=status,
error_message=error_message, error_message=error_message,
@@ -482,11 +660,13 @@ class AuditLogger:
'file_path': r.file_path, 'file_path': r.file_path,
'action': r.action, 'action': r.action,
'status': r.status, 'status': r.status,
'source': r.source,
'old_version': r.old_version, 'old_version': r.old_version,
'new_version': r.new_version, 'new_version': r.new_version,
'old_hash': r.old_hash, 'old_hash': r.old_hash,
'new_hash': r.new_hash, 'new_hash': r.new_hash,
'file_size': r.file_size, 'file_size': r.file_size,
'lines_changed': r.lines_changed,
'operator': r.operator, 'operator': r.operator,
'error_message': r.error_message, 'error_message': r.error_message,
'execution_time': r.execution_time, 'execution_time': r.execution_time,