# openclaw-memory/backend/memory_app/services.py
import os
import hashlib
import fnmatch
import time
from pathlib import Path
from typing import List, Dict, Tuple, Iterator
from django.conf import settings
from django.utils import timezone


class IgnorePattern:
    """Pattern matcher for .lobsterignore files."""

    def __init__(self, base_dir: Path):
        self.base_dir = base_dir
        self.patterns = []
        self.load_patterns()

    def load_patterns(self):
        """Load patterns from the .lobsterignore file, then append the defaults."""
        ignore_file = self.base_dir / '.lobsterignore'
        if ignore_file.exists():
            with open(ignore_file, 'r', encoding='utf-8') as f:
                for line in f:
                    line = line.strip()
                    # Skip blank lines and comments
                    if line and not line.startswith('#'):
                        self.patterns.append(line)
        # Append the default ignore rules
        default_patterns = [
            '.DS_Store', '.git', '.gitignore', '__pycache__',
            'node_modules', '*.pyc', '*.pyo', '*.log',
            '*.tmp', '*.temp', '*.bak', '.vscode', '.idea',
        ]
        for pattern in default_patterns:
            if pattern not in self.patterns:
                self.patterns.append(pattern)

    def is_ignored(self, file_path: Path) -> bool:
        """
        Check whether a file is ignored.

        Args:
            file_path: Absolute path of the file.

        Returns:
            True if the file matches an ignore pattern.
        """
        relative_path = file_path.relative_to(self.base_dir)
        for pattern in self.patterns:
            # Match against the bare file name
            if fnmatch.fnmatch(file_path.name, pattern):
                return True
            # Match against the path relative to the base directory
            if fnmatch.fnmatch(str(relative_path), pattern):
                return True
            # Directory patterns ('foo/') match files whose parent directory matches
            if pattern.endswith('/') and fnmatch.fnmatch(str(relative_path.parent), pattern.rstrip('/')):
                return True
            # '*/pattern' matches any individual path segment
            if pattern.startswith('*/'):
                for part in relative_path.parts:
                    if fnmatch.fnmatch(part, pattern[2:]):
                        return True
            # A plain pattern also matches any path segment, so 'node_modules'
            # ignores the whole directory tree (.gitignore-style behaviour)
            if any(fnmatch.fnmatch(part, pattern) for part in relative_path.parts):
                return True
        return False
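
# Usage sketch for IgnorePattern (a minimal example; the base directory is
# hypothetical, and no .lobsterignore file is needed since the defaults are
# always appended):
#
#     base = Path('/data/lobster-memory')
#     ignore = IgnorePattern(base)
#     ignore.is_ignored(base / 'node_modules' / 'pkg' / 'index.js')  # True
#     ignore.is_ignored(base / 'notes' / 'todo.md')                  # False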


class FileScanner:
    """File scanner (honours .lobsterignore; supports chunked reads)."""

    def __init__(self):
        self.base_dir = Path(settings.LOBSTER_MEMORY_BASE)
        self.supported_extensions = settings.SUPPORTED_EXTENSIONS
        self.ignore = IgnorePattern(self.base_dir)
        self.chunk_size = 8192  # read in 8 KiB chunks

    def scan_directory(self, lobster_id: str = None) -> List[Dict]:
        """
        Scan the base directory and return info for every matching file.

        Args:
            lobster_id: Lobster ID (optional).

        Returns:
            A list of file-info dicts.
        """
        if not self.base_dir.exists():
            return []
        files = []
        for file_path in self.base_dir.rglob('*'):
            if not file_path.is_file():
                continue
            # Skip unsupported file extensions
            if file_path.suffix not in self.supported_extensions:
                continue
            # Skip files matched by .lobsterignore
            if self.ignore.is_ignored(file_path):
                continue
            try:
                relative_path = file_path.relative_to(self.base_dir)
                # Hash via streaming to avoid loading large files into memory
                file_hash = self.compute_hash_stream(file_path)
                files.append({
                    'file_path': str(relative_path),
                    'full_path': str(file_path),
                    'hash': file_hash,
                    'size': file_path.stat().st_size,
                    'lobster_id': lobster_id or 'unknown',
                })
            except Exception as e:
                print(f"Error reading {file_path}: {e}")
        return files

    def get_file_content(self, file_path: str, chunked: bool = False) -> Tuple[str, str]:
        """
        Return a file's content and its hash.

        Args:
            file_path: Path relative to the base directory.
            chunked: Whether to read the file in chunks.

        Returns:
            (content, hash)
        """
        full_path = self.base_dir / file_path
        if not full_path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")
        # Use chunked reads for large files (>50 MB)
        file_size = full_path.stat().st_size
        if chunked and file_size > 50 * 1024 * 1024:
            content = self.read_file_chunked(full_path)
        else:
            content = full_path.read_text(encoding='utf-8', errors='ignore')
        file_hash = self.compute_hash(content)
        return content, file_hash

    def read_file_chunked(self, file_path: Path) -> str:
        """
        Read a file in fixed-size chunks. Note that the full content is
        still assembled in memory; for true streaming use read_file_stream.

        Args:
            file_path: Path of the file.

        Returns:
            The file content.
        """
        content_parts = []
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
            while True:
                chunk = f.read(self.chunk_size)
                if not chunk:
                    break
                content_parts.append(chunk)
        return ''.join(content_parts)

    def read_file_stream(self, file_path: str) -> Iterator[str]:
        """
        Stream a file's content (for transferring large files).

        Args:
            file_path: Path relative to the base directory.

        Yields:
            Chunks of the file content.
        """
        full_path = self.base_dir / file_path
        if not full_path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")
        with open(full_path, 'r', encoding='utf-8', errors='ignore') as f:
            while True:
                chunk = f.read(self.chunk_size)
                if not chunk:
                    break
                yield chunk

    def write_file(self, file_path: str, content: str):
        """
        Write content to a file.

        Args:
            file_path: Path relative to the base directory.
            content: File content.
        """
        full_path = self.base_dir / file_path
        # Make sure the parent directory exists
        full_path.parent.mkdir(parents=True, exist_ok=True)
        full_path.write_text(content, encoding='utf-8')

    def compute_hash(self, content: str) -> str:
        """
        Compute the SHA-256 hash of a string.

        Args:
            content: File content.

        Returns:
            The hex digest.
        """
        return hashlib.sha256(content.encode('utf-8')).hexdigest()

    def compute_hash_stream(self, file_path: Path) -> str:
        """
        Compute a file's SHA-256 hash by streaming, avoiding the memory
        cost of loading large files at once.

        Args:
            file_path: Path of the file.

        Returns:
            The hex digest.
        """
        hash_obj = hashlib.sha256()
        with open(file_path, 'rb') as f:
            while True:
                chunk = f.read(self.chunk_size)
                if not chunk:
                    break
                hash_obj.update(chunk)
        return hash_obj.hexdigest()
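
    # Caveat worth noting: scan_directory hashes raw bytes via
    # compute_hash_stream, while get_file_content hashes the decoded text via
    # compute_hash. For valid UTF-8 files the two digests agree, but
    # errors='ignore' drops invalid byte sequences during decoding, so for
    # non-UTF-8 files the text hash can differ from the byte hash and surface
    # downstream as a spurious conflict. A minimal check, assuming a
    # configured scanner and a hypothetical relative path 'notes/todo.md':
    #
    #     scanner = FileScanner()
    #     _, text_hash = scanner.get_file_content('notes/todo.md')
    #     byte_hash = scanner.compute_hash_stream(scanner.base_dir / 'notes/todo.md')
    #     assert text_hash == byte_hash  # holds only for valid UTF-8 content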

    def get_file_tree(self, lobster_id: str = None) -> Dict:
        """
        Build a nested file-tree structure.

        Args:
            lobster_id: Lobster ID.

        Returns:
            A nested dict mapping directory names to subtrees and file
            names to file-info dicts.
        """
        files = self.scan_directory(lobster_id)
        tree = {}
        for file_info in files:
            parts = Path(file_info['file_path']).parts
            current = tree
            for part in parts[:-1]:
                if part not in current:
                    current[part] = {}
                current = current[part]
            filename = parts[-1]
            current[filename] = file_info
        return tree
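
# Usage sketch for FileScanner (assumes Django settings are loaded and
# LOBSTER_MEMORY_BASE points at an existing directory; 'lobster-001' and the
# file path below are hypothetical):
#
#     scanner = FileScanner()
#     scanner.write_file('notes/todo.md', '- feed the lobsters\n')
#     files = scanner.scan_directory(lobster_id='lobster-001')
#     content, digest = scanner.get_file_content('notes/todo.md')
#     tree = scanner.get_file_tree('lobster-001')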


class DiffChecker:
    """Diff checker (with optimisations for large files)."""

    def __init__(self):
        self.scanner = FileScanner()

    def check_sync_status(self, local_files: List[Dict], db_files: List[Dict]) -> Dict:
        """
        Compare local files against database records.

        Args:
            local_files: List of local file-info dicts.
            db_files: List of database file-info dicts.

        Returns:
            A dict of sync-status buckets.
        """
        local_map = {f['file_path']: f for f in local_files}
        db_map = {f['file_path']: f for f in db_files}
        results = {
            'consistent': [],
            'local_newer': [],
            'db_newer': [],
            'conflict': [],
            'local_only': [],
            'db_only': [],
        }
        all_paths = set(local_map.keys()) | set(db_map.keys())
        for path in all_paths:
            local = local_map.get(path)
            db = db_map.get(path)
            if local and db:
                # Present on both sides
                if local['hash'] == db['hash']:
                    results['consistent'].append({
                        'file_path': path,
                        'status': 'consistent'
                    })
                else:
                    # Hashes differ. Without a reliable local timestamp we
                    # cannot tell which side is newer ('local_newer' and
                    # 'db_newer' are kept for API compatibility), so flag a
                    # conflict and let the caller resolve it.
                    results['conflict'].append({
                        'file_path': path,
                        'status': 'conflict',
                        'local_hash': local['hash'],
                        'db_hash': db['hash']
                    })
            elif local and not db:
                # Only exists locally
                results['local_only'].append({
                    'file_path': path,
                    'status': 'local_only'
                })
            else:
                # Only exists in the database
                results['db_only'].append({
                    'file_path': path,
                    'status': 'db_only'
                })
        return results
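
    # Example (a sketch with hypothetical hashes): given one matching file,
    # one mismatched file, and one file present on each side only,
    #
    #     local = [{'file_path': 'a.md', 'hash': 'x1'},
    #              {'file_path': 'b.md', 'hash': 'x2'},
    #              {'file_path': 'c.md', 'hash': 'x3'}]
    #     db    = [{'file_path': 'a.md', 'hash': 'x1'},
    #              {'file_path': 'b.md', 'hash': 'y2'},
    #              {'file_path': 'd.md', 'hash': 'z4'}]
    #     DiffChecker().check_sync_status(local, db)
    #
    # puts a.md in 'consistent', b.md in 'conflict', c.md in 'local_only'
    # and d.md in 'db_only'.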

    def get_file_diff(self, local_content: str, db_content: str, max_lines: int = 1000) -> Dict:
        """
        Build a line-based diff view, truncating large files.

        Args:
            local_content: Local content.
            db_content: Database content.
            max_lines: Maximum number of lines returned per side, to keep
                diffs of large files manageable.

        Returns:
            Diff information.
        """
        local_lines = local_content.split('\n')
        db_lines = db_content.split('\n')
        # Record truncation before the lists are rewritten below
        is_truncated = len(local_lines) > max_lines or len(db_lines) > max_lines
        # For large files keep only the head and tail
        if len(local_lines) > max_lines:
            omitted = len(local_lines) - max_lines
            local_lines = (local_lines[:max_lines // 2]
                           + ['... ({} lines omitted) ...'.format(omitted)]
                           + local_lines[-(max_lines // 2):])
        if len(db_lines) > max_lines:
            omitted = len(db_lines) - max_lines
            db_lines = (db_lines[:max_lines // 2]
                        + ['... ({} lines omitted) ...'.format(omitted)]
                        + db_lines[-(max_lines // 2):])
        return {
            'local_lines': local_lines,
            'db_lines': db_lines,
            'has_diff': local_content != db_content,
            'is_truncated': is_truncated,
        }
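
# Usage sketch for get_file_diff (hypothetical contents; with the default
# max_lines=1000 nothing is truncated here):
#
#     diff = DiffChecker().get_file_diff('a\nb\nc', 'a\nB\nc')
#     diff['has_diff']      # True
#     diff['is_truncated']  # False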


class AuditLogger:
    """Audit logger for sync operations."""

    def __init__(self):
        # Import the model lazily to avoid a circular import
        from .models import SyncHistory
        self.model = SyncHistory

    def log_sync_action(
        self,
        lobster_id: str,
        file_path: str,
        action: str,
        old_version: int = None,
        new_version: int = None,
        old_hash: str = None,
        new_hash: str = None,
        file_size: int = 0,
        operator: str = 'system',
        status: str = 'success',
        error_message: str = None,
        execution_time: float = 0
    ):
        """
        Record a sync operation.

        Args:
            lobster_id: Lobster ID.
            file_path: File path.
            action: Action type.
            old_version: Version before the operation.
            new_version: Version after the operation.
            old_hash: Hash before the operation.
            new_hash: Hash after the operation.
            file_size: File size in bytes.
            operator: Who performed the operation.
            status: Operation status.
            error_message: Error message, if any.
            execution_time: Execution time in seconds.
        """
        self.model.objects.create(
            lobster_id=lobster_id,
            file_path=file_path,
            action=action,
            old_version=old_version,
            new_version=new_version,
            old_hash=old_hash,
            new_hash=new_hash,
            file_size=file_size,
            operator=operator,
            status=status,
            error_message=error_message,
            execution_time=execution_time,
            created_at=timezone.now()
        )

    def get_history(
        self,
        lobster_id: str = None,
        file_path: str = None,
        action: str = None,
        limit: int = 100
    ) -> List[Dict]:
        """
        Fetch the operation history.

        Args:
            lobster_id: Lobster ID (optional).
            file_path: File path (optional).
            action: Action type (optional).
            limit: Maximum number of records to return.

        Returns:
            A list of history records.
        """
        queryset = self.model.objects.all()
        if lobster_id:
            queryset = queryset.filter(lobster_id=lobster_id)
        if file_path:
            queryset = queryset.filter(file_path=file_path)
        if action:
            queryset = queryset.filter(action=action)
        records = queryset.order_by('-created_at')[:limit]
        return [
            {
                'id': r.id,
                'lobster_id': r.lobster_id,
                'file_path': r.file_path,
                'action': r.action,
                'status': r.status,
                'old_version': r.old_version,
                'new_version': r.new_version,
                'old_hash': r.old_hash,
                'new_hash': r.new_hash,
                'file_size': r.file_size,
                'operator': r.operator,
                'error_message': r.error_message,
                'execution_time': r.execution_time,
                'created_at': r.created_at.isoformat(),
            }
            for r in records
        ]
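
# Usage sketch for AuditLogger (requires a configured Django project with
# migrations applied; the IDs, the 'upload' action and the field values are
# hypothetical, since SyncHistory's valid choices are defined in .models):
#
#     audit = AuditLogger()
#     audit.log_sync_action(
#         lobster_id='lobster-001',
#         file_path='notes/todo.md',
#         action='upload',
#         new_version=2,
#         new_hash='9f86d08...',
#         operator='cli',
#     )
#     recent = audit.get_history(lobster_id='lobster-001', limit=10)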