@Component
@RequiredArgsConstructor
public class FileHandlerFactory implements InitializingBean {
    
    private final List<FileHandler> handlers;
    private final DefaultFileHandler defaultFileHandler;
    private final Map<String, FileHandler> handlerMap = new ConcurrentHashMap<>();
    
    @Override
    public void afterPropertiesSet() {
        initHandlers();
    }
    
    private void initHandlers() {
        for (FileHandler handler : handlers) {
            if (handler instanceof DefaultFileHandler) {
                continue;
            }
            
            String[] supportedTypes = handler.getSupportedFileTypes();
            if (supportedTypes != null) {
                for (String type : supportedTypes) {
                    if (StringUtil.isNotBlank(type)) {
                        String fileType = type.toLowerCase();
                        handlerMap.put(fileType, handler);
                    }
                }
            }
        }
    }
    
    public FileHandler getHandler(String fileType) {
        if (StringUtil.isBlank(fileType)) {
            return defaultFileHandler;
        }
        
        String type = fileType.toLowerCase();
        FileHandler handler = handlerMap.get(type);
        
        if (handler == null) {
            return defaultFileHandler;
        }
        
        return handler;
    }
}

FileHandler接口设计：

public interface FileHandler {
    
    /**
     * 获取支持的文件类型
     */
    String[] getSupportedFileTypes();
    
    /**
     * 读取文件内容
     */
    String readContent(InputStream inputStream, FileProgressTrack progressTrack);
    
    /**
     * 预览文件内容
     */
    default String previewContent(InputStream inputStream, int maxLength) {
        return readContent(inputStream, null);
    }
    
    /**
     * 验证文件格式
     */
    default boolean validateFormat(InputStream inputStream) {
        return true;
    }
}

2.2 分段处理引擎

分段策略

分段处理引擎采用策略模式设计，支持多种分段策略的动态切换和扩展。

FileSegmentFactory策略工厂：

@Component
@RequiredArgsConstructor
public class FileSegmentFactory implements InitializingBean {
    
    private final List<FileSegment> segments;
    private final Map<SegmentType, FileSegment> segmentMap = new ConcurrentHashMap<>();
    private static final SegmentType DEFAULT_SEGMENT_TYPE = SegmentType.SEMANTIC;
    
    @Override
    public void afterPropertiesSet() {
        initSegment();
    }
    
    private void initSegment() {
        for (FileSegment segment : segments) {
            segmentMap.put(segment.getType(), segment);
        }
    }
    
    public FileSegment getSegment(SegmentType type) {
        FileSegment segment = segmentMap.get(type);
        
        if (segment == null) {
            segment = segmentMap.get(DEFAULT_SEGMENT_TYPE);
            
            if (segment == null && !segments.isEmpty()) {
                segment = segments.get(0);
            }
        }
        
        return segment;
    }
}

FileSegment策略接口：

public interface FileSegment {
    
    /**
     * 获取分段器类型
     */
    SegmentType getType();
    
    /**
     * 将内容分段
     */
    List<String> segment(SegmentRequest request);
}

2.3 异步处理机制

异步设计

异步处理机制采用CompletableFuture和线程池技术，实现大文件的后台处理和进度跟踪。

异步处理服务实现：

@Service
@RequiredArgsConstructor
public class AiKnowledgeAssetsSegmentAsyncServiceImpl 
    implements IAiKnowledgeAssetsSegmentAsyncService {
    
    private final KnowledgeFileService knowledgeFileService;
    private final KnowledgeSegmentService knowledgeSegmentService;
    private final SegmentProperties segmentProperties;
    
    @Override
    public CompletableFuture<Long> processSegmentsAsync(
        Long assetsId, String segmentType, int segmentLength, String segmentSymbol) {
        
        // 创建任务记录
        Long taskId = taskService.createTask(knowledgeId, assetsId, segmentType);
        
        // 返回任务ID的Future
        CompletableFuture<Long> idFuture = CompletableFuture.completedFuture(taskId);
        
        // 异步执行处理逻辑
        CompletableFuture.runAsync(() -> {
            try {
                processSegmentsInternal(assetsId, segmentType, segmentLength, 
                    segmentSymbol, taskId);
            } catch (Exception e) {
                taskService.failTask(taskId, "处理异常: " + e.getMessage());
            }
        }, ForkJoinPool.commonPool());
        
        return idFuture;
    }
    
    private void processSegmentsInternal(Long assetsId, String segmentType, 
        int segmentLength, String segmentSymbol, Long taskId) {
        
        // 创建进度跟踪
        TaskProgressTrack progressTrack = new TaskProgressTrack(taskId, taskService);
        
        // 读取文件内容
        String fileContent = knowledgeFileService.readFileContent(
            assets.getAssetsUrl(), assets.getAssetsType(), 
            progressTrack, PROGRESS_UPDATE_THRESHOLD);
        
        // 分段处理
        List<String> contentList = knowledgeSegmentService.segmentContent(
            SegmentRequest.builder()
                .segmentType(segmentTypeEnum)
                .content(fileContent)
                .segmentLength(segmentLength)
                .segmentSymbol(segmentSymbol)
                .build());
        
        // 批量保存分段
        int totalSize = contentList.size();
        int processedCount = 0;
        
        for (int i = 0; i < totalSize; i += PROGRESS_BATCH_SIZE) {
            int endIndex = Math.min(i + PROGRESS_BATCH_SIZE, totalSize);
            List<String> batchList = new ArrayList<>(contentList.subList(i, endIndex));
            
            segmentService.batchSaveSegments(knowledgeId, assetsId, batchList);
            
            processedCount += batchList.size();
            taskService.updateTaskProgress(taskId, processedCount, totalSize);
        }
        
        // 完成任务
        taskService.completeTask(taskId);
        
        // 启动向量化
        vectorizationService.processAssetsVectorsAsync(assetsId, taskId);
    }
}

2.4 进度跟踪系统

进度跟踪

进度跟踪系统采用观察者模式，实现处理过程的实时监控和状态更新。

进度跟踪接口设计：

public interface FileProgressTrack {
    
    /**
     * 进度更新回调
     */
    void onProgress(long bytesRead, long totalBytes, int linesRead);
    
    /**
     * 处理完成回调
     */
    void onComplete(long totalBytesRead, int totalLinesRead);
}

任务进度跟踪实现：

@Slf4j
public class TaskProgressTrack implements FileProgressTrack {
    
    private final Long taskId;
    private final IAiKnowledgeAssetsSegmentTaskService taskService;
    
    public TaskProgressTrack(Long taskId, 
        IAiKnowledgeAssetsSegmentTaskService taskService) {
        this.taskId = taskId;
        this.taskService = taskService;
    }
    
    @Override
    public void onProgress(long bytesRead, long totalBytes, int linesRead) {
        try {
            taskService.updateFileReadProgress(taskId, linesRead);
        } catch (Exception e) {
            log.error("更新文件读取进度失败", e);
        }
    }
    
    @Override
    public void onComplete(long totalBytesRead, int totalLinesRead) {
        try {
            taskService.completeFileRead(taskId, totalLinesRead);
        } catch (Exception e) {
            log.error("更新文件读取完成状态失败", e);
        }
    }
}

三、处理流程详解

3.1 文档处理流程

处理流程

文档处理流程包括文件上传、格式识别、内容提取、进度跟踪等多个环节。

文档处理时序图：

3.2 分段处理流程

分段流程

分段处理流程根据不同的分段策略，将文档内容分割成适合向量化的文本片段。

分段处理时序图：

3.3 向量化处理流程

向量化流程

向量化处理流程将分段后的文本转换为向量表示，并存储到向量数据库中。

向量化处理时序图：

四、设计模式应用

4.1 工厂模式

工厂模式

工厂模式用于创建和管理文档处理器和分段器，支持动态扩展和配置。

工厂模式优势：

解耦创建逻辑：将对象创建与使用分离
支持扩展：新增处理器无需修改现有代码
统一管理：集中管理所有处理器实例
配置灵活：支持运行时动态配置

4.2 策略模式

策略模式

策略模式用于实现不同的分段策略，支持算法的动态切换和扩展。

策略模式优势：

算法独立：每种分段策略独立实现
易于扩展：新增策略不影响现有代码
运行时切换：支持动态选择分段策略
代码复用：通用逻辑可以复用

4.3 观察者模式

观察者模式

观察者模式用于实现进度跟踪，支持处理过程的实时监控。

观察者模式优势：

松耦合：观察者与被观察者松耦合
动态关系：支持运行时添加/移除观察者
广播通信：一对多的通信机制
扩展性强：易于添加新的观察者

4.4 模板方法模式

模板方法

模板方法模式用于定义处理流程的骨架，子类实现具体的处理逻辑。

模板方法优势：

流程统一：统一的处理流程框架
代码复用：公共逻辑在父类实现
扩展灵活：子类只需实现特定逻辑
维护简单：流程变更只需修改模板

五、性能优化设计

5.1 异步处理优化

异步优化

异步处理优化通过线程池、批量处理等技术，提高系统的并发处理能力。

异步处理优化策略：

// 线程池配置
@Configuration
public class AsyncConfig {
    
    @Bean("knowledgeTaskExecutor")
    public TaskExecutor knowledgeTaskExecutor() {
        ThreadPoolTaskExecutor executor = new ThreadPoolTaskExecutor();
        executor.setCorePoolSize(4);
        executor.setMaxPoolSize(8);
        executor.setQueueCapacity(100);
        executor.setThreadNamePrefix("knowledge-");
        executor.setRejectedExecutionHandler(new ThreadPoolExecutor.CallerRunsPolicy());
        executor.initialize();
        return executor;
    }
}

// 批量处理优化
public class BatchProcessor {
    
    private static final int BATCH_SIZE = 500;
    
    public void processBatch(List<String> segments, Long assetsId) {
        int totalSize = segments.size();
        
        for (int i = 0; i < totalSize; i += BATCH_SIZE) {
            int endIndex = Math.min(i + BATCH_SIZE, totalSize);
            List<String> batch = segments.subList(i, endIndex);
            
            // 批量保存
            segmentService.batchSaveSegments(assetsId, batch);
            
            // 更新进度
            updateProgress(endIndex, totalSize);
        }
    }
}

5.2 内存管理优化

内存优化

内存管理优化通过流式处理、及时释放等技术，避免大文件处理时的内存溢出。

内存优化策略：

// 流式文件读取
public class StreamFileReader {
    
    public String readFileContent(InputStream inputStream, 
        FileProgressTrack progressTrack) {
        
        StringBuilder content = new StringBuilder();
        
        try (BufferedReader reader = new BufferedReader(
            new InputStreamReader(inputStream, StandardCharsets.UTF_8))) {
            
            String line;
            int lineCount = 0;
            long bytesRead = 0;
            
            while ((line = reader.readLine()) != null) {
                content.append(line).append("\n");
                lineCount++;
                bytesRead += line.getBytes(StandardCharsets.UTF_8).length;
                
                // 定期更新进度
                if (lineCount % PROGRESS_UPDATE_THRESHOLD == 0) {
                    if (progressTrack != null) {
                        progressTrack.onProgress(bytesRead, -1, lineCount);
                    }
                }
                
                // 内存检查
                if (content.length() > MAX_CONTENT_SIZE) {
                    throw new RuntimeException("文件内容过大，超出处理限制");
                }
            }
            
            if (progressTrack != null) {
                progressTrack.onComplete(bytesRead, lineCount);
            }
            
        } catch (IOException e) {
            throw new RuntimeException("读取文件失败", e);
        }
        
        return content.toString();
    }
}

5.3 缓存优化

缓存优化

缓存优化通过多级缓存、智能失效等技术，提高系统的响应速度。

缓存优化实现：

// 多级缓存设计
@Component
public class KnowledgeCache {
    
    // 本地缓存
    private final Cache<String, Object> localCache = Caffeine.newBuilder()
        .maximumSize(1000)
        .expireAfterWrite(Duration.ofMinutes(30))
        .build();
    
    // Redis缓存
    @Autowired
    private RedisTemplate<String, Object> redisTemplate;
    
    public <T> T get(String key, Class<T> type) {
        // 先查本地缓存
        T value = (T) localCache.getIfPresent(key);
        if (value != null) {
            return value;
        }
        
        // 再查Redis缓存
        value = (T) redisTemplate.opsForValue().get(key);
        if (value != null) {
            localCache.put(key, value);
            return value;
        }
        
        return null;
    }
    
    public void put(String key, Object value, Duration expiration) {
        // 同时更新本地缓存和Redis缓存
        localCache.put(key, value);
        redisTemplate.opsForValue().set(key, value, expiration);
    }
    
    public void evict(String key) {
        // 同时清除本地缓存和Redis缓存
        localCache.invalidate(key);
        redisTemplate.delete(key);
    }
}

六、监控和诊断

6.1 性能监控

性能监控

性能监控通过指标收集、实时监控等技术，提供系统运行状态的可视化展示。

监控指标设计：

// 性能指标收集
@Component
public class KnowledgeMetrics {
    
    private final MeterRegistry meterRegistry;
    
    // 文档处理指标
    private final Timer documentProcessTimer;
    private final Counter documentProcessCounter;
    
    // 分段处理指标
    private final Timer segmentProcessTimer;
    private final Counter segmentProcessCounter;
    
    // 向量化指标
    private final Timer vectorizeTimer;
    private final Counter vectorizeCounter;
    
    public KnowledgeMetrics(MeterRegistry meterRegistry) {
        this.meterRegistry = meterRegistry;
        
        this.documentProcessTimer = Timer.builder("knowledge.document.process.time")
            .description("文档处理耗时")
            .register(meterRegistry);
            
        this.documentProcessCounter = Counter.builder("knowledge.document.process.count")
            .description("文档处理次数")
            .register(meterRegistry);
    }
    
    public void recordDocumentProcess(Duration duration, boolean success) {
        documentProcessTimer.record(duration);
        documentProcessCounter.increment(
            Tags.of("status", success ? "success" : "failure"));
    }
}

6.2 异常诊断

异常诊断

异常诊断通过日志分析、错误统计等技术，帮助快速定位和解决问题。

异常诊断实现：

// 异常统计和分析
@Component
public class ExceptionAnalyzer {
    
    private final Map<String, AtomicInteger> exceptionCounters = new ConcurrentHashMap<>();
    
    public void recordException(String exceptionType, String message) {
        // 统计异常次数
        exceptionCounters.computeIfAbsent(exceptionType, k -> new AtomicInteger(0))
            .incrementAndGet();
        
        // 记录详细日志
        log.error("知识库处理异常: 类型={}, 消息={}", exceptionType, message);
        
        // 发送告警（如果异常次数过多）
        if (exceptionCounters.get(exceptionType).get() > 10) {
            sendAlert(exceptionType, message);
        }
    }
    
    public Map<String, Integer> getExceptionStatistics() {
        return exceptionCounters.entrySet().stream()
            .collect(Collectors.toMap(
                Map.Entry::getKey,
                entry -> entry.getValue().get()
            ));
    }
}

通过以上完整的架构设计，BladeX AI知识库模块实现了高性能、高可用的文档处理和向量化能力，为企业级AI应用提供了坚实的技术基础。