Appearance
文本分割
分割策略选择
文本分割是将长文本分成更小、更易处理的块的过程。LangChain4J 提供了多种分割策略:
按字符数分割
java
import dev.langchain4j.data.segmenter.CharacterTextSegmenter;
import dev.langchain4j.data.document.Document;
import dev.langchain4j.data.segmenter.TextSegment;
// Create a character-based segmenter (splits purely by character count).
CharacterTextSegmenter segmenter = CharacterTextSegmenter.builder()
.maxSegmentSize(1000) // maximum characters per segment
.overlapSize(100) // characters shared between adjacent segments
.build();
// Split the document and inspect the resulting segments.
Document document = Document.from("长文本内容...");
List<TextSegment> segments = segmenter.segment(document);
System.out.println("分割后的块数:" + segments.size());
for (TextSegment segment : segments) {
System.out.println("块内容:" + segment.content());
}按句子分割
java
import dev.langchain4j.data.segmenter.SentenceTextSegmenter;
// Create a sentence-based segmenter (keeps sentences intact within a segment).
SentenceTextSegmenter segmenter = SentenceTextSegmenter.builder()
.maxSegmentSize(1000)
.overlapSize(100)
.build();
// Split the document (reuses the `document` created in the previous example).
List<TextSegment> segments = segmenter.segment(document);按段落分割
java
import dev.langchain4j.data.segmenter.ParagraphTextSegmenter;
// Create a paragraph-based segmenter (splits along paragraph boundaries).
ParagraphTextSegmenter segmenter = ParagraphTextSegmenter.builder()
.maxSegmentSize(1000)
.overlapSize(100)
.build();
// Split the document.
List<TextSegment> segments = segmenter.segment(document);按标记分割
java
import dev.langchain4j.data.segmenter.TokenTextSegmenter;
// Create a token-based segmenter — sizes here are counted in tokens, not characters.
TokenTextSegmenter segmenter = TokenTextSegmenter.builder()
.maxSegmentSize(500) // maximum tokens per segment
.overlapSize(50) // overlapping tokens between adjacent segments
.build();
// Split the document.
List<TextSegment> segments = segmenter.segment(document);分割器配置
基本配置
java
// 配置分割器
CharacterTextSegmenter segmenter = CharacterTextSegmenter.builder()
.maxSegmentSize(1000) // 最大分割大小
.overlapSize(100) // 重叠大小
.build();高级配置
java
// 高级配置
CharacterTextSegmenter segmenter = CharacterTextSegmenter.builder()
.maxSegmentSize(1000)
.overlapSize(100)
.segmentDelimiters(".", "!", "?") // 分割分隔符
.build();分块优化
语义分块
语义分块考虑文本的语义结构,确保分割后的块保持语义完整性:
java
import dev.langchain4j.data.segmenter.SemanticTextSegmenter;
import dev.langchain4j.embedding.EmbeddingModel;
import dev.langchain4j.embedding.openai.OpenAiEmbeddingModel;
// Create the embedding model used to measure semantic similarity between spans.
EmbeddingModel embeddingModel = OpenAiEmbeddingModel.builder()
.apiKey(System.getenv("OPENAI_API_KEY"))
.build();
// Create a semantic segmenter: splits where adjacent text falls below the
// similarity threshold, so each segment stays semantically coherent.
SemanticTextSegmenter segmenter = SemanticTextSegmenter.builder()
.embeddingModel(embeddingModel)
.maxSegmentSize(1000)
.overlapSize(100)
.similarityThreshold(0.8) // minimum similarity for spans to stay in one segment
.build();
// Split the document.
List<TextSegment> segments = segmenter.segment(document);层次分块
层次分块先将文本分成大的块,然后再细分为更小的块:
java
import dev.langchain4j.data.segmenter.HierarchicalTextSegmenter;
// Create a hierarchical segmenter: first split into coarse units (paragraphs),
// then refine each into finer units (sentences).
HierarchicalTextSegmenter segmenter = HierarchicalTextSegmenter.builder()
.topLevelSegmenter(new ParagraphTextSegmenter())
.bottomLevelSegmenter(new SentenceTextSegmenter())
.maxSegmentSize(1000)
.overlapSize(100)
.build();
// Split the document.
List<TextSegment> segments = segmenter.segment(document);自定义分割器
您可以通过实现 TextSegmenter 接口来创建自定义分割器:
基本自定义分割器
java
import dev.langchain4j.data.segmenter.TextSegmenter;
import dev.langchain4j.data.document.Document;
import dev.langchain4j.data.segmenter.TextSegment;
import java.util.ArrayList;
import java.util.List;

/**
 * A fixed-size sliding-window segmenter: emits windows of at most
 * {@code maxSegmentSize} characters, with consecutive windows sharing
 * {@code overlapSize} characters.
 */
public class CustomTextSegmenter implements TextSegmenter {
    private final int maxSegmentSize;
    private final int overlapSize;

    /**
     * @param maxSegmentSize maximum characters per segment; must be positive
     * @param overlapSize characters shared between adjacent segments; must be
     *                    non-negative and strictly less than {@code maxSegmentSize}
     * @throws IllegalArgumentException if the sizes would make the window fail to
     *         advance (the original code looped forever when
     *         {@code overlapSize >= maxSegmentSize})
     */
    public CustomTextSegmenter(int maxSegmentSize, int overlapSize) {
        if (maxSegmentSize <= 0) {
            throw new IllegalArgumentException("maxSegmentSize must be positive: " + maxSegmentSize);
        }
        if (overlapSize < 0 || overlapSize >= maxSegmentSize) {
            throw new IllegalArgumentException(
                    "overlapSize must be in [0, maxSegmentSize): " + overlapSize);
        }
        this.maxSegmentSize = maxSegmentSize;
        this.overlapSize = overlapSize;
    }

    /**
     * Splits {@code document} into overlapping fixed-size segments.
     * Each segment carries the source document's metadata.
     */
    @Override
    public List<TextSegment> segment(Document document) {
        List<TextSegment> segments = new ArrayList<>();
        String content = document.content();
        // step > 0 is guaranteed by the constructor check, so the loop terminates.
        int step = maxSegmentSize - overlapSize;
        for (int i = 0; i < content.length(); i += step) {
            int end = Math.min(i + maxSegmentSize, content.length());
            segments.add(TextSegment.from(content.substring(i, end), document.metadata()));
        }
        return segments;
    }
}
高级自定义分割器
java
import dev.langchain4j.data.segmenter.TextSegmenter;
import dev.langchain4j.data.document.Document;
import dev.langchain4j.data.segmenter.TextSegment;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;

/**
 * Splits a document at each occurrence of any of the given keywords, and
 * additionally forces a split whenever a segment would reach
 * {@code maxSegmentSize} characters.
 */
public class KeywordBasedSegmenter implements TextSegmenter {
    private final Set<String> keywords;
    private final int maxSegmentSize;

    /**
     * @param keywords markers at which the text is cut (the keyword starts the next segment)
     * @param maxSegmentSize hard upper bound on segment length in characters
     */
    public KeywordBasedSegmenter(Set<String> keywords, int maxSegmentSize) {
        this.keywords = keywords;
        this.maxSegmentSize = maxSegmentSize;
    }

    /** Splits {@code document}; every segment carries the document's metadata. */
    @Override
    public List<TextSegment> segment(Document document) {
        List<TextSegment> segments = new ArrayList<>();
        String content = document.content();
        int lastIndex = 0;
        for (int i = 0; i < content.length(); i++) {
            // Check whether a keyword starts at position i.
            for (String keyword : keywords) {
                // startsWith(keyword, i) matches in place; the original
                // content.substring(i).startsWith(keyword) copied the tail of the
                // string on every character, making the scan accidentally O(n^2).
                if (content.startsWith(keyword, i)) {
                    // Cut just before the keyword, if there is pending text.
                    if (i > lastIndex) {
                        String segmentContent = content.substring(lastIndex, i);
                        segments.add(TextSegment.from(segmentContent, document.metadata()));
                    }
                    lastIndex = i;
                }
            }
            // Force a cut once the pending text reaches the maximum size.
            if (i - lastIndex >= maxSegmentSize) {
                String segmentContent = content.substring(lastIndex, i);
                segments.add(TextSegment.from(segmentContent, document.metadata()));
                lastIndex = i;
            }
        }
        // Flush the trailing text, if any.
        if (lastIndex < content.length()) {
            String segmentContent = content.substring(lastIndex);
            segments.add(TextSegment.from(segmentContent, document.metadata()));
        }
        return segments;
    }
}
分割后处理
块过滤
过滤掉不符合要求的块:
java
List<TextSegment> segments = segmenter.segment(document);
// Drop segments that are too short to be useful (100 characters or fewer).
List<TextSegment> filteredSegments = segments.stream()
.filter(segment -> segment.content().length() > 100)
.collect(Collectors.toList());
System.out.println("过滤后的块数:" + filteredSegments.size());块合并
合并相关的块:
java
// Greedily merge adjacent segments while their combined length stays under 1000.
// NOTE(review): assumes `segments` is non-empty — get(0) would throw otherwise; guard if needed.
List<TextSegment> mergedSegments = new ArrayList<>();
TextSegment currentSegment = segments.get(0);
for (int i = 1; i < segments.size(); i++) {
TextSegment nextSegment = segments.get(i);
if (currentSegment.content().length() + nextSegment.content().length() < 1000) {
// Merge: the combined segment keeps the first segment's metadata.
String mergedContent = currentSegment.content() + " " + nextSegment.content();
currentSegment = TextSegment.from(mergedContent, currentSegment.metadata());
} else {
mergedSegments.add(currentSegment);
currentSegment = nextSegment;
}
}
mergedSegments.add(currentSegment);
System.out.println("合并后的块数:" + mergedSegments.size());最佳实践
选择合适的分割策略:
- 对于结构化文本,使用段落分割
- 对于非结构化文本,使用句子分割
- 对于需要精确控制的场景,使用字符分割
设置合理的分割大小:
- 考虑模型的上下文窗口大小
- 平衡块大小和数量
- 避免块过大或过小
使用适当的重叠:
- 重叠有助于保持上下文连续性
- 一般设置为块大小的 10-20%
语义分割:
- 对于复杂文本,使用语义分割
- 确保分割后的块保持语义完整性
后处理优化:
- 过滤掉无用的块
- 合并相关的块
- 添加块的元数据
性能考虑:
- 对于大文本,使用并行分割
- 缓存分割结果
- 避免重复分割
评估分割效果:
- 检查分割后的块是否合理
- 测试分割对后续处理的影响
- 根据实际情况调整分割策略