Skip to content

Commit

Permalink
fix: splitText fix & processCharacterKnowledge optimization (#3763)
Browse files Browse the repository at this point in the history
* lock bump

* fix logError logging

* expose getMemoriesByIds

* processCharacterKnowledge optimization, lower logging

* notes

* getMemoriesByIds

* splitText fix
  • Loading branch information
odilitime authored Mar 4, 2025
1 parent 9253e7e commit 0cfce9e
Show file tree
Hide file tree
Showing 7 changed files with 105 additions and 61 deletions.
87 changes: 51 additions & 36 deletions packages/core/src/generation.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1446,8 +1446,8 @@ export async function generateShouldRespond({
*/
export async function splitChunks(
content: string,
chunkSize = 1500,
bleed = 100
chunkSize = 1500, // in tokens
bleed = 100 // in tokens
): Promise<string[]> {
elizaLogger.debug(`[splitChunks] Starting text split`);

Expand Down Expand Up @@ -1483,23 +1483,38 @@ export async function splitChunks(
return chunks;
}

export function splitText(
content: string,
chunkSize: number,
bleed: number
): string[] {

/**
 * Roughly converts an English-text character count into a token count.
 * Uses the common heuristic that one token covers about 4 English characters,
 * rounding to the nearest whole token.
 * @param stringLength - Length of the text in characters.
 * @returns Estimated number of tokens.
 */
function estimateTokensFromEnglishLength(stringLength: number): number {
    return Math.round(stringLength / 4); // Rough estimate: 1 token ≈ 4 characters in English
}

/**
 * Inverse of estimateTokensFromEnglishLength: converts a token count into an
 * approximate English-text character length (1 token ≈ 4 characters).
 * @param tokenCount - Number of tokens.
 * @returns Estimated length in characters.
 */
function estimateEnglishLengthFromTokens(tokenCount: number): number {
    return tokenCount * 4; // Reverse estimate: 1 token ≈ 4 characters in English
}

export function splitText(content: string, chunkSize: number, bleed: number): string[] {
// Convert chunk size and bleed from tokens to approximate character length
const chunkCharSize = estimateEnglishLengthFromTokens(chunkSize);
const bleedCharSize = estimateEnglishLengthFromTokens(bleed);

// If content is smaller than estimated chunk size, return it as a single chunk
if (content.length <= chunkCharSize) {
return [content];
}

const chunks: string[] = [];
let start = 0;

while (start < content.length) {
const end = Math.min(start + chunkSize, content.length);
// Ensure we're not creating empty or invalid chunks
if (end > start) {
chunks.push(content.substring(start, end));
}
const end = Math.min(start + chunkCharSize, content.length);
chunks.push(content.substring(start, end));

// Ensure forward progress while preventing infinite loops
start = Math.max(end - bleed, start + 1);
// Move forward by (chunkSize - bleed), converted to character length
const nextStart = start + (chunkCharSize - bleedCharSize);
if (nextStart >= content.length || nextStart <= start) {
break; // Stop if no progress is made
}
start = nextStart;
}

return chunks;
Expand Down Expand Up @@ -2362,10 +2377,10 @@ async function handleOpenAI({
}: ProviderOptions): Promise<GenerateObjectResult<unknown>> {
const endpoint = runtime.character.modelEndpointOverride || getEndpoint(provider);
const baseURL = getCloudflareGatewayBaseURL(runtime, "openai") || endpoint;
const openai = createOpenAI({
apiKey,
const openai = createOpenAI({
apiKey,
baseURL,
fetch: runtime.fetch
fetch: runtime.fetch
});
return aiGenerateObject({
model: openai.languageModel(model),
Expand Down Expand Up @@ -2401,10 +2416,10 @@ async function handleAnthropic({
const baseURL = getCloudflareGatewayBaseURL(runtime, "anthropic");
elizaLogger.debug("Anthropic handleAnthropic baseURL:", { baseURL });

const anthropic = createAnthropic({
apiKey,
const anthropic = createAnthropic({
apiKey,
baseURL,
fetch: runtime.fetch
fetch: runtime.fetch
});
return await aiGenerateObject({
model: anthropic.languageModel(model),
Expand Down Expand Up @@ -2432,10 +2447,10 @@ async function handleGrok({
modelOptions,
runtime,
}: ProviderOptions): Promise<GenerationResult> {
const grok = createOpenAI({
apiKey,
const grok = createOpenAI({
apiKey,
baseURL: models.grok.endpoint,
fetch: runtime.fetch
fetch: runtime.fetch
});
return aiGenerateObject({
model: grok.languageModel(model, { parallelToolCalls: false }),
Expand Down Expand Up @@ -2467,10 +2482,10 @@ async function handleGroq({
const baseURL = getCloudflareGatewayBaseURL(runtime, "groq");
elizaLogger.debug("Groq handleGroq baseURL:", { baseURL });

const groq = createGroq({
apiKey,
const groq = createGroq({
apiKey,
baseURL,
fetch: runtime.fetch
fetch: runtime.fetch
});
return await aiGenerateObject({
model: groq.languageModel(model),
Expand Down Expand Up @@ -2500,7 +2515,7 @@ async function handleGoogle({
}: ProviderOptions): Promise<GenerateObjectResult<unknown>> {
const google = createGoogleGenerativeAI({
apiKey,
fetch: runtime.fetch
fetch: runtime.fetch
});
return aiGenerateObject({
model: google(model),
Expand Down Expand Up @@ -2554,10 +2569,10 @@ async function handleRedPill({
modelOptions,
runtime,
}: ProviderOptions): Promise<GenerationResult> {
const redPill = createOpenAI({
apiKey,
const redPill = createOpenAI({
apiKey,
baseURL: models.redpill.endpoint,
fetch: runtime.fetch
fetch: runtime.fetch
});
return aiGenerateObject({
model: redPill.languageModel(model),
Expand Down Expand Up @@ -2647,10 +2662,10 @@ async function handleDeepSeek({
modelOptions,
runtime,
}: ProviderOptions): Promise<GenerationResult> {
const openai = createOpenAI({
apiKey,
const openai = createOpenAI({
apiKey,
baseURL: models.deepseek.endpoint,
fetch: runtime.fetch
fetch: runtime.fetch
});
return aiGenerateObject({
model: openai.languageModel(model),
Expand Down Expand Up @@ -2773,10 +2788,10 @@ async function handleNearAi({
modelOptions,
runtime,
}: ProviderOptions): Promise<GenerationResult> {
const nearai = createOpenAI({
apiKey,
const nearai = createOpenAI({
apiKey,
baseURL: models.nearai.endpoint,
fetch: runtime.fetch
fetch: runtime.fetch
});
const settings = schema ? { structuredOutputs: true } : undefined;
return aiGenerateObject({
Expand Down
9 changes: 5 additions & 4 deletions packages/core/src/knowledge.ts
Original file line number Diff line number Diff line change
Expand Up @@ -67,9 +67,10 @@ async function get(
async function set(
runtime: AgentRuntime,
item: KnowledgeItem,
chunkSize = 512,
bleed = 20
chunkSize = 512, // in tokens
bleed = 20 // in tokens
) {
// create document
await runtime.documentsManager.createMemory({
id: item.id,
agentId: runtime.agentId,
Expand All @@ -80,9 +81,9 @@ async function set(
embedding: getEmbeddingZeroVector(),
});

const preprocessed = preprocess(item.content.text);
// create knowledge
const preprocessed = preprocess(item.content.text); // normalizes it (lowering case/clean up)
const fragments = await splitChunks(preprocessed, chunkSize, bleed);

for (const fragment of fragments) {
const embedding = await embed(runtime, fragment);
await runtime.knowledgeManager.createMemory({
Expand Down
4 changes: 4 additions & 0 deletions packages/core/src/memory.ts
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,10 @@ export class MemoryManager implements IMemoryManager {
});
}

/**
 * Batch-fetches memories by their ids via the underlying database adapter.
 * Thin pass-through — unlike getMemoryById below, it applies no
 * per-agent filtering to the results.
 * @param ids - Memory UUIDs to look up.
 * @returns The matching Memory records.
 */
async getMemoriesByIds(ids: UUID[]): Promise<Memory[]> {
    const adapter = this.runtime.databaseAdapter;
    return adapter.getMemoriesByIds(ids);
}

async getMemoryById(id: UUID): Promise<Memory | null> {
const result = await this.runtime.databaseAdapter.getMemoryById(id);
if (result && result.agentId !== this.runtime.agentId) return null;
Expand Down
54 changes: 34 additions & 20 deletions packages/core/src/runtime.ts
Original file line number Diff line number Diff line change
Expand Up @@ -365,10 +365,10 @@ export class AgentRuntime implements IAgentRuntime {

this.imageModelProvider =
this.character.imageModelProvider ?? this.modelProvider;

this.imageVisionModelProvider =
this.character.imageVisionModelProvider ?? this.modelProvider;

elizaLogger.info(
`${this.character.name}(${this.agentId}) - Selected model provider:`,
this.modelProvider
Expand Down Expand Up @@ -493,7 +493,7 @@ export class AgentRuntime implements IAgentRuntime {
elizaLogger.info(
`[RAG Check] RAG Knowledge enabled: ${this.character.settings.ragKnowledge ? true : false}`,
);
elizaLogger.info(
elizaLogger.debug(
`[RAG Check] Knowledge items:`,
this.character.knowledge,
);
Expand Down Expand Up @@ -607,28 +607,42 @@ export class AgentRuntime implements IAgentRuntime {
* @param knowledge An array of knowledge items containing id, path, and content.
*/
private async processCharacterKnowledge(items: string[]) {
for (const item of items) {
const knowledgeId = stringToUuid(item);
const existingDocument =
await this.documentsManager.getMemoryById(knowledgeId);
if (existingDocument) {
continue;
const ids = items.map(i => stringToUuid(i));
const exists = await this.documentsManager.getMemoriesByIds(ids);
const toAdd = [];
for(const i in items) {
const exist = exists[i];
if (!exist) {
toAdd.push([items[i], ids[i]]);
}
}
if (!toAdd.length) return;
elizaLogger.info('discovered ' + toAdd.length + ' new knowledge items')
const chunkSize = 512;
const ps = [];
for (const a of toAdd) {
const item = a[0];
const knowledgeId = a[1];

if (item.length > chunkSize) {
// these are just slower
elizaLogger.info(
this.character.name,
" knowledge item over 512 characters, splitting - ",
item.slice(0, 100),
);
}

elizaLogger.info(
"Processing knowledge for ",
this.character.name,
" - ",
item.slice(0, 100),
);

await knowledge.set(this, {
ps.push(knowledge.set(this, {
id: knowledgeId,
content: {
text: item,
},
});
}));
}
// wait for it all to be added
await Promise.all(ps);
elizaLogger.success(this.character.name, 'knowledge is synchronized');
}

/**
Expand Down Expand Up @@ -1796,12 +1810,12 @@ const formatKnowledge = (knowledge: KnowledgeItem[]) => {
return knowledge.map(item => {
// Get the main content text
const text = item.content.text;

// Clean up formatting but maintain natural text flow
const cleanedText = text
.trim()
.replace(/\n{3,}/g, '\n\n'); // Replace excessive newlines

return cleanedText;
}).join('\n\n'); // Separate distinct pieces with double newlines
};
1 change: 1 addition & 0 deletions packages/core/src/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1191,6 +1191,7 @@ export interface IMemoryManager {
content: string,
): Promise<{ embedding: number[]; levenshtein_score: number }[]>;

getMemoriesByIds(ids: UUID[]): Promise<Memory[]>;
getMemoryById(id: UUID): Promise<Memory | null>;
getMemoriesByRoomIds(params: {
roomIds: UUID[];
Expand Down
9 changes: 9 additions & 0 deletions pnpm-lock.yaml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion tests/testLibrary.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ function log(message) {
}

function logError(error) {
log(`Error: ${message}`);
log(`Error: ${error.message}`);
log(error); // Print stack trace
}

Expand Down

0 comments on commit 0cfce9e

Please sign in to comment.