From 73dabd8905c9eecf60c1aeedabd9bcfe745a2975 Mon Sep 17 00:00:00 2001 From: Honey Tree Date: Sun, 17 Nov 2024 08:32:36 -0300 Subject: [PATCH] Simple implementation of caching at depth that should be useful for most use cases --- default/config.yaml | 7 +++++++ src/endpoints/backends/chat-completions.js | 19 ++++++++++++++++++- 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/default/config.yaml b/default/config.yaml index d95719f22..fd6f1ea18 100644 --- a/default/config.yaml +++ b/default/config.yaml @@ -168,5 +168,12 @@ claude: # (e.g {{random}} macro or lorebooks not as in-chat injections). # Otherwise, you'll just waste money on cache misses. enableSystemPromptCache: false + # Enables caching of the message history at depth (if supported). + # https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching + # -- IMPORTANT! -- + # Use with caution. Behavior may be unpredictable and no guarantees can or will be made. + # Set to an integer to specify the desired depth. 0 (which does NOT include the prefill) + # should be ideal for most use cases. + cachingAtDepth: false # -- SERVER PLUGIN CONFIGURATION -- enableServerPlugins: false diff --git a/src/endpoints/backends/chat-completions.js b/src/endpoints/backends/chat-completions.js index 800a6cc79..e30902086 100644 --- a/src/endpoints/backends/chat-completions.js +++ b/src/endpoints/backends/chat-completions.js @@ -80,6 +80,7 @@ async function sendClaudeRequest(request, response) { const apiKey = request.body.reverse_proxy ? 
request.body.proxy_password : readSecret(request.user.directories, SECRET_KEYS.CLAUDE); const divider = '-'.repeat(process.stdout.columns); const enableSystemPromptCache = getConfigValue('claude.enableSystemPromptCache', false) && request.body.model.startsWith('claude-3'); + let cachingAtDepth = getConfigValue('claude.cachingAtDepth', false) && request.body.model.startsWith('claude-3'); if (!apiKey) { console.log(color.red(`Claude API key is missing.\n${divider}`)); @@ -138,9 +139,25 @@ async function sendClaudeRequest(request, response) { requestBody.tools[requestBody.tools.length - 1]['cache_control'] = { type: 'ephemeral' }; } } - if (enableSystemPromptCache) { + + if (cachingAtDepth !== false) { + // There are extremely few scenarios in which caching the prefill is a good idea; it mostly just breaks everything + const messageCount = convertedPrompt.messages.length; + cachingAtDepth += convertedPrompt.messages[messageCount - 1].role === 'assistant' ? 1 : 0; + + if (messageCount - 1 - cachingAtDepth >= 0) { + convertedPrompt.messages[messageCount - 1 - cachingAtDepth]['cache_control'] = { type: 'ephemeral' }; + } + + if (messageCount - 1 - cachingAtDepth - 2 >= 0) { + convertedPrompt.messages[messageCount - 1 - cachingAtDepth - 2]['cache_control'] = { type: 'ephemeral' }; + } + } + + if (enableSystemPromptCache || cachingAtDepth !== false) { additionalHeaders['anthropic-beta'] = 'prompt-caching-2024-07-31'; } + console.log('Claude request:', requestBody); const generateResponse = await fetch(apiUrl + '/messages', {