From 04a798b2292dbf447e6b14022c38e07ce0b69c4d Mon Sep 17 00:00:00 2001
From: Wolfsblvt
Date: Thu, 2 May 2024 02:02:12 +0200
Subject: [PATCH] Switch word counting to Segmenter

---
 src/endpoints/stats.js | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/endpoints/stats.js b/src/endpoints/stats.js
index 640dbfbfc..0032f9e1d 100644
--- a/src/endpoints/stats.js
+++ b/src/endpoints/stats.js
@@ -16,8 +16,9 @@ const MIN_TIMESTAMP = 0;
 const MAX_TIMESTAMP = new Date('9999-12-31T23:59:59.999Z').getTime();
 const MIN_DATE = new Date(MIN_TIMESTAMP);
 const MAX_DATE = new Date(MAX_TIMESTAMP);
+const STATS_LANGUAGE = 'en';
 const STATS_FILE = 'stats.json';
-const CURRENT_STATS_VERSION = '1.1';
+const CURRENT_STATS_VERSION = '1.2';
 
 /** @type {Map} The stats collections for each user, accessable via their key - gets set/built on init */
 const STATS = new Map();
@@ -676,7 +677,9 @@ function removeModelUsage(obj, model, tokens, count = 1) {
  * @returns {number} - The number of words in the string.
  */
 function countWordsInString(str) {
-    return str.match(/\b\w+\b/g)?.length ?? 0;
+    const words = Array.from(new Intl.Segmenter(STATS_LANGUAGE ?? 'en', { granularity: 'word' }).segment(str))
+        .filter(it => it.isWordLike);
+    return words.length;
 }
 
 /**