diff --git a/src/lib/cld2/Makefile b/src/lib/cld2/Makefile index f69c7a2..54f311f 100644 --- a/src/lib/cld2/Makefile +++ b/src/lib/cld2/Makefile @@ -20,7 +20,7 @@ WEBIDL ?= $(PYTHON2) $(EMSCRIPTEN_ROOT)/tools/webidl_binder.py # analyzing one 20MB web page gives us a 30-40MB heap for the life of the # worker. FLAGS=-s -O3 -s INLINING_LIMIT=1 -s NO_FILESYSTEM=1 -s NO_EXIT_RUNTIME=1 -s INVOKE_RUN=0 \ - -s TOTAL_STACK=8192 -s TOTAL_MEMORY=2097152 -s ALLOW_MEMORY_GROWTH=1 \ + -s TOTAL_STACK=65536 -s TOTAL_MEMORY=2097152 -s ALLOW_MEMORY_GROWTH=1 \ -s MODULARIZE=1 -s EXPORT_NAME=loadCLD2 \ --closure 1 diff --git a/src/lib/cld2/cld.cpp b/src/lib/cld2/cld.cpp index cd66d4e..dc7d9fe 100644 --- a/src/lib/cld2/cld.cpp +++ b/src/lib/cld2/cld.cpp @@ -49,6 +49,14 @@ void EMSCRIPTEN_KEEPALIVE emscripten_bind_LanguageGuess___destroy___0(LanguageGu // Interface: LanguageInfo +LanguageInfo* EMSCRIPTEN_KEEPALIVE emscripten_bind_LanguageInfo_detectLanguageWithLength_3(char* buffer, int bufferLength, bool isPlainText) { + return LanguageInfo::detectLanguageWithLength(buffer, bufferLength, isPlainText); +} + +LanguageInfo* EMSCRIPTEN_KEEPALIVE emscripten_bind_LanguageInfo_detectLanguageWithLength_6(char* buffer, int bufferLength, bool isPlainText, char* tldHint, int encodingHint, char* languageHint) { + return LanguageInfo::detectLanguageWithLength(buffer, bufferLength, isPlainText, tldHint, encodingHint, languageHint); +} + LanguageInfo* EMSCRIPTEN_KEEPALIVE emscripten_bind_LanguageInfo_detectLanguage_2(char* buffer, bool isPlainText) { return LanguageInfo::detectLanguage(buffer, isPlainText); } diff --git a/src/lib/cld2/cld.idl b/src/lib/cld2/cld.idl index e426dd3..72821d3 100644 --- a/src/lib/cld2/cld.idl +++ b/src/lib/cld2/cld.idl @@ -32,6 +32,12 @@ interface LanguageGuess { }; interface LanguageInfo { + static LanguageInfo detectLanguageWithLength(DOMString buffer, long bufferLength, boolean isPlainText); + + static LanguageInfo detectLanguageWithLength(DOMString buffer, long bufferLength, boolean isPlainText, + DOMString? tldHint, long encodingHint, + DOMString? languageHint); + static LanguageInfo detectLanguage(DOMString buffer, boolean isPlainText); static LanguageInfo detectLanguage(DOMString buffer, boolean isPlainText, diff --git a/src/lib/cld2/cld.js b/src/lib/cld2/cld.js index 03552e8..0cdc6f4 100644 --- a/src/lib/cld2/cld.js +++ b/src/lib/cld2/cld.js @@ -268,6 +268,24 @@ LanguageInfo.prototype.constructor = LanguageInfo; LanguageInfo.prototype.__class__ = LanguageInfo; LanguageInfo.__cache__ = {}; Module['LanguageInfo'] = LanguageInfo; +/** @suppress {undefinedVars, duplicate} @this{Object} */ +LanguageInfo.prototype['detectLanguageWithLength'] = LanguageInfo.prototype.detectLanguageWithLength = function(buffer, bufferLength, isPlainText, tldHint, encodingHint, languageHint) { + ensureCache.prepare(); + if (buffer && typeof buffer === 'object') buffer = buffer.ptr; + else buffer = ensureString(buffer); + if (bufferLength && typeof bufferLength === 'object') bufferLength = bufferLength.ptr; + if (isPlainText && typeof isPlainText === 'object') isPlainText = isPlainText.ptr; + if (tldHint && typeof tldHint === 'object') tldHint = tldHint.ptr; + else tldHint = ensureString(tldHint); + if (encodingHint && typeof encodingHint === 'object') encodingHint = encodingHint.ptr; + if (languageHint && typeof languageHint === 'object') languageHint = languageHint.ptr; + else languageHint = ensureString(languageHint); + if (tldHint === undefined) { return wrapPointer(_emscripten_bind_LanguageInfo_detectLanguageWithLength_3(buffer, bufferLength, isPlainText), LanguageInfo) } + if (encodingHint === undefined) { return wrapPointer(_emscripten_bind_LanguageInfo_detectLanguageWithLength_4(buffer, bufferLength, isPlainText, tldHint), LanguageInfo) } + if (languageHint === undefined) { return wrapPointer(_emscripten_bind_LanguageInfo_detectLanguageWithLength_5(buffer, bufferLength, isPlainText, tldHint, encodingHint), LanguageInfo) } + return wrapPointer(_emscripten_bind_LanguageInfo_detectLanguageWithLength_6(buffer, bufferLength, isPlainText, tldHint, encodingHint, languageHint), LanguageInfo); +}; + /** @suppress {undefinedVars, duplicate} @this{Object} */ LanguageInfo.prototype['detectLanguage'] = LanguageInfo.prototype.detectLanguage = function(buffer, isPlainText, tldHint, encodingHint, languageHint) { ensureCache.prepare(); diff --git a/src/lib/cld2/cld2.js b/src/lib/cld2/cld2.js index 41e7a8d..53671bc 100644 --- a/src/lib/cld2/cld2.js +++ b/src/lib/cld2/cld2.js @@ -1,47 +1,48 @@ -var loadCLD2=(()=>{var _scriptName=globalThis.document?.currentScript?.src;return async function(moduleArg={}){var moduleRtn;function aa(a){var b=0;return function(){return b>>0)+"_",f=0;return b}); -r("Symbol.iterator",function(a){if(a)return a;a=Symbol("Symbol.iterator");for(var b="Array Int8Array Uint8Array Uint8ClampedArray Int16Array Uint16Array Int32Array Uint32Array Float32Array Float64Array".split(" "),c=0;c=h}});r("String.prototype.codePointAt",function(a){return a?a:function(b){var c=ia(this,null,"codePointAt"),d=c.length;b=Number(b)||0;if(0<=b&&bf||56319b||57343=d);)++c;if(16f?d+=String.fromCharCode(f):(f-=65536,d+=String.fromCharCode(55296|f>>10,56320|f&1023))}}else d+=String.fromCharCode(f)}return d}function La(a){return a?Ka(K,a):""} -var Ma=[];E.print&&(ra=E.print);E.printErr&&(I=E.printErr);E.wasmBinary&&(J=E.wasmBinary);if(E.preInit)for("function"==typeof E.preInit&&(E.preInit=[E.preInit]);0>2]=0;L[d.A+4>>2]=b;L[d.A+8>>2]=c;Ga=a;Ha++;throw Ga;},c:function(){return N("")},e:function(a,b){throw"Array index "+a+" out of bounds: [0,"+b+")";},d:function(a){var b=K.length;a>>>=0;if(2147483648=c;c*=2){var d=b*(1+.2/c);d=Math.min(d,a+100663296);a:{d=(Math.min(2147483648,65536*Math.ceil(Math.max(a,d)/65536))-M.buffer.byteLength+65535)/65536|0;try{M.grow(d);xa();var f=1;break a}catch(h){}f= -void 0}if(f)return!0}return!1},b:function(a,b,c,d){for(var f=0,h=0;h>2],k=L[b+4>>2];b+=8;for(var m=0;m>2]=f;return 0}},P; -P=await (function(){function a(f){f=P=f.exports;E._webidl_free=f.h;E._webidl_malloc=f.i;Na=E._emscripten_bind_Language_getLanguageCode_0=f.j;Oa=E._emscripten_bind_Language___destroy___0=f.k;Pa=E._emscripten_bind_VoidPtr___destroy___0=f.l;Qa=E._emscripten_bind_LanguageGuess_getPercent_0=f.m;Ra=E._emscripten_bind_LanguageGuess_getLanguageCode_0=f.n;Sa=E._emscripten_bind_LanguageGuess___destroy___0=f.o;Ta=E._emscripten_bind_LanguageInfo_detectLanguage_2=f.p;Ua=E._emscripten_bind_LanguageInfo_detectLanguage_5= -f.q;Va=E._emscripten_bind_LanguageInfo_getIsReliable_0=f.r;Wa=E._emscripten_bind_LanguageInfo_getLanguageCode_0=f.s;Xa=E._emscripten_bind_LanguageInfo_get_languages_1=f.t;Ya=E._emscripten_bind_LanguageInfo___destroy___0=f.u;M=f.f;xa();return P}var b,c,d;return D(function(f){if(1==f.v){b={a:Za};if(E.instantiateWasm)return f.return(new Promise(function(h){E.instantiateWasm(b,function(e,k){h(a(e,k))})}));null!=O||(O=E.locateFile?E.locateFile("cld2.wasm",F):F+"cld2.wasm");return x(f,Aa(b),2)}c=f.C;d= -a(c.instance);return f.return(d)})}()); -(function(){function a(){E.calledRun=!0;if(!sa){wa=!0;Ba(Ma);P.g();var b;null==(b=ta)||b(E);var c;null==(c=E.onRuntimeInitialized)||c.call(E);if(E.postRun)for("function"==typeof E.postRun&&(E.postRun=[E.postRun]);E.postRun.length;)b=E.postRun.shift(),Ca.push(b);Ba(Ca)}}if(E.preRun)for("function"==typeof E.preRun&&(E.preRun=[E.preRun]);E.preRun.length;)Ea();Ba(Da);E.setStatus?(E.setStatus("Running..."),setTimeout(function(){setTimeout(function(){return E.setStatus("")},1);a()},1)):a()})(); -function Q(){}Q.prototype=Object.create(Q.prototype);Q.prototype.constructor=Q;Q.prototype.I=Q;Q.K={};E.WrapperObject=Q;function $a(a){return(a||Q).K}E.getCache=$a;function R(a,b){var c=$a(b),d=c[a];if(d)return d;d=Object.create((b||Q).prototype);d.A=a;return c[a]=d}E.wrapPointer=R;E.castObject=function(a,b){return R(a.A,b)};E.NULL=R(0);E.destroy=function(a){if(!a.__destroy__)throw"Error: Cannot destroy object. (Did you create it yourself?)";a.__destroy__();delete $a(a.I)[a.A]}; -E.compare=function(a,b){return a.A===b.A};E.getPointer=function(a){return a.A};E.getClass=function(a){return a.I}; -var S={buffer:0,size:0,M:0,O:[],L:0,P:function(){if(S.L){for(var a=0;a=S.size?(0=d?b++:2047>=d?b+=2:55296<=d&&57343>=d?(b+=4,++c):b+=3}b=Array(b+1);d=b.length;c=0;if(0=h){if(c>=d)break;b[c++]=h}else if(2047>=h){if(c+1>=d)break;b[c++]=192|h>>6;b[c++]=128|h&63}else if(65535>=h){if(c+2>=d)break;b[c++]=224|h>>12;b[c++]=128|h>>6&63;b[c++]=128|h&63}else{if(c+3>=d)break;b[c++]=240|h>>18;b[c++]=128|h>>12&63;b[c++]=128| -h>>6&63;b[c++]=128|h&63;f++}}b[c]=0}a=S.alloc(b,va);for(c=0;c{var _scriptName=globalThis.document?.currentScript?.src;return async function(moduleArg={}){var moduleRtn;function ba(a){var b=0;return function(){return b>>0)+"_",e=0;return b}); +r("Symbol.iterator",function(a){if(a)return a;a=Symbol("Symbol.iterator");for(var b="Array Int8Array Uint8Array Uint8ClampedArray Int16Array Uint16Array Int32Array Uint32Array Float32Array Float64Array".split(" "),c=0;c=g}});r("String.prototype.codePointAt",function(a){return a?a:function(b){var c=ja(this,null,"codePointAt"),d=c.length;b=Number(b)||0;if(0<=b&&be||56319b||57343=d);)++c;if(16e?d+=String.fromCharCode(e):(e-=65536,d+=String.fromCharCode(55296|e>>10,56320|e&1023))}}else d+=String.fromCharCode(e)}return d}function Ma(a){return a?La(K,a):""} +var Na=[];E.print&&(sa=E.print);E.printErr&&(I=E.printErr);E.wasmBinary&&(J=E.wasmBinary);if(E.preInit)for("function"==typeof E.preInit&&(E.preInit=[E.preInit]);0>2]=0;L[d.B+4>>2]=b;L[d.B+8>>2]=c;Ha=a;Ia++;throw Ha;},c:function(){return N("")},e:function(a,b){throw"Array index "+a+" out of bounds: [0,"+b+")";},d:function(a){var b=K.length;a>>>=0;if(2147483648=c;c*=2){var d=b*(1+.2/c);d=Math.min(d,a+100663296);a:{d=(Math.min(2147483648,65536*Math.ceil(Math.max(a,d)/65536))-M.buffer.byteLength+65535)/65536|0;try{M.grow(d);ya();var e=1;break a}catch(g){}e= +void 0}if(e)return!0}return!1},b:function(a,b,c,d){for(var e=0,g=0;g>2],k=L[b+4>>2];b+=8;for(var m=0;m>2]=e;return 0}},P; +P=await (function(){function a(e){e=P=e.exports;E._webidl_free=e.h;E._webidl_malloc=e.i;Oa=E._emscripten_bind_Language_getLanguageCode_0=e.j;Pa=E._emscripten_bind_Language___destroy___0=e.k;Qa=E._emscripten_bind_VoidPtr___destroy___0=e.l;Ra=E._emscripten_bind_LanguageGuess_getPercent_0=e.m;Sa=E._emscripten_bind_LanguageGuess_getLanguageCode_0=e.n;Ta=E._emscripten_bind_LanguageGuess___destroy___0=e.o;Ua=E._emscripten_bind_LanguageInfo_detectLanguageWithLength_3=e.p;Va=E._emscripten_bind_LanguageInfo_detectLanguageWithLength_6= +e.q;Wa=E._emscripten_bind_LanguageInfo_detectLanguage_2=e.r;Xa=E._emscripten_bind_LanguageInfo_detectLanguage_5=e.s;Ya=E._emscripten_bind_LanguageInfo_getIsReliable_0=e.t;Za=E._emscripten_bind_LanguageInfo_getLanguageCode_0=e.u;$a=E._emscripten_bind_LanguageInfo_get_languages_1=e.v;ab=E._emscripten_bind_LanguageInfo___destroy___0=e.w;M=e.f;ya();return P}var b,c,d;return D(function(e){if(1==e.A){b={a:bb};if(E.instantiateWasm)return e.return(new Promise(function(g){E.instantiateWasm(b,function(f,k){g(a(f, +k))})}));null!=O||(O=E.locateFile?E.locateFile("cld2.wasm",F):F+"cld2.wasm");return w(e,Ba(b),2)}c=e.D;d=a(c.instance);return e.return(d)})}()); +(function(){function a(){E.calledRun=!0;if(!ta){xa=!0;Ca(Na);P.g();var b;null==(b=ua)||b(E);var c;null==(c=E.onRuntimeInitialized)||c.call(E);if(E.postRun)for("function"==typeof E.postRun&&(E.postRun=[E.postRun]);E.postRun.length;)b=E.postRun.shift(),Da.push(b);Ca(Da)}}if(E.preRun)for("function"==typeof E.preRun&&(E.preRun=[E.preRun]);E.preRun.length;)Fa();Ca(Ea);E.setStatus?(E.setStatus("Running..."),setTimeout(function(){setTimeout(function(){return E.setStatus("")},1);a()},1)):a()})(); +function Q(){}Q.prototype=Object.create(Q.prototype);Q.prototype.constructor=Q;Q.prototype.J=Q;Q.L={};E.WrapperObject=Q;function cb(a){return(a||Q).L}E.getCache=cb;function R(a,b){var c=cb(b),d=c[a];if(d)return d;d=Object.create((b||Q).prototype);d.B=a;return c[a]=d}E.wrapPointer=R;E.castObject=function(a,b){return R(a.B,b)};E.NULL=R(0);E.destroy=function(a){if(!a.__destroy__)throw"Error: Cannot destroy object. (Did you create it yourself?)";a.__destroy__();delete cb(a.J)[a.B]}; +E.compare=function(a,b){return a.B===b.B};E.getPointer=function(a){return a.B};E.getClass=function(a){return a.J}; +var S={buffer:0,size:0,N:0,R:[],M:0,P:function(){if(S.M){for(var a=0;a=S.size?(0=d?b++:2047>=d?b+=2:55296<=d&&57343>=d?(b+=4,++c):b+=3}b=Array(b+1);d=b.length;c=0;if(0=g){if(c>=d)break;b[c++]=g}else if(2047>=g){if(c+1>=d)break;b[c++]=192|g>>6;b[c++]=128|g&63}else if(65535>=g){if(c+2>=d)break;b[c++]=224|g>>12;b[c++]=128|g>>6&63;b[c++]=128|g&63}else{if(c+3>=d)break;b[c++]=240|g>>18;b[c++]=128|g>>12&63;b[c++]=128|g>> +6&63;b[c++]=128|g&63;e++}}b[c]=0}a=S.alloc(b,wa);for(c=0;cloadCLD2); diff --git a/src/lib/cld2/cld2.wasm b/src/lib/cld2/cld2.wasm index d436d53..7ea55bf 100755 Binary files a/src/lib/cld2/cld2.wasm and b/src/lib/cld2/cld2.wasm differ diff --git a/src/lib/cld2/cldapp.cc b/src/lib/cld2/cldapp.cc index 4750cc5..b6b8b9e 100644 --- a/src/lib/cld2/cldapp.cc +++ b/src/lib/cld2/cldapp.cc @@ -36,13 +36,50 @@ private: class LanguageInfo : public Language { public: + static LanguageInfo* detectLanguageWithLength(const char* buffer, int bufferLength, bool isPlainText) + { + CLD2::Language languages[MAX_RESULTS] = {}; + int percentages[MAX_RESULTS] = {}; + bool isReliable = false; + + int textBytes; + + CLD2::Language bestGuess = DetectLanguageSummary( + buffer, bufferLength, isPlainText, + languages, percentages, &textBytes, + &isReliable); + + return new LanguageInfo(isReliable, bestGuess, languages, percentages); + } + + static LanguageInfo* detectLanguageWithLength(const char* buffer, int bufferLength, bool isPlainText, + const char* tldHint, int encodingHint, + const char* languageHint) + { + CLD2::CLDHints hints = {languageHint, tldHint, encodingHint, CLD2::UNKNOWN_LANGUAGE}; + + CLD2::Language languages[MAX_RESULTS] = {}; + int percentages[MAX_RESULTS] = {}; + bool isReliable = false; + + double scores[MAX_RESULTS]; + int textBytes; + + CLD2::Language bestGuess = ExtDetectLanguageSummary( + buffer, bufferLength, isPlainText, + &hints, 0, + languages, percentages, scores, + nullptr, &textBytes, &isReliable); + + return new LanguageInfo(isReliable, bestGuess, languages, percentages); + } + static LanguageInfo* detectLanguage(const char* buffer, bool isPlainText) { CLD2::Language languages[MAX_RESULTS] = {}; int percentages[MAX_RESULTS] = {}; bool isReliable = false; - // This is ignored. int textBytes; CLD2::Language bestGuess = DetectLanguageSummary( diff --git a/src/server/index.ts b/src/server/index.ts index ffa2af5..23d55e2 100644 --- a/src/server/index.ts +++ b/src/server/index.ts @@ -12,6 +12,7 @@ import swaggerDocument from '@/generated/swagger.json'; import { uiStatic } from '@/middleware/ui.js'; import { swaggerStatic } from '@/middleware/swagger.js'; import { checkForUpdate } from '@/utils/update-checker.js'; +import { VERSION } from '@/version'; export async function run() { const config = getConfig(); @@ -32,7 +33,7 @@ export async function run() { app.use(express.json()); app.use(cors()); if (config.logRequests) { - app.use(requestLogger()); + app.use(requestLogger()); } RegisterRoutes(app); @@ -58,6 +59,7 @@ export async function run() { app.use(errorHandler()); const server = app.listen(parseInt(config.port), config.host, () => { + logger.important(`MTranServer v${VERSION} is running!`); logger.important(`Web UI: http://${config.host}:${config.port}/ui/`); logger.important(`Swagger Docs: http://${config.host}:${config.port}/docs/`); logger.important(`Log level set to: ${config.logLevel}`); diff --git a/src/services/detector.ts b/src/services/detector.ts index ec55cec..dbebd7a 100644 --- a/src/services/detector.ts +++ b/src/services/detector.ts @@ -15,15 +15,69 @@ export interface TextSegment { const DEFAULT_CONFIDENCE_THRESHOLD = 0.5; const MAXIMUM_LANGUAGES_IN_ONE_TEXT = 2; -const MAX_DETECTION_LENGTH = 1024; +const MAX_DETECTION_BYTES = 512; +const MAX_FALLBACK_DETECTION_BYTES = 1024; let cldModule: any = null; let initPromise: Promise | null = null; -function handleCldError(error: any) { +function sanitizeInput(text: string): string { + let sanitized = text.replace(/\0/g, ''); + sanitized = sanitized.replace(/[\x01-\x08\x0B-\x0C\x0E-\x1F\x7F]/g, ''); + return sanitized; +} + +function truncateByUtf8Bytes(text: string, maxBytes: number): string { + const encoder = new TextEncoder(); + const bytes = encoder.encode(text); + + if (bytes.length <= maxBytes) { + return text; + } + + let truncated = bytes.slice(0, maxBytes); + + while (truncated.length > 0) { + try { + return new TextDecoder('utf-8', { fatal: true }).decode(truncated); + } catch { + truncated = truncated.slice(0, -1); + } + } + + return ''; +} + +function validateAndSanitizeInput(text: string, maxBytes: number = MAX_DETECTION_BYTES): string { + if (!text || text.length === 0) { + return text; + } + + const sanitized = sanitizeInput(text); + const truncated = truncateByUtf8Bytes(sanitized, maxBytes); + + if (truncated !== text) { + logger.debug( + `Input sanitized/truncated: ${text.length} → ${truncated.length} chars (limit: ${maxBytes})` + ); + } + + return truncated; +} + +function handleCldError(error: any, context?: { + text?: string; + operation?: string +}) { const errStr = error.toString(); if (errStr.includes('RuntimeError') || errStr.includes('memory access')) { - logger.error(`CLD2 crashed (RuntimeError), resetting module: ${error}`); + logger.error('CLD2 crashed (RuntimeError), resetting module', { + error: errStr, + stack: error.stack, + textLength: context?.text?.length, + textPreview: context?.text?.substring(0, 100), + operation: context?.operation + }); cldModule = null; initPromise = null; } @@ -51,8 +105,13 @@ async function initCLD(): Promise { wasmBinary: wasmBuffer, }); - if (module.LanguageInfo && module.LanguageInfo.prototype && module.LanguageInfo.prototype.detectLanguage) { - module.LanguageInfo.detectLanguage = module.LanguageInfo.prototype.detectLanguage; + if (module.LanguageInfo && module.LanguageInfo.prototype) { + if (module.LanguageInfo.prototype.detectLanguage) { + module.LanguageInfo.detectLanguage = module.LanguageInfo.prototype.detectLanguage; + } + if (module.LanguageInfo.prototype.detectLanguageWithLength) { + module.LanguageInfo.detectLanguageWithLength = module.LanguageInfo.prototype.detectLanguageWithLength; + } } cldModule = module; @@ -66,17 +125,29 @@ async function initCLD(): Promise { return initPromise; } -function detectLanguageWithCLD(text: string, isHTML: boolean = false) { +function detectLanguageWithCLD(text: string, isHTML: boolean = false, maxBytes: number = MAX_DETECTION_BYTES) { if (!cldModule) { throw new Error('CLD2 module not initialized'); } + const validatedText = validateAndSanitizeInput(text, maxBytes); + + if (!validatedText) { + logger.warn('Input validation resulted in empty text'); + return { + language: 'un', + confident: false, + languages: [], + percentScore: 0 + }; + } + const LanguageInfo = cldModule.LanguageInfo; if (!LanguageInfo || !LanguageInfo.detectLanguage) { throw new Error('CLD2 LanguageInfo or detectLanguage not available'); } - const result = LanguageInfo.detectLanguage(text, !isHTML); + const result = LanguageInfo.detectLanguage(validatedText, !isHTML); const languages = Array(3).fill(0).map((_, i) => { const lang = result.get_languages(i); @@ -107,7 +178,7 @@ function bcp47Normalize(code: string): string { } } -export async function detectLanguage(text: string): Promise { +export async function detectLanguage(text: string, maxBytes: number = MAX_DETECTION_BYTES): Promise { if (!text) { return ''; } @@ -115,22 +186,19 @@ export async function detectLanguage(text: string): Promise { await initCLD(); try { - const processText = text.length > MAX_DETECTION_LENGTH - ? text.slice(0, MAX_DETECTION_LENGTH) - : text; - - const result = detectLanguageWithCLD(processText); + const result = detectLanguageWithCLD(text, false, maxBytes); return bcp47Normalize(result.language); } catch (error) { logger.warn(`Language detection failed: ${error}`); - handleCldError(error); + handleCldError(error, { text, operation: 'detectLanguage' }); return 'en'; } } export async function detectLanguageWithConfidence( text: string, - minConfidence: number = DEFAULT_CONFIDENCE_THRESHOLD + minConfidence: number = DEFAULT_CONFIDENCE_THRESHOLD, + maxBytes: number = MAX_DETECTION_BYTES ): Promise<{ language: string; confidence: number }> { if (!text) { return { language: '', confidence: 0 }; @@ -139,11 +207,7 @@ export async function detectLanguageWithConfidence( await initCLD(); try { - const processText = text.length > MAX_DETECTION_LENGTH - ? text.slice(0, MAX_DETECTION_LENGTH) - : text; - - const result = detectLanguageWithCLD(processText); + const result = detectLanguageWithCLD(text, false, maxBytes); const confidence = result.percentScore / 100; if (confidence < minConfidence) { @@ -156,7 +220,7 @@ export async function detectLanguageWithConfidence( }; } catch (error) { logger.warn(`Language detection with confidence failed: ${error}`); - handleCldError(error); + handleCldError(error, { text, operation: 'detectLanguageWithConfidence' }); return { language: 'en', confidence: 0 }; } } @@ -187,6 +251,36 @@ function hasMixedScripts(text: string): boolean { return false; } +function getScriptType(text: string): 'Latin' | 'CJK' | 'Mixed' | 'Other' { + let hasCJK = false; + let hasLatin = false; + + for (const char of text) { + const code = char.charCodeAt(0); + + if ( + (code >= 0x4e00 && code <= 0x9fff) || + (code >= 0x3040 && code <= 0x309f) || + (code >= 0x30a0 && code <= 0x30ff) || + (code >= 0xac00 && code <= 0xd7af) + ) { + hasCJK = true; + } else if ((code >= 0x0041 && code <= 0x005a) || (code >= 0x0061 && code <= 0x007a)) { + hasLatin = true; + } + + if (hasCJK && hasLatin) return 'Mixed'; + } + + if (hasCJK) return 'CJK'; + if (hasLatin) return 'Latin'; + return 'Other'; +} + +function isCJKLanguage(lang: string): boolean { + return ['zh', 'zh-Hans', 'zh-Hant', 'ja', 'ko'].includes(lang) || lang.startsWith('zh-'); +} + export async function detectMultipleLanguages(text: string): Promise { return detectMultipleLanguagesWithThreshold(text, DEFAULT_CONFIDENCE_THRESHOLD); } @@ -201,7 +295,7 @@ export async function detectMultipleLanguagesWithThreshold( await initCLD(); - const fallbackLang = await detectLanguage(text); + const fallbackLang = await detectLanguage(text, MAX_FALLBACK_DETECTION_BYTES); const effectiveFallback = fallbackLang || 'en'; if (!hasMixedScripts(text)) { @@ -220,29 +314,51 @@ export async function detectMultipleLanguagesWithThreshold( const segments: TextSegment[] = []; const segmenterAny = new (Intl as any).Segmenter(undefined, { granularity: 'sentence' }); - const sentenceSegments = Array.from(segmenterAny.segment(text)) as Array<{segment: string, index: number}>; + const sentenceSegments = Array.from(segmenterAny.segment(text)) as Array<{ segment: string, index: number }>; for (const { segment, index } of sentenceSegments) { try { await initCLD(); - const processSegment = segment.length > MAX_DETECTION_LENGTH - ? segment.slice(0, MAX_DETECTION_LENGTH) - : segment; - - const result = detectLanguageWithCLD(processSegment); + const result = detectLanguageWithCLD(segment); const detectedLang = bcp47Normalize(result.language); const confidence = result.percentScore / 100; + const scriptType = getScriptType(segment); + + let finalLang = effectiveFallback; + let usedLogic = 'fallback'; + + if (confidence >= threshold) { + finalLang = detectedLang; + usedLogic = 'confidence'; + } else { + if (scriptType === 'Latin' && isCJKLanguage(effectiveFallback)) { + if (detectedLang && detectedLang !== 'un') { + finalLang = detectedLang; + usedLogic = 'script-override-latin'; + } else { + finalLang = 'en'; + usedLogic = 'script-override-en'; + } + } else if (scriptType === 'CJK' && !isCJKLanguage(effectiveFallback)) { + if (detectedLang && detectedLang !== 'un') { + finalLang = detectedLang; + usedLogic = 'script-override-cjk'; + } + } + } + + logger.debug(`Segment[${segments.length}]: "${segment.replace(/\n/g, '\\n')}" -> lang=${detectedLang}, conf=${confidence.toFixed(2)}, script=${scriptType}, final=${finalLang} (${usedLogic})`); segments.push({ text: segment, - language: confidence >= threshold ? detectedLang : effectiveFallback, + language: finalLang, start: index, end: index + segment.length, confidence }); } catch (error) { logger.warn(`Failed to detect language for segment: ${error}`); - handleCldError(error); + handleCldError(error, { text: segment, operation: 'detectMultipleLanguages' }); segments.push({ text: segment, language: effectiveFallback, diff --git a/tests/cld-fix-validation.test.ts b/tests/cld-fix-validation.test.ts new file mode 100644 index 0000000..86bbed7 --- /dev/null +++ b/tests/cld-fix-validation.test.ts @@ -0,0 +1,53 @@ +import { describe, test, expect } from 'bun:test'; +import { detectLanguage } from '@/services/detector'; + +describe('CLD2 Memory Safety Tests', () => { + test('包含 null 字节的字符串', async () => { + const text = 'Hello\0World'; + const result = await detectLanguage(text); + expect(result).toBeDefined(); + expect(typeof result).toBe('string'); + }); + + test('超长文本(1MB)', async () => { + const text = 'A'.repeat(1024 * 1024); + const result = await detectLanguage(text); + expect(result).toBeDefined(); + }); + + test('混合 UTF-8 多字节字符', async () => { + const text = '你好世界🌍Hello'.repeat(1000); + const result = await detectLanguage(text); + expect(result).toBeDefined(); + }); + + test('控制字符', async () => { + const text = 'Test\x01\x02\x03Text'; + const result = await detectLanguage(text); + expect(result).toBeDefined(); + }); + + test('连续多次检测不崩溃', async () => { + for (let i = 0; i < 100; i++) { + const text = `Test ${i} with special chars 你好\0\x01`; + await detectLanguage(text); + } + }); + + test('空文本', async () => { + const result = await detectLanguage(''); + expect(result).toBe(''); + }); + + test('纯空白字符', async () => { + const text = ' \n\t '; + const result = await detectLanguage(text); + expect(result).toBeDefined(); + }); + + test('emoji 表情符号', async () => { + const text = '🎉🎊🎈🎁🎀'; + const result = await detectLanguage(text); + expect(result).toBeDefined(); + }); +});