diff --git a/analyzeMapping.js b/analyzeMapping.js
new file mode 100644
--- /dev/null
+++ b/analyzeMapping.js
@@ -0,0 +1,92 @@
+const diacritics = require("./index.js");
+
+// Collects every character that appears in any replacementList entry's `chars` string.
+function getAllCoveredCharacters(replacementList) {
+  const allChars = new Set();
+  replacementList.forEach((item) => {
+    for (const char of item.chars) {
+      allChars.add(char);
+    }
+  });
+  return allChars;
+}
+
+// Returns the Latin Unicode blocks (inclusive code point bounds) that coverage is reported for.
+function getLatinUnicodeRanges() {
+  return [
+    { name: "Latin-1 Supplement", start: 0x0080, end: 0x00ff },
+    { name: "Latin Extended-A", start: 0x0100, end: 0x017f },
+    { name: "Latin Extended-B", start: 0x0180, end: 0x024f },
+    { name: "Latin Extended Additional", start: 0x1e00, end: 0x1eff },
+    { name: "Latin Extended-C", start: 0x2c60, end: 0x2c7f },
+    { name: "Latin Extended-D", start: 0xa720, end: 0xa7ff },
+  ];
+}
+
+// True when `char` matches the Unicode `Letter` general category.
+function isLetter(char) {
+  return /\p{Letter}/u.test(char);
+}
+
+// Counts the letters in `range`, how many are in `coveredChars`, and lists the uncovered ones.
+function analyzeRangeCoverage(range, coveredChars) {
+  let total = 0;
+  let covered = 0;
+  const missing = [];
+
+  for (let code = range.start; code <= range.end; code++) {
+    const char = String.fromCodePoint(code);
+    if (isLetter(char)) {
+      total++;
+      if (coveredChars.has(char)) {
+        covered++;
+      } else {
+        missing.push(
+          `U+${code.toString(16).toUpperCase().padStart(4, "0")} (${char})`
+        );
+      }
+    }
+  }
+
+  return { total, covered, missing };
+}
+
+// Formats the missing list for display; returns null when empty and abbreviates past 10 entries.
+function formatMissingCharacters(missing) {
+  if (missing.length === 0) {
+    return null;
+  }
+
+  if (missing.length <= 10) {
+    return missing.join(", ");
+  }
+
+  return `${missing.slice(0, 5).join(", ")} ... and ${missing.length - 5} more`;
+}
+
+// Prints one range's coverage summary and, if any, its missing characters.
+function displayRangeCoverage(range, analysis) {
+  // A range with no letters would otherwise print NaN%.
+  const percentage = analysis.total === 0 ? 0 : Math.round((analysis.covered / analysis.total) * 100);
+  console.log(`\n${range.name}: ${analysis.covered}/${analysis.total} (${percentage}%)`);
+
+  const missingFormatted = formatMissingCharacters(analysis.missing);
+  if (missingFormatted) {
+    console.log("Missing:", missingFormatted);
+  }
+}
+
+// Entry point: prints the mapped-character count, then per-range coverage.
+function checkCoverage() {
+  const allChars = getAllCoveredCharacters(diacritics.replacementList);
+  console.log("Total characters covered:", allChars.size);
+
+  const latinRanges = getLatinUnicodeRanges();
+
+  latinRanges.forEach((range) => {
+    const analysis = analyzeRangeCoverage(range, allChars);
+    displayRangeCoverage(range, analysis);
+  });
+}
+
+checkCoverage();
diff --git a/benchmark.js b/benchmark.js
new file mode 100644
--- /dev/null
+++ b/benchmark.js
@@ -0,0 +1,254 @@
+const { remove, replacementList } = require('./index.js');
+
+// Prints a 70-character '=' rule, optionally preceded by a blank line.
+const logSeparator = (newLine = false) => {
+  console.log(
+    (newLine ? '\n' : '') + '='.repeat(70)
+  );
+};
+
+// Prints `title` framed by separator rules.
+const logHeader = (title, newLine = false) => {
+  logSeparator(newLine);
+  console.log(title);
+  logSeparator();
+};
+
+const WARMUP_ITERATIONS = 1000;
+
+// Converts a nanosecond count (BigInt or number) to milliseconds as a Number.
+function nanoToMilli(nano) {
+  return Number(nano) / 1_000_000;
+}
+
+// Calls `fn` repeatedly before timing starts.
+function warmupFunction(fn, iterations = WARMUP_ITERATIONS) {
+  for (let i = 0; i < iterations; i++) {
+    fn();
+  }
+}
+
+// Returns the elapsed milliseconds taken by `iterations` calls of `fn`.
+function measurePerformance(fn, iterations) {
+  const start = process.hrtime.bigint();
+  for (let i = 0; i < iterations; i++) {
+    fn();
+  }
+  const end = process.hrtime.bigint();
+  return nanoToMilli(end - start);
+}
+
+// Derives average milliseconds per call and calls per second from a total time.
+function calculateMetrics(totalTime, iterations) {
+  const avgTime = totalTime / iterations;
+  const opsPerSecond = Math.round(1000 / avgTime);
+  return { totalTime, avgTime, opsPerSecond };
+}
+
+// Prints the metrics produced by calculateMetrics.
+function displayResults({ totalTime, avgTime, opsPerSecond }) {
+  console.log(`Total time: ${totalTime.toFixed(2)}ms`);
+  console.log(`Average time per operation: ${avgTime.toFixed(6)}ms`);
+  console.log(`Operations per second: ${opsPerSecond.toLocaleString()}`);
+}
+
+// Warms up, times and reports `fn`; returns the computed metrics.
+function benchmark(name, fn, iterations = 50000) {
+  logHeader(name, true);
+
+  warmupFunction(fn);
+  const totalTime = measurePerformance(fn, iterations);
+  const metrics = calculateMetrics(totalTime, iterations);
+
+  displayResults(metrics);
+  return metrics;
+}
+
+// Accented lorem-ipsum paragraph with whitespace runs collapsed to single spaces.
+const createLongParagraph = () => `
+  Łorem ipsum dolor sit amet, cōnsectetuer adipīscing elit.
+  Maecenās porttitor congue massa. Fusce posuere, magna sed
+  pulvinar ultricies, purus lectus malesuada libero, sit amet
+  commodo magna eros quis urna. Nunc viverra imperdiet enim.
+  Fusce est. Vivamus a tellus. Pellentesque habitant morbi
+  tristique senectus et netus et malesuada fames ac turpis
+  egestas. Proin pharetra nonummy pede. Mauris et orci.
+  Aenean nec lorem. In porttitor. Donec laoreet nonummy augue.
+`.replace(/\s+/g, ' ').trim();
+
+// Accented Spanish passage repeated 10 times, then whitespace-collapsed.
+const createVeryLongText = () => `
+  Ñoñó, eñ el año mil ñovecieñtos ñoveñta y ñueve, eñ el pueblo
+  de Añañuca, vivía uña ñiña llamada Begoña. Begoña teñía uñ
+  sueño: coñvertirse eñ la mejor diseñadora de España. Cada
+  mañaña se levaña tempraño y se poñía a dibujar coñ mucho
+  empeño. Su papá, doñ Toño, y su mamá, doña Coñcepció, la
+  apoyadaƅ eñ todo. Uñ día, mieñtras camińaba por el señdero
+  del cañó, eñcoñtró uñas piedras muy extrañas coñ símƅolos
+  aǹtiguos grabados. Estas piedras teńíaň poderes mágicos que
+  podíaň hacer realidad cualquier sueño. Begoña tomó las piedras
+  y pidió su deseo coñ mucha fe. Al día siguieñte, recibió uña
+  carta de uña uñiversidad prestigiosa de París que la iñvitaba
+  a estudiar diseño. Así fue como Begoña cumplió su sueño gracias
+  a su dedicació y a la magia de aqellas piedras eñcañtadas.
+`.repeat(10).replace(/\s+/g, ' ').trim();
+
+const shortStrings = {
+  simple: "café",
+  basicAccents: "résumé naïve"
+};
+
+const mediumStrings = {
+  sentence: "The quick brown fox jumps over the lazy dog with café and résumé",
+  international: "Iлtèrnåtïonɑlíƶatï߀ԉ",
+  mixed: "Zürich München Köln François José María"
+};
+
+const longStrings = {
+  paragraph: createLongParagraph(),
+  longText: createVeryLongText()
+};
+
+const edgeCases = {
+  empty: "",
+  noAccents: "Hello World 123",
+  onlyAccents: "àáâãäåæçèéêëìíîïñòóôõöøùúûüý",
+  numbers: "123 456 789",
+  specialChars: "!@#$%^&*()_+-=[]{}|;':\",./<>?"
+};
+
+const unicodeEdgeCases = {
+  emoji: "Hello 👋 world 🌍 with café ☕",
+  mixedScript: "Hello мир café 世界 السلام עולם"
+};
+
+const realWorldExamples = {
+  names: "José María García-González François Müller Søren Østergård",
+  cities: "São Paulo München Zürich Kraków Москва",
+  words: "naïve résumé fiancé café piñata jalapeño"
+};
+
+const testData = {
+  ...shortStrings,
+  ...mediumStrings,
+  ...longStrings,
+  ...edgeCases,
+  ...unicodeEdgeCases,
+  ...realWorldExamples
+};
+
+const DENSITY_TEST_ITERATIONS = 75000;
+const MEMORY_TEST_ITERATIONS = 10000;
+const TOP_PERFORMERS_COUNT = 5;
+const SAMPLE_KEYS = ['simple', 'international', 'mixed', 'cities'];
+const MAX_DISPLAY_LENGTH = 50;
+const TRUNCATE_LENGTH = 47;
+
+const densityTests = {
+  '0% accents': 'Hello World Test String',
+  '25% accents': 'Héllo Wórld Tést Stríng',
+  '50% accents': 'Héllö Wórld Tést Strïng',
+  '75% accents': 'Héllö Wörlđ Tést Strïñg',
+  '100% accents': 'Héllö Wörlđ Tést Strïñğ'
+};
+
+// Benchmarks every testData entry and returns one result record per entry.
+function runMainBenchmarks() {
+  const results = [];
+  Object.entries(testData).forEach(([name, text]) => {
+    const description = `${name} (${text.length} chars)`;
+    const result = benchmark(description, () => remove(text));
+    results.push({ name, length: text.length, ...result });
+  });
+  return results;
+}
+
+// Benchmarks each densityTests string.
+function runDensityBenchmarks() {
+  Object.entries(densityTests).forEach(([density, text]) => {
+    benchmark(density, () => remove(text), DENSITY_TEST_ITERATIONS);
+  });
+}
+
+// Runs remove() on the long text repeatedly; returns memory snapshots taken before and after.
+function runMemoryTest() {
+  const memBefore = process.memoryUsage();
+  for (let i = 0; i < MEMORY_TEST_ITERATIONS; i++) {
+    remove(testData.longText);
+  }
+  const memAfter = process.memoryUsage();
+  return { memBefore, memAfter };
+}
+
+// Prints the RSS and heap-used deltas in MB.
+function displayMemoryUsage({ memBefore, memAfter }) {
+  const rssDiff = (memAfter.rss - memBefore.rss) / 1024 / 1024;
+  const heapDiff = (memAfter.heapUsed - memBefore.heapUsed) / 1024 / 1024;
+  console.log(`RSS: ${rssDiff.toFixed(2)} MB`);
+  console.log(`Heap Used: ${heapDiff.toFixed(2)} MB`);
+}
+
+// Returns the results ordered by descending ops/second without reordering the caller's array.
+function getTopPerformers(results) {
+  return [...results].sort((a, b) => b.opsPerSecond - a.opsPerSecond);
+}
+
+// Prints the first TOP_PERFORMERS_COUNT entries of an already sorted result list.
+function displayTopPerformers(sortedResults) {
+  console.log('\nTop performers by ops/second:');
+  sortedResults.slice(0, TOP_PERFORMERS_COUNT).forEach((result, i) => {
+    console.log(`${i + 1}. ${result.name}: ${result.opsPerSecond.toLocaleString()} ops/sec`);
+  });
+}
+
+// Shortens text longer than maxLength to truncateAt characters plus '...'.
+function truncateText(text, maxLength = MAX_DISPLAY_LENGTH, truncateAt = TRUNCATE_LENGTH) {
+  return text.length > maxLength ? text.substring(0, truncateAt) + '...' : text;
+}
+
+// Prints input/output pairs for the SAMPLE_KEYS entries of testData.
+function displaySampleOutputs() {
+  console.log('\nSample outputs:');
+  SAMPLE_KEYS.forEach(key => {
+    if (testData[key]) {
+      const input = truncateText(testData[key]);
+      const output = remove(testData[key]);
+      const outputDisplay = truncateText(output);
+      console.log(`${key}: "${input}" → "${outputDisplay}"`);
+    }
+  });
+}
+
+// Sums the lengths of every replacementList entry's `chars` string.
+function getTotalMappableCharacters() {
+  return replacementList.reduce((sum, item) => sum + item.chars.length, 0);
+}
+
+// Prints the total from getTotalMappableCharacters.
+function displayCharacterInfo() {
+  const totalChars = getTotalMappableCharacters();
+  console.log(`Total mappable characters: ${totalChars}`);
+}
+
+// Runs every benchmark section in order and prints the summary.
+function runBenchmarkSuite() {
+  logHeader('DIACRITICS REMOVAL BENCHMARK');
+  const results = runMainBenchmarks();
+
+  logHeader('PERFORMANCE BY ACCENT DENSITY', true);
+  runDensityBenchmarks();
+
+  logHeader('MEMORY USAGE TEST', true);
+  const memoryResults = runMemoryTest();
+  displayMemoryUsage(memoryResults);
+
+  logHeader('PERFORMANCE SUMMARY', true);
+  displayCharacterInfo();
+  const sortedResults = getTopPerformers(results);
+  displayTopPerformers(sortedResults);
+  displaySampleOutputs();
+
+  logHeader('BENCHMARK COMPLETE', true);
+}
+
+runBenchmarkSuite();
diff --git a/index.js b/index.js
--- a/index.js
+++ b/index.js
@@ -306,8 +306,57 @@ for (var i = 0; i < replacementList.length; i += 1) {
   }
 }
 
+// Unicode ranges for characters that have diacritics mappings
+// Each range represents a specific Unicode block with actual character mappings
+var basicLatinRange = '\\u0043'; // Single 'C' character
+var latin1SupplementRange = '\\u0080-\\u00FF'; // Latin-1 Supplement (accented letters)
+var latinExtendedARange = '\\u0100-\\u017F'; // Latin Extended-A
+var latinExtendedBRange = '\\u0180-\\u024F'; // Latin Extended-B
+var ipaExtensionsRange = '\\u0250-\\u02AF'; // IPA Extensions (phonetic symbols)
+var greekCopticRange = '\\u0370-\\u03FF'; // Greek and Coptic
+var cyrillicRange = '\\u0400-\\u04FF'; // Cyrillic
+var cyrillicSupplementRange = '\\u0500-\\u052F'; // Cyrillic Supplement
+var nkoRange = '\\u07C0'; // NKo (single character)
+var cherokeeRange = '\\u13A0-\\u13FF'; // Cherokee
+var phoneticExtensionsRange = '\\u1D00-\\u1D7F'; // Phonetic Extensions
+var latinExtendedAdditionalRange = '\\u1E00-\\u1EFF'; // Latin Extended Additional
+var numberFormsRange = '\\u2180-\\u2189'; // Number Forms (subset)
+var enclosedAlphanumericsRange = '\\u24B6-\\u24E9'; // Enclosed Alphanumerics (circled letters)
+var latinExtendedCRange = '\\u2C60-\\u2C7F'; // Latin Extended-C
+var latinExtendedDRange = '\\uA720-\\uA7FF'; // Latin Extended-D
+var alphabeticPresentationFormsRange = '\\uFB00-\\uFB4F'; // Alphabetic Presentation Forms
+var halfwidthFullwidthFormsRange = '\\uFF00-\\uFFEF'; // Halfwidth and Fullwidth Forms
+
+// Combine all ranges into a single character class pattern
+var diacriticsPattern = new RegExp(
+  '[' +
+  basicLatinRange +
+  latin1SupplementRange +
+  latinExtendedARange +
+  latinExtendedBRange +
+  ipaExtensionsRange +
+  greekCopticRange +
+  cyrillicRange +
+  cyrillicSupplementRange +
+  nkoRange +
+  cherokeeRange +
+  phoneticExtensionsRange +
+  latinExtendedAdditionalRange +
+  numberFormsRange +
+  enclosedAlphanumericsRange +
+  latinExtendedCRange +
+  latinExtendedDRange +
+  alphabeticPresentationFormsRange +
+  halfwidthFullwidthFormsRange +
+  ']',
+  'g'
+);
+
+// Replacement callback: the mapped base string, or the character itself when unmapped
+function replaceDiacritic(c) {
+  return diacriticsMap[c] || c;
+}
+
 function removeDiacritics(str) {
-  return str.replace(/[^\u0000-\u007e]/g, function(c) {
-    return diacriticsMap[c] || c;
-  });
+  return str.replace(diacriticsPattern, replaceDiacritic);
 }
diff --git a/package.json b/package.json
index bd6f057..cd8e69b 100644
--- a/package.json
+++ b/package.json
@@ -4,7 +4,9 @@
   "description": "remove diacritics from strings",
   "main": "index.js",
   "scripts": {
-    "test": "node test/test.js"
+    "test": "node test/test.js",
+    "benchmark": "node benchmark.js",
+    "analyze": "node analyzeMapping.js"
   },
   "repository": {
     "type": "git",
@@ -24,5 +26,10 @@
   },
   "directories": {
     "test": "test"
-  }
+  },
+  "files": [
+    "index.js",
+    "LICENSE",
+    "README.md"
+  ]
 }
diff --git a/test/test.js b/test/test.js
index 637f31b..766a51e 100644
--- a/test/test.js
+++ b/test/test.js
@@ -1,6 +1,14 @@
-var removeDiacritics = require('../').remove,
+var diacritics = require('../'),
+    removeDiacritics = diacritics.remove,
     assert = require('assert');
 
+// Test module exports
+assert.strictEqual(typeof diacritics.remove, 'function', 'remove should be exported as a function');
+assert.strictEqual(typeof diacritics.replacementList, 'object', 'replacementList should be exported');
+assert.strictEqual(typeof diacritics.diacriticsMap, 'object', 'diacriticsMap should be exported');
+assert.strictEqual(Array.isArray(diacritics.replacementList), true, 'replacementList should be an array');
+
+// Original comprehensive tests
 assert.strictEqual(removeDiacritics("Iлtèrnåtïonɑlíƶatï߀ԉ"), "Internationalizati0n");
 
 assert.strictEqual(removeDiacritics("Båcòл ípѕùm ðoɭ߀r ѕït aϻèt âùþê aԉᏧ߀üïlɭê ƃëéf culρá fïlèt ϻiǥnòn cuρiᏧatat ut êлim tòлɢùê."),
@@ -12,3 +20,60 @@ assert.strictEqual(removeDiacritics("hŒllœ"), "hOElloe");
 
 assert.strictEqual(removeDiacritics("Fußball"), "Fussball");
 assert.strictEqual(removeDiacritics("ABCDEFGHIJKLMNOPQRSTUVWXYZé"), "ABCDEFGHIJKLMNOPQRSTUVWXYZe");
+
+// Edge cases
+assert.strictEqual(removeDiacritics(""), "", "Empty string should return empty string");
+assert.strictEqual(removeDiacritics("Hello World"), "Hello World", "ASCII-only string should remain unchanged");
+assert.strictEqual(removeDiacritics("123456789"), "123456789", "Numbers should remain unchanged");
+assert.strictEqual(removeDiacritics("!@#$%^&*()"), "!@#$%^&*()", "Special characters should remain unchanged");
+
+// Mixed content tests (important for optimization validation)
+assert.strictEqual(removeDiacritics("café"), "cafe", "Simple accented word");
+assert.strictEqual(removeDiacritics("résumé"), "resume", "Multiple accents in one word");
+assert.strictEqual(removeDiacritics("naïve"), "naive", "Diaeresis/umlaut");
+assert.strictEqual(removeDiacritics("piñata"), "pinata", "Tilde");
+
+// Unicode range edge cases
+assert.strictEqual(removeDiacritics("Ÿ"), "Y", "Latin-1 Supplement range");
+assert.strictEqual(removeDiacritics("Ā"), "A", "Latin Extended-A range");
+assert.strictEqual(removeDiacritics("ƀ"), "b", "Latin Extended-B range");
+
+// Test ASCII preservation (optimization benefit verification)
+assert.strictEqual(removeDiacritics("The quick brown fox jumps over the lazy dog"),
+    "The quick brown fox jumps over the lazy dog", "Long ASCII string should be unchanged");
+
+// Mixed ASCII and diacritics (common real-world scenario)
+assert.strictEqual(removeDiacritics("José lives in São Paulo"), "Jose lives in Sao Paulo");
+assert.strictEqual(removeDiacritics("François went to Zürich"), "Francois went to Zurich");
+assert.strictEqual(removeDiacritics("Zürich"), "Zurich", "German umlaut");
+
+// Unicode range optimization validation (CJK, Arabic should remain unchanged)
+assert.strictEqual(removeDiacritics("你好世界"), "你好世界", "Chinese characters should remain unchanged");
+assert.strictEqual(removeDiacritics("مرحبا"), "مرحبا", "Arabic text should remain unchanged");
+assert.strictEqual(removeDiacritics("こんにちは"), "こんにちは", "Japanese Hiragana should remain unchanged");
+assert.strictEqual(removeDiacritics("안녕하세요"), "안녕하세요", "Korean text should remain unchanged");
+
+// Mixed content with non-Latin scripts (verify selective processing)
+assert.strictEqual(removeDiacritics("café 咖啡"), "cafe 咖啡", "Latin diacritics processed, CJK preserved");
+assert.strictEqual(removeDiacritics("résumé قهوة"), "resume قهوة", "Latin diacritics processed, Arabic preserved");
+
+// Test whitespace and formatting preservation
+assert.strictEqual(removeDiacritics(" café "), " cafe ", "Leading/trailing spaces preserved");
+assert.strictEqual(removeDiacritics("line1\nliné2"), "line1\nline2", "Newlines preserved");
+assert.strictEqual(removeDiacritics("tab\taccént"), "tab\taccent", "Tabs preserved");
+
+// Test diacriticsMap consistency
+var testChar = "é";
+var expectedBase = "e";
+assert.strictEqual(diacritics.diacriticsMap[testChar], expectedBase, "diacriticsMap should contain mapping for é");
+assert.strictEqual(removeDiacritics(testChar), expectedBase, "removeDiacritics should use diacriticsMap");
+
+// Verify replacementList structure
+assert.strictEqual(diacritics.replacementList.length > 0, true, "replacementList should not be empty");
+diacritics.replacementList.forEach(function(item, index) {
+  assert.strictEqual(typeof item.base, 'string', 'replacementList[' + index + '].base should be string');
+  assert.strictEqual(typeof item.chars, 'string', 'replacementList[' + index + '].chars should be string');
+  assert.strictEqual(item.chars.length > 0, true, 'replacementList[' + index + '].chars should not be empty');
+});
+
+console.log("All tests passed! ✅");