How can I replace diacritics (ă,ş,ţ etc) with their “normal” form (a,s,t) in javascript?
If you want to do it entirely on the client side, I think your only option is with some kind of lookup table. Here’s a starting point, written by a chap called Olavi Ivask on his blog…
/**
 * Replaces accented Latin-1 letters (U+00C0-U+00FF) with an unaccented ASCII letter.
 *
 * Only the Latin-1 Supplement vowels plus N/n and C/c are handled; characters
 * outside those ranges (for example U+0103, a with breve) pass through unchanged.
 * The ligatures U+00C6 / U+00E6 are folded to a single "A" / "a".
 *
 * When a global `document` exists the result is also written to the page, as the
 * original snippet did; the result is returned in every environment.
 *
 * @param {string} s - Text to clean.
 * @returns {string} The text with the known diacritics replaced.
 */
function replaceDiacritics(s)
{
    // Patterns are index-aligned with `chars` below. Explicit \u escapes are used
    // so the classes match the accented code points rather than literal digits.
    var diacritics = [
        /[\u00C0-\u00C6]/g, /[\u00E0-\u00E6]/g, // A, a
        /[\u00C8-\u00CB]/g, /[\u00E8-\u00EB]/g, // E, e
        /[\u00CC-\u00CF]/g, /[\u00EC-\u00EF]/g, // I, i
        // U+00D7 (multiplication sign) and U+00F7 (division sign) sit inside the
        // O/o run and are deliberately skipped.
        /[\u00D2-\u00D6\u00D8]/g, /[\u00F2-\u00F6\u00F8]/g, // O, o
        /[\u00D9-\u00DC]/g, /[\u00F9-\u00FC]/g, // U, u
        /\u00D1/g, /\u00F1/g, // N, n
        /\u00C7/g, /\u00E7/g  // C, c
    ];
    var chars = ['A','a','E','e','I','i','O','o','U','u','N','n','C','c'];

    for (var i = 0; i < diacritics.length; i++)
    {
        s = s.replace(diacritics[i], chars[i]);
    }

    // Keep the original page-writing side effect, but only where a DOM exists.
    if (typeof document !== 'undefined')
    {
        document.write(s);
    }
    return s;
}
You can see this is simply an array of regexes for known diacritic chars, mapping them back onto a “plain” character.
In modern browsers and node.js you can use unicode normalization to decompose those characters followed by a filtering regex.
// Decompose, then drop everything that is not a word character (this removes the combining marks).
str.normalize('NFKD').replace(/[^\w]/g, '')
If you wanted to allow characters such as whitespaces, dashes, etc. you should extend the regex to allow them.
// Keeps word characters, whitespace, ".", "_", "/" and "-"; the dash is last so it is a literal, not a range.
str.normalize('NFKD').replace(/[^\w\s._\/-]/g, '')
// Demo: NFKD splits each accented letter into base letter + combining mark,
// then the \w filter removes the marks and leaves the base letters.
var str = "áàâäãéèëêíìïîóòöôõúùüûñçăşţ";
var asciiStr = str.normalize('NFKD').replace(/[^\w]/g, '');
console.info(str, asciiStr);
NOTE: This method does not work for characters that have no Unicode decomposition into a base letter plus a combining mark, e.g. ø
and ł
A more complete version with case sensitive support, ligatures and whatnot.
Original source at: http://lehelk.com/2011/05/06/script-to-remove-diacritics/
/**
 * Lookup table used by removeDiacritics.
 *
 * Each entry pairs an ASCII replacement (`base`, one or two letters) with a
 * global regex (`letters`) whose character class lists every code point that
 * should be folded to that replacement. The plain ASCII letter itself is the
 * first member of each single-letter class, so it maps to itself.
 */
var defaultDiacriticsRemovalMap = [
  {'base':'A', 'letters':/[\u0041\u24B6\uFF21\u00C0\u00C1\u00C2\u1EA6\u1EA4\u1EAA\u1EA8\u00C3\u0100\u0102\u1EB0\u1EAE\u1EB4\u1EB2\u0226\u01E0\u00C4\u01DE\u1EA2\u00C5\u01FA\u01CD\u0200\u0202\u1EA0\u1EAC\u1EB6\u1E00\u0104\u023A\u2C6F]/g},
  {'base':'AA','letters':/[\uA732]/g},
  {'base':'AE','letters':/[\u00C6\u01FC\u01E2]/g},
  {'base':'AO','letters':/[\uA734]/g},
  {'base':'AU','letters':/[\uA736]/g},
  {'base':'AV','letters':/[\uA738\uA73A]/g},
  {'base':'AY','letters':/[\uA73C]/g},
  {'base':'B', 'letters':/[\u0042\u24B7\uFF22\u1E02\u1E04\u1E06\u0243\u0182\u0181]/g},
  {'base':'C', 'letters':/[\u0043\u24B8\uFF23\u0106\u0108\u010A\u010C\u00C7\u1E08\u0187\u023B\uA73E]/g},
  {'base':'D', 'letters':/[\u0044\u24B9\uFF24\u1E0A\u010E\u1E0C\u1E10\u1E12\u1E0E\u0110\u018B\u018A\u0189\uA779]/g},
  {'base':'DZ','letters':/[\u01F1\u01C4]/g},
  {'base':'Dz','letters':/[\u01F2\u01C5]/g},
  {'base':'E', 'letters':/[\u0045\u24BA\uFF25\u00C8\u00C9\u00CA\u1EC0\u1EBE\u1EC4\u1EC2\u1EBC\u0112\u1E14\u1E16\u0114\u0116\u00CB\u1EBA\u011A\u0204\u0206\u1EB8\u1EC6\u0228\u1E1C\u0118\u1E18\u1E1A\u0190\u018E]/g},
  {'base':'F', 'letters':/[\u0046\u24BB\uFF26\u1E1E\u0191\uA77B]/g},
  {'base':'G', 'letters':/[\u0047\u24BC\uFF27\u01F4\u011C\u1E20\u011E\u0120\u01E6\u0122\u01E4\u0193\uA7A0\uA77D\uA77E]/g},
  {'base':'H', 'letters':/[\u0048\u24BD\uFF28\u0124\u1E22\u1E26\u021E\u1E24\u1E28\u1E2A\u0126\u2C67\u2C75\uA78D]/g},
  {'base':'I', 'letters':/[\u0049\u24BE\uFF29\u00CC\u00CD\u00CE\u0128\u012A\u012C\u0130\u00CF\u1E2E\u1EC8\u01CF\u0208\u020A\u1ECA\u012E\u1E2C\u0197]/g},
  {'base':'J', 'letters':/[\u004A\u24BF\uFF2A\u0134\u0248]/g},
  {'base':'K', 'letters':/[\u004B\u24C0\uFF2B\u1E30\u01E8\u1E32\u0136\u1E34\u0198\u2C69\uA740\uA742\uA744\uA7A2]/g},
  {'base':'L', 'letters':/[\u004C\u24C1\uFF2C\u013F\u0139\u013D\u1E36\u1E38\u013B\u1E3C\u1E3A\u0141\u023D\u2C62\u2C60\uA748\uA746\uA780]/g},
  {'base':'LJ','letters':/[\u01C7]/g},
  {'base':'Lj','letters':/[\u01C8]/g},
  {'base':'M', 'letters':/[\u004D\u24C2\uFF2D\u1E3E\u1E40\u1E42\u2C6E\u019C]/g},
  {'base':'N', 'letters':/[\u004E\u24C3\uFF2E\u01F8\u0143\u00D1\u1E44\u0147\u1E46\u0145\u1E4A\u1E48\u0220\u019D\uA790\uA7A4]/g},
  {'base':'NJ','letters':/[\u01CA]/g},
  {'base':'Nj','letters':/[\u01CB]/g},
  {'base':'O', 'letters':/[\u004F\u24C4\uFF2F\u00D2\u00D3\u00D4\u1ED2\u1ED0\u1ED6\u1ED4\u00D5\u1E4C\u022C\u1E4E\u014C\u1E50\u1E52\u014E\u022E\u0230\u00D6\u022A\u1ECE\u0150\u01D1\u020C\u020E\u01A0\u1EDC\u1EDA\u1EE0\u1EDE\u1EE2\u1ECC\u1ED8\u01EA\u01EC\u00D8\u01FE\u0186\u019F\uA74A\uA74C]/g},
  {'base':'OI','letters':/[\u01A2]/g},
  {'base':'OO','letters':/[\uA74E]/g},
  {'base':'OU','letters':/[\u0222]/g},
  {'base':'P', 'letters':/[\u0050\u24C5\uFF30\u1E54\u1E56\u01A4\u2C63\uA750\uA752\uA754]/g},
  {'base':'Q', 'letters':/[\u0051\u24C6\uFF31\uA756\uA758\u024A]/g},
  {'base':'R', 'letters':/[\u0052\u24C7\uFF32\u0154\u1E58\u0158\u0210\u0212\u1E5A\u1E5C\u0156\u1E5E\u024C\u2C64\uA75A\uA7A6\uA782]/g},
  {'base':'S', 'letters':/[\u0053\u24C8\uFF33\u1E9E\u015A\u1E64\u015C\u1E60\u0160\u1E66\u1E62\u1E68\u0218\u015E\u2C7E\uA7A8\uA784]/g},
  {'base':'T', 'letters':/[\u0054\u24C9\uFF34\u1E6A\u0164\u1E6C\u021A\u0162\u1E70\u1E6E\u0166\u01AC\u01AE\u023E\uA786]/g},
  {'base':'TZ','letters':/[\uA728]/g},
  {'base':'U', 'letters':/[\u0055\u24CA\uFF35\u00D9\u00DA\u00DB\u0168\u1E78\u016A\u1E7A\u016C\u00DC\u01DB\u01D7\u01D5\u01D9\u1EE6\u016E\u0170\u01D3\u0214\u0216\u01AF\u1EEA\u1EE8\u1EEE\u1EEC\u1EF0\u1EE4\u1E72\u0172\u1E76\u1E74\u0244]/g},
  {'base':'V', 'letters':/[\u0056\u24CB\uFF36\u1E7C\u1E7E\u01B2\uA75E\u0245]/g},
  {'base':'VY','letters':/[\uA760]/g},
  {'base':'W', 'letters':/[\u0057\u24CC\uFF37\u1E80\u1E82\u0174\u1E86\u1E84\u1E88\u2C72]/g},
  {'base':'X', 'letters':/[\u0058\u24CD\uFF38\u1E8A\u1E8C]/g},
  {'base':'Y', 'letters':/[\u0059\u24CE\uFF39\u1EF2\u00DD\u0176\u1EF8\u0232\u1E8E\u0178\u1EF6\u1EF4\u01B3\u024E\u1EFE]/g},
  {'base':'Z', 'letters':/[\u005A\u24CF\uFF3A\u0179\u1E90\u017B\u017D\u1E92\u1E94\u01B5\u0224\u2C7F\u2C6B\uA762]/g},
  {'base':'a', 'letters':/[\u0061\u24D0\uFF41\u1E9A\u00E0\u00E1\u00E2\u1EA7\u1EA5\u1EAB\u1EA9\u00E3\u0101\u0103\u1EB1\u1EAF\u1EB5\u1EB3\u0227\u01E1\u00E4\u01DF\u1EA3\u00E5\u01FB\u01CE\u0201\u0203\u1EA1\u1EAD\u1EB7\u1E01\u0105\u2C65\u0250]/g},
  {'base':'aa','letters':/[\uA733]/g},
  {'base':'ae','letters':/[\u00E6\u01FD\u01E3]/g},
  {'base':'ao','letters':/[\uA735]/g},
  {'base':'au','letters':/[\uA737]/g},
  {'base':'av','letters':/[\uA739\uA73B]/g},
  {'base':'ay','letters':/[\uA73D]/g},
  {'base':'b', 'letters':/[\u0062\u24D1\uFF42\u1E03\u1E05\u1E07\u0180\u0183\u0253]/g},
  {'base':'c', 'letters':/[\u0063\u24D2\uFF43\u0107\u0109\u010B\u010D\u00E7\u1E09\u0188\u023C\uA73F\u2184]/g},
  {'base':'d', 'letters':/[\u0064\u24D3\uFF44\u1E0B\u010F\u1E0D\u1E11\u1E13\u1E0F\u0111\u018C\u0256\u0257\uA77A]/g},
  {'base':'dz','letters':/[\u01F3\u01C6]/g},
  {'base':'e', 'letters':/[\u0065\u24D4\uFF45\u00E8\u00E9\u00EA\u1EC1\u1EBF\u1EC5\u1EC3\u1EBD\u0113\u1E15\u1E17\u0115\u0117\u00EB\u1EBB\u011B\u0205\u0207\u1EB9\u1EC7\u0229\u1E1D\u0119\u1E19\u1E1B\u0247\u025B\u01DD]/g},
  {'base':'f', 'letters':/[\u0066\u24D5\uFF46\u1E1F\u0192\uA77C]/g},
  {'base':'g', 'letters':/[\u0067\u24D6\uFF47\u01F5\u011D\u1E21\u011F\u0121\u01E7\u0123\u01E5\u0260\uA7A1\u1D79\uA77F]/g},
  {'base':'h', 'letters':/[\u0068\u24D7\uFF48\u0125\u1E23\u1E27\u021F\u1E25\u1E29\u1E2B\u1E96\u0127\u2C68\u2C76\u0265]/g},
  {'base':'hv','letters':/[\u0195]/g},
  {'base':'i', 'letters':/[\u0069\u24D8\uFF49\u00EC\u00ED\u00EE\u0129\u012B\u012D\u00EF\u1E2F\u1EC9\u01D0\u0209\u020B\u1ECB\u012F\u1E2D\u0268\u0131]/g},
  {'base':'j', 'letters':/[\u006A\u24D9\uFF4A\u0135\u01F0\u0249]/g},
  {'base':'k', 'letters':/[\u006B\u24DA\uFF4B\u1E31\u01E9\u1E33\u0137\u1E35\u0199\u2C6A\uA741\uA743\uA745\uA7A3]/g},
  {'base':'l', 'letters':/[\u006C\u24DB\uFF4C\u0140\u013A\u013E\u1E37\u1E39\u013C\u1E3D\u1E3B\u017F\u0142\u019A\u026B\u2C61\uA749\uA781\uA747]/g},
  {'base':'lj','letters':/[\u01C9]/g},
  {'base':'m', 'letters':/[\u006D\u24DC\uFF4D\u1E3F\u1E41\u1E43\u0271\u026F]/g},
  {'base':'n', 'letters':/[\u006E\u24DD\uFF4E\u01F9\u0144\u00F1\u1E45\u0148\u1E47\u0146\u1E4B\u1E49\u019E\u0272\u0149\uA791\uA7A5]/g},
  {'base':'nj','letters':/[\u01CC]/g},
  {'base':'o', 'letters':/[\u006F\u24DE\uFF4F\u00F2\u00F3\u00F4\u1ED3\u1ED1\u1ED7\u1ED5\u00F5\u1E4D\u022D\u1E4F\u014D\u1E51\u1E53\u014F\u022F\u0231\u00F6\u022B\u1ECF\u0151\u01D2\u020D\u020F\u01A1\u1EDD\u1EDB\u1EE1\u1EDF\u1EE3\u1ECD\u1ED9\u01EB\u01ED\u00F8\u01FF\u0254\uA74B\uA74D\u0275]/g},
  {'base':'oi','letters':/[\u01A3]/g},
  {'base':'ou','letters':/[\u0223]/g},
  {'base':'oo','letters':/[\uA74F]/g},
  {'base':'p','letters':/[\u0070\u24DF\uFF50\u1E55\u1E57\u01A5\u1D7D\uA751\uA753\uA755]/g},
  {'base':'q','letters':/[\u0071\u24E0\uFF51\u024B\uA757\uA759]/g},
  {'base':'r','letters':/[\u0072\u24E1\uFF52\u0155\u1E59\u0159\u0211\u0213\u1E5B\u1E5D\u0157\u1E5F\u024D\u027D\uA75B\uA7A7\uA783]/g},
  {'base':'s','letters':/[\u0073\u24E2\uFF53\u00DF\u015B\u1E65\u015D\u1E61\u0161\u1E67\u1E63\u1E69\u0219\u015F\u023F\uA7A9\uA785\u1E9B]/g},
  {'base':'t','letters':/[\u0074\u24E3\uFF54\u1E6B\u1E97\u0165\u1E6D\u021B\u0163\u1E71\u1E6F\u0167\u01AD\u0288\u2C66\uA787]/g},
  {'base':'tz','letters':/[\uA729]/g},
  {'base':'u','letters':/[\u0075\u24E4\uFF55\u00F9\u00FA\u00FB\u0169\u1E79\u016B\u1E7B\u016D\u00FC\u01DC\u01D8\u01D6\u01DA\u1EE7\u016F\u0171\u01D4\u0215\u0217\u01B0\u1EEB\u1EE9\u1EEF\u1EED\u1EF1\u1EE5\u1E73\u0173\u1E77\u1E75\u0289]/g},
  {'base':'v','letters':/[\u0076\u24E5\uFF56\u1E7D\u1E7F\u028B\uA75F\u028C]/g},
  {'base':'vy','letters':/[\uA761]/g},
  {'base':'w','letters':/[\u0077\u24E6\uFF57\u1E81\u1E83\u0175\u1E87\u1E85\u1E98\u1E89\u2C73]/g},
  {'base':'x','letters':/[\u0078\u24E7\uFF58\u1E8B\u1E8D]/g},
  {'base':'y','letters':/[\u0079\u24E8\uFF59\u1EF3\u00FD\u0177\u1EF9\u0233\u1E8F\u00FF\u1EF7\u1E99\u1EF5\u01B4\u024F\u1EFF]/g},
  {'base':'z','letters':/[\u007A\u24E9\uFF5A\u017A\u1E91\u017C\u017E\u1E93\u1E95\u01B6\u0225\u0240\u2C6C\uA763]/g}
];

// Active replacement table; left unset until the first call so that a custom
// table assigned beforehand takes precedence over the default one.
var changes;

/**
 * Replaces every character listed in the active table with its ASCII base form.
 *
 * @param {string} str - Text to clean.
 * @returns {string} The text with all listed characters replaced; ligatures
 *   expand to two letters (for example U+00C6 becomes "AE").
 */
function removeDiacritics (str) {
  if (!changes) {
    changes = defaultDiacriticsRemovalMap;
  }
  // One global replace per table entry; entries do not overlap, so order is irrelevant.
  for (var i = 0; i < changes.length; i++) {
    str = str.replace(changes[i].letters, changes[i].base);
  }
  return str;
}
A simple modification to the script of Paul. Extend the String-object
/**
 * Returns a copy of this string with accented Latin-1 letters (U+00C0-U+00FF)
 * replaced by an unaccented ASCII letter.
 *
 * Only the Latin-1 Supplement vowels plus N/n and C/c are handled; any other
 * character is left as it is. U+00C6 / U+00E6 fold to a single "A" / "a".
 *
 * @returns {string} The cleaned string (always a primitive string).
 */
String.prototype.removeDiacritics = function() {
    // [pattern, replacement] pairs. Explicit \u escapes make the classes match
    // the accented code points rather than literal digits.
    var diacritics = [
        [/[\u00C0-\u00C6]/g, 'A'],
        [/[\u00E0-\u00E6]/g, 'a'],
        [/[\u00C8-\u00CB]/g, 'E'],
        [/[\u00E8-\u00EB]/g, 'e'],
        [/[\u00CC-\u00CF]/g, 'I'],
        [/[\u00EC-\u00EF]/g, 'i'],
        // U+00D7 and U+00F7 (multiplication / division signs) are skipped on purpose.
        [/[\u00D2-\u00D6\u00D8]/g, 'O'],
        [/[\u00F2-\u00F6\u00F8]/g, 'o'],
        [/[\u00D9-\u00DC]/g, 'U'],
        [/[\u00F9-\u00FC]/g, 'u'],
        [/\u00D1/g, 'N'],
        [/\u00F1/g, 'n'],
        [/\u00C7/g, 'C'],
        [/\u00E7/g, 'c']
    ];
    // `this` is a String wrapper object in sloppy mode; work on a primitive.
    var s = String(this);
    for (var i = 0; i < diacritics.length; i++) {
        s = s.replace(diacritics[i][0], diacritics[i][1]);
    }
    return s;
};
Now you can do:
// Usage demo for the String.prototype.removeDiacritics extension; needs a browser for alert().
var wrongString = "hëllô";
// Intended to alert "hello"; that holds only if removeDiacritics maps ë to e and ô to o.
alert(wrongString.removeDiacritics()); // alerts "hello"
I have ported the Apache Lucene ASCII Folding Filter to JavaScript. You can replace a lot of Unicode characters (including diacritics) to ASCII base forms.
You can find the port on https://github.com/mplatt/fold-to-ascii-js
After integrating the library you could fold strings like that:
//foldToASCII("ăşţ"); -> deprecated
Edit
What I did to get this to work was follow the install instructions in his link then import the lib, and then I could use it.
// The module specifier must be a quoted string literal.
import ASCIIFolder from 'fold-to-ascii';
//Then I can call the following:
ASCIIFolder.foldReplacing("Lörem 🤧 ëripuît") === "Lorem eripuit";
// NOTE(review): the `ast` result mentioned originally belongs to the deprecated
// foldToASCII("ăşţ") call, not to the foldReplacing example on the line above.
Consider the following syntax, where each symbol from a value will be replaced with its key symbol (case-sensitive):
// Plain lowercase letter -> every accented variant (both cases) that folds to it.
var diacritics = {
  a: 'ÀÁÂÃÄÅàáâãäåĀāąĄ',
  c: 'ÇçćĆčČ',
  d: 'đĐďĎ',
  e: 'ÈÉÊËèéêëěĚĒēęĘ',
  i: 'ÌÍÎÏìíîïĪī',
  l: 'łŁ',
  n: 'ÑñňŇńŃ',
  o: 'ÒÓÔÕÕÖØòóôõöøŌō',
  r: 'řŘ',
  s: 'ŠšśŚ',
  t: 'ťŤ',
  u: 'ÙÚÛÜùúûüůŮŪū',
  y: 'ŸÿýÝ',
  z: 'ŽžżŻźŹ'
};

/**
 * Swaps each accented character listed in `diacritics` for its plain letter,
 * keeping the case of the accented character.
 *
 * @param {string} text - Text to clean.
 * @returns {string} The text with the listed characters replaced.
 */
function replaceDiacritics(text) {
  Object.keys(diacritics).forEach(function (plainLetter) {
    var variants = diacritics[plainLetter];
    for (var pos = 0; pos < variants.length; pos += 1) {
      var accented = variants[pos];
      // Skip building a regex for characters that do not occur in the text.
      if (text.indexOf(accented) >= 0) {
        // A variant that equals its own uppercase form is treated as uppercase.
        var isUpper = accented == accented.toUpperCase();
        var replacement = isUpper ? plainLetter.toUpperCase() : plainLetter;
        text = text.replace(new RegExp(accented, 'g'), replacement);
      }
    }
  });
  return text;
}

replaceDiacritics('ŁÁŘŠÓÑ'); // LARSON
replaceDiacritics('Łąřśøń'); // Larson
You would need a conversion map, something like this:
/**
 * Replaces the accented letters listed in the conversion map with plain ones.
 *
 * @param {string} str - Text to clean.
 * @returns {string} The text with every mapped letter replaced.
 */
function removeAccents(str) {
  // Accented letter -> ASCII replacement.
  const convMap = {
    "ă" : "a",
    "ş" : "s",
    "ţ" : "t"
  };
  let cleaned = str;
  for (const accented in convMap) {
    const everyOccurrence = new RegExp(accented, "g");
    cleaned = cleaned.replace(everyOccurrence, convMap[accented]);
  }
  return cleaned;
}
Or if you have access to iconv
on your box, you could perhaps use some ajax calls to remove the accents with iconv’s //TRANSLIT parameter.
This method is a refactor of the first response.
/**
 * Replaces accented Latin-1 letters (U+00C0-U+00FF) with an unaccented ASCII letter.
 *
 * Only the Latin-1 Supplement vowels plus N/n and C/c are handled; any other
 * character is left as it is. U+00C6 / U+00E6 fold to a single "A" / "a".
 *
 * @param {string} str - Text to clean.
 * @returns {string} The text with the known diacritics replaced.
 */
function replaceDiacritics(str){
    // `base` is the ASCII replacement, `pattern` the accented code points it covers.
    // Explicit \u escapes make the classes match those code points, not literal digits.
    var diacritics = [
        {base: 'A', pattern: /[\u00C0-\u00C6]/g},
        {base: 'a', pattern: /[\u00E0-\u00E6]/g},
        {base: 'E', pattern: /[\u00C8-\u00CB]/g},
        {base: 'e', pattern: /[\u00E8-\u00EB]/g},
        {base: 'I', pattern: /[\u00CC-\u00CF]/g},
        {base: 'i', pattern: /[\u00EC-\u00EF]/g},
        // U+00D7 and U+00F7 (multiplication / division signs) are skipped on purpose.
        {base: 'O', pattern: /[\u00D2-\u00D6\u00D8]/g},
        {base: 'o', pattern: /[\u00F2-\u00F6\u00F8]/g},
        {base: 'U', pattern: /[\u00D9-\u00DC]/g},
        {base: 'u', pattern: /[\u00F9-\u00FC]/g},
        {base: 'N', pattern: /\u00D1/g},
        {base: 'n', pattern: /\u00F1/g},
        {base: 'C', pattern: /\u00C7/g},
        {base: 'c', pattern: /\u00E7/g}
    ];

    diacritics.forEach(function(entry){
        str = str.replace(entry.pattern, entry.base);
    });

    return str;
}
Try this.
/**
 * Replaces Czech/Latin accented letters with their plain counterparts.
 *
 * The two strings below are position-aligned: the character at index k of the
 * first is replaced by the character at index k of the second.
 *
 * @param {string} text - Text to clean.
 * @returns {string} The text with the listed characters replaced.
 */
function replaceDiacritics(text) {
  const accentedChars = "ÁÄČÇĎÉĚËÍŇÓÖŘŠŤÚŮÜÝŽáäčçďéěëíňóöřšťúůüýž";
  const plainChars = "AACCDEEEINOORSTUUUYZaaccdeeeinoorstuuuyz";

  // Replace every occurrence of the accented character at `slot`.
  const swapAll = (source, slot) =>
    source.replace(new RegExp(accentedChars[slot], 'g'), plainChars[slot]);

  // Long input: cheaper to walk the fixed table than the text.
  if (!(text.length < accentedChars.length)) {
    for (let slot = 0; slot < accentedChars.length; slot += 1) {
      text = swapAll(text, slot);
    }
    return text;
  }

  // Short input: only look up the characters that are actually present.
  for (let pos = 0; pos < text.length; pos += 1) {
    const slot = accentedChars.indexOf(text[pos]);
    if (slot > -1) {
      text = swapAll(text, slot);
    }
  }
  return text;
}
Here is an ES2020 version of the accepted low-level solution. If you know which letters need to be replaced, create a mapping and look each letter up in it
to map letters one by one (unmapped letters are kept as they are):
// Accented letter -> plain replacement; extend with whatever letters are needed.
const mapping = {
  'ă': 'a',
  'ş': 's',
  'ţ': 't'
};

/**
 * Replaces each mapped letter of `s`; letters without a mapping are kept.
 *
 * @param {string} s - Text to clean.
 * @returns {string} The text with the mapped letters replaced.
 */
const replaceDiacritics = s => {
  let folded = '';
  for (const unit of s.split('')) {
    folded += mapping[unit] ?? unit;
  }
  return folded;
};