🔐Encoding & Escaping
Unicode & UTF-8 Handling
Work with Unicode characters, emojis, and UTF-8 encoding
Explanation
Unicode assigns a unique number (a code point, written U+XXXX) to every character. UTF-8 is the most common encoding of those code points as bytes.
Examples
Emoji encoding
Input
😀
Output
U+1F600 or \uD83D\uDE00
Accented chars
Input
café
Output
c a f \u00E9
Unicode escape
Input
\u0048\u0065\u006C\u006C\u006F
Output
Hello
Code Examples
JavaScript
// String to Unicode code points
/**
 * Break a string into its Unicode code points.
 *
 * @param {string} str - Text to inspect.
 * @returns {Array<{char: string, codePoint: number, hex: string, unicode: string}>}
 *   One entry per code point: the character itself, its numeric code point,
 *   its `U+XXXX` notation, and a JavaScript `\uXXXX` escape sequence.
 */
function stringToCodePoints(str) {
  // Spreading iterates by code point, so a surrogate pair stays in one entry.
  return [...str].map((char) => {
    const code = char.codePointAt(0);
    // A `\uXXXX` escape holds exactly four hex digits, so characters above
    // U+FFFF must be written as two escapes, one per UTF-16 code unit.
    const unicode = char
      .split('')
      .map((unit) => '\\u' + unit.charCodeAt(0).toString(16).padStart(4, '0'))
      .join('');
    return {
      char,
      codePoint: code,
      hex: 'U+' + code.toString(16).toUpperCase().padStart(4, '0'),
      unicode,
    };
  });
}
// Usage
console.log(stringToCodePoints('Hello 😀'));
// [
// { char: 'H', codePoint: 72, hex: 'U+0048', unicode: '\u0048' },
// { char: 'e', codePoint: 101, hex: 'U+0065', unicode: '\u0065' },
// ...
// { char: '😀', codePoint: 128512, hex: 'U+1F600', unicode: '\ud83d\ude00' }
// ]
// Character from code point
// String.fromCodePoint accepts values above U+FFFF directly, whereas
// String.fromCharCode works on 16-bit UTF-16 code units.
const char = String.fromCodePoint(0x1F600); // 😀
// Unicode escape to string
/**
 * Replace each literal `\uXXXX` sequence (exactly four hex digits) in `str`
 * with the UTF-16 code unit it names. Text that is not such an escape is
 * left untouched.
 *
 * @param {string} str - Text containing backslash-u escapes.
 * @returns {string} The text with those escapes decoded.
 */
function unescapeUnicode(str) {
  const decodeUnit = (_escape, digits) =>
    String.fromCharCode(Number.parseInt(digits, 16));
  return str.replace(/\\u([0-9a-fA-F]{4})/g, decodeUnit);
}
// String length (count code points, not UTF-16 units)
const str = 'Hello 😀';
// .length counts UTF-16 code units; 😀 (U+1F600) is outside the BMP and takes two.
console.log(str.length); // 8 (UTF-16 units - wrong for emojis)
// Spreading iterates by code point, so the emoji is counted once.
console.log([...str].length); // 7 (code points - correct)
// Normalize Unicode (combining characters)
// Both spellings render identically, so they are written with escapes: that
// keeps them distinct even if an editor or copy/paste normalizes the file.
const str1 = 'cafe\u0301'; // e + combining acute (U+0065 U+0301)
const str2 = 'caf\u00E9'; // é (single precomposed char, U+00E9)
console.log(str1 === str2); // false
// normalize() defaults to NFC, which composes e + U+0301 into U+00E9.
console.log(str1.normalize() === str2.normalize()); // true
// Remove accents/diacritics
/**
 * Strip combining diacritical marks from `str`.
 *
 * @param {string} str - Text that may contain accented letters.
 * @returns {string} The text with marks in U+0300–U+036F removed.
 */
function removeAccents(str) {
  // NFD splits an accented letter into its base letter plus combining marks.
  const decomposed = str.normalize('NFD');
  const combiningMarks = /[\u0300-\u036f]/g;
  return decomposed.replace(combiningMarks, '');
}
console.log(removeAccents('café')); // 'cafe'
// Detect emoji
/**
 * Report whether `str` contains at least one emoji.
 *
 * @param {string} str - Text to scan.
 * @returns {boolean} True when an emoji character or keycap sequence is found.
 */
function containsEmoji(str) {
  // \p{Emoji} on its own also matches the ASCII characters 0-9, # and *,
  // because they are the base of keycap sequences such as 1️⃣. Count those
  // only when followed by the keycap mark U+20E3 (optionally after U+FE0F).
  const emojiRegex = /[#*0-9]\uFE0F?\u20E3|(?![#*0-9])\p{Emoji}/u;
  return emojiRegex.test(str);
}
// Encode to UTF-8 bytes
/**
 * Encode `str` as UTF-8.
 *
 * @param {string} str - Text to encode.
 * @returns {Uint8Array} The UTF-8 bytes of the text.
 */
function encodeUtf8(str) {
  const encoder = new TextEncoder();
  const utf8Bytes = encoder.encode(str);
  return utf8Bytes;
}
// Decode from UTF-8 bytes
/**
 * Decode UTF-8 `bytes` into a string using a default (non-fatal) TextDecoder.
 *
 * @param {BufferSource} bytes - The encoded bytes.
 * @returns {string} The decoded text.
 */
function decodeUtf8(bytes) {
  const decoder = new TextDecoder();
  return decoder.decode(bytes);
}
// Example: é takes two UTF-8 bytes (195, 169) and 😀 takes four (240, 159, 152, 128).
const bytes = encodeUtf8('café 😀');
console.log(bytes); // Uint8Array [99, 97, 102, 195, 169, 32, 240, 159, 152, 128]
// Check if string is valid UTF-8
// Returns true when `bytes` decodes as UTF-8 without error, false otherwise.
function isValidUtf8(bytes) {
try {
// With { fatal: true } the decoder throws on malformed input instead of
// substituting U+FFFD, which is what makes this usable as a validity check.
new TextDecoder('utf-8', { fatal: true }).decode(bytes);
return true;
} catch {
// Any exception raised while decoding is reported as "not valid".
return false;
}
} Python
# String to Unicode code points
def string_to_codepoints(text):
    """Describe every code point in ``text``.

    Returns a list with one dict per character, holding the character
    (``char``), its integer code point (``codepoint``), its ``U+XXXX``
    notation (``hex``) and a Python escape sequence (``unicode``).
    """
    def escape(codepoint):
        # \u takes exactly four hex digits; code points above U+FFFF need
        # the eight-digit \U form to be a valid Python escape.
        if codepoint > 0xFFFF:
            return f'\\U{codepoint:08x}'
        return f'\\u{codepoint:04x}'

    return [
        {
            'char': char,
            'codepoint': ord(char),
            'hex': f'U+{ord(char):04X}',
            'unicode': escape(ord(char)),
        }
        for char in text
    ]
# Usage
# Prints a list with one dict per code point of the input string.
print(string_to_codepoints('Hello 😀'))
# Character from code point
# chr() is the inverse of ord() and accepts any code point up to 0x10FFFF.
char = chr(0x1F600) # 😀
# Unicode escape to string
# Raw string: without the r prefix Python resolves the \uXXXX escapes in the
# literal itself, so there would be nothing left to decode.
text = r'\u0048\u0065\u006c\u006c\u006f'
# The unicode-escape codec reads bytes as Latin-1. Encoding with latin-1 and
# backslashreplace keeps non-ASCII characters intact, whereas a bare .encode()
# would emit UTF-8 bytes that decode as mojibake.
decoded = text.encode('latin-1', 'backslashreplace').decode('unicode-escape')  # 'Hello'
# Encode to UTF-8 bytes
text = 'café 😀'
utf8_bytes = text.encode('utf-8')
# é encodes to two bytes (\xc3\xa9) and 😀 to four (\xf0\x9f\x98\x80).
print(utf8_bytes) # b'caf\xc3\xa9 \xf0\x9f\x98\x80'
# Decode from UTF-8 bytes
decoded = utf8_bytes.decode('utf-8')
# Normalize Unicode
import unicodedata
# Both spellings render identically, so they are written with escapes: that
# keeps them distinct even if an editor or copy/paste normalizes the file.
str1 = 'cafe\u0301' # e + combining acute (U+0065 U+0301)
str2 = 'caf\u00e9' # é (single precomposed char, U+00E9)
print(str1 == str2) # False
# NFC composes e + U+0301 into U+00E9, so both sides become the same string.
print(unicodedata.normalize('NFC', str1) == unicodedata.normalize('NFC', str2)) # True
# Remove accents
def remove_accents(text):
    """Return ``text`` with nonspacing combining marks (accents) removed."""
    # NFD splits an accented letter into its base letter plus combining marks.
    nfd = unicodedata.normalize('NFD', text)
    # Category 'Mn' (Mark, nonspacing) is the class the combining accents fall in.
    return ''.join(char for char in nfd if unicodedata.category(char) != 'Mn')


print(remove_accents('café')) # 'cafe'
# Get Unicode name
# unicodedata.name() raises ValueError for a character with no name unless a
# default is passed as the second argument.
print(unicodedata.name('😀')) # 'GRINNING FACE'
# Find emoji
import re
# Approximate, block-based detection: the stdlib `re` module has no
# \p{Emoji} property class, so the common emoji blocks are listed by range.
emoji_pattern = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map
    "\U0001F1E0-\U0001F1FF"  # flags
    "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs (e.g. 🤣)
    "\U0001FA70-\U0001FAFF"  # symbols & pictographs extended-A
    "\u2600-\u27BF"          # miscellaneous symbols & dingbats (e.g. ☀, ❤)
    "]"
)
def contains_emoji(text):
return bool(emoji_pattern.search(text))💡 Tips
- Always use UTF-8 encoding for files and databases
- Use [...str] to iterate code points, not UTF-16 units
- Normalize Unicode before comparison
- Be careful with string length (emojis = 2+ UTF-16 units)
- Use TextEncoder/TextDecoder in JavaScript
- Handle combining characters with normalize()
- Consider emoji support in fonts
- Test with various languages and emoji
⚠️ Common Pitfalls
- String.length wrong for emoji (counts UTF-16 units)
- Emoji can be multiple code points (e.g. the family emoji 👨‍👩‍👧‍👦 is four emoji joined by zero-width joiners, U+200D)
- Not normalizing before comparison
- Database column too short for UTF-8
- Slicing strings can break emoji
- Sorting without Unicode collation
- Regex without the u flag may break (surrogate pairs are matched as two separate units, and \p{…} and \u{…} are unavailable)