🔐Encoding & Escaping

Unicode & UTF-8 Handling

Work with Unicode characters, emojis, and UTF-8 encoding

Explanation

Unicode assigns a unique number (a code point, written like U+00E9) to every character. UTF-8 is the most common encoding for Unicode: it stores each code point as one to four bytes.

Examples

Emoji encoding
Input
😀
Output
U+1F600 or \uD83D\uDE00
Accented chars
Input
café
Output
c a f \u00E9
Unicode escape
Input
\u0048\u0065\u006C\u006C\u006F
Output
Hello

Code Examples

JavaScript
// String to Unicode code points
function stringToCodePoints(str) {
  return [...str].map(char => {
    const code = char.codePointAt(0);
    return {
      char,
      codePoint: code,
      hex: 'U+' + code.toString(16).toUpperCase().padStart(4, '0'),
      unicode: '\\u' + code.toString(16).padStart(4, '0')
    };
  });
}

// Usage
console.log(stringToCodePoints('Hello 😀'));
// [
//   { char: 'H', codePoint: 72, hex: 'U+0048', unicode: '\u0048' },
//   { char: 'e', codePoint: 101, hex: 'U+0065', unicode: '\u0065' },
//   ...
//   { char: '😀', codePoint: 128512, hex: 'U+1F600', unicode: '\u1f600' }
// ]

// Character from code point
const char = String.fromCodePoint(0x1F600); // 😀

// Unicode escape to string
function unescapeUnicode(str) {
  return str.replace(/\\u([0-9a-fA-F]{4})/g, (match, hex) => {
    return String.fromCharCode(parseInt(hex, 16));
  });
}

// String length (count code points, not UTF-16 units)
const str = 'Hello 😀';
console.log(str.length); // 8 (UTF-16 units - wrong for emojis)
console.log([...str].length); // 7 (code points - correct)

// Normalize Unicode (combining characters)
const str1 = 'café'; // e + combining acute
const str2 = 'café'; // é (single char)
console.log(str1 === str2); // false
console.log(str1.normalize() === str2.normalize()); // true

// Remove accents/diacritics
function removeAccents(str) {
  return str
    .normalize('NFD')
    .replace(/[\u0300-\u036f]/g, '');
}

console.log(removeAccents('café')); // 'cafe'

// Detect emoji
function containsEmoji(str) {
  const emojiRegex = /\p{Emoji}/u;
  return emojiRegex.test(str);
}

// Encode to UTF-8 bytes
function encodeUtf8(str) {
  return new TextEncoder().encode(str);
}

// Decode from UTF-8 bytes
function decodeUtf8(bytes) {
  return new TextDecoder().decode(bytes);
}

const bytes = encodeUtf8('café 😀');
console.log(bytes); // Uint8Array [99, 97, 102, 195, 169, 32, 240, 159, 152, 128]

// Check if string is valid UTF-8
function isValidUtf8(bytes) {
  try {
    new TextDecoder('utf-8', { fatal: true }).decode(bytes);
    return true;
  } catch {
    return false;
  }
}
Python
# String to Unicode code points
def string_to_codepoints(text):
    return [
        {
            'char': char,
            'codepoint': ord(char),
            'hex': f'U+{ord(char):04X}',
            'unicode': f'\\u{ord(char):04x}'
        }
        for char in text
    ]

# Usage
print(string_to_codepoints('Hello 😀'))

# Character from code point
char = chr(0x1F600)  # 😀

# Unicode escape to string
text = '\u0048\u0065\u006c\u006c\u006f'
decoded = text.encode().decode('unicode-escape')

# Encode to UTF-8 bytes
text = 'café 😀'
utf8_bytes = text.encode('utf-8')
print(utf8_bytes)  # b'caf\xc3\xa9 \xf0\x9f\x98\x80'

# Decode from UTF-8 bytes
decoded = utf8_bytes.decode('utf-8')

# Normalize Unicode
import unicodedata

str1 = 'café'  # e + combining acute
str2 = 'café'  # é (single char)
print(str1 == str2)  # False
print(unicodedata.normalize('NFC', str1) == unicodedata.normalize('NFC', str2))  # True

# Remove accents
def remove_accents(text):
    nfd = unicodedata.normalize('NFD', text)
    return ''.join(char for char in nfd if unicodedata.category(char) != 'Mn')

print(remove_accents('café'))  # 'cafe'

# Get Unicode name
print(unicodedata.name('😀'))  # 'GRINNING FACE'

# Find emoji
import re
emoji_pattern = re.compile(
    "[\U0001F600-\U0001F64F]|"  # emoticons
    "[\U0001F300-\U0001F5FF]|"  # symbols & pictographs
    "[\U0001F680-\U0001F6FF]|"  # transport & map
    "[\U0001F1E0-\U0001F1FF]"   # flags
)

def contains_emoji(text):
    return bool(emoji_pattern.search(text))

💡 Tips

  • Always use UTF-8 encoding for files and databases
  • Use [...str] to iterate code points, not UTF-16 units
  • Normalize Unicode before comparison
  • Be careful with string length (emojis = 2+ UTF-16 units)
  • Use TextEncoder/TextDecoder in JavaScript
  • Handle combining characters with normalize()
  • Consider emoji support in fonts
  • Test with various languages and emoji

⚠️ Common Pitfalls

  • String.length wrong for emoji (counts UTF-16 units)
  • Emoji can be multiple code points (👨‍👩‍👧‍👦)
  • Not normalizing before comparison
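  • Database column or charset too small for UTF-8 (lengths may be counted in bytes, and a 3-byte charset such as MySQL's legacy utf8 cannot store emoji; use utf8mb4)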
  • Slicing strings can break emoji
  • Sorting without Unicode collation
  • Regex without the u flag may break on characters outside the BMP (e.g. emoji)