String Processing
Regular Expression Patterns
Real applications need to validate user input, extract data from logs, and parse structured text. This page provides ready-to-use regex patterns for common tasks: email validation, phone number formatting, URL extraction, and data parsing. These patterns serve as building blocks for text processing pipelines.
Splitting with Patterns
email.py
# Email validation patterns
import re
# Email patterns
# Basic email pattern: local part, "@", dotted domain, then a TLD of 2+ letters
basic_email = r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$"
emails = [
    "user@example.com",
    "first.last@domain.co.uk",
    "user+tag@example.org",
    "invalid@",
    "@invalid.com",
    "no-at-sign.com",
    "user@domain",
    "user@domain.c",
]
print("Basic email validation:")
pattern = re.compile(basic_email)
for email in emails:
    valid = bool(pattern.match(email))
    print(f" {email}: {valid}")

# Extract email parts (three capture groups: username, domain, TLD)
email_pattern = r"^([a-zA-Z0-9._%+-]+)@([a-zA-Z0-9.-]+)\.([a-zA-Z]{2,})$"
extract_pattern = re.compile(email_pattern)
print("\nExtract email parts:")
# NOTE(review): the sample address was missing in the source (bare
# `test_email =`); this value is a stand-in so the example runs.
test_email = "john.doe@example.com"
match = extract_pattern.match(test_email)
if match:
    print(f" Email: {test_email}")
    print(f" Username: {match.group(1)}")
    print(f" Domain: {match.group(2)}")
    print(f" TLD: {match.group(3)}")

# Find all emails in text (no ^/$ anchors so matches can occur anywhere)
text = """
Contact us at support@example.com or sales@company.org.
For urgent matters, email admin@service.net immediately.
"""
find_pattern = re.compile(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}")
found_emails = find_pattern.findall(text)
print("\nEmails found in text:")
for email in found_emails:
    print(f" {email}")

# More strict pattern (TLD limited to 2-6 letters)
strict_email = r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6}$"
strict_pattern = re.compile(strict_email)
print("\nStrict validation:")
test_emails = [
    "user@domain.com",
    "user@domain.co",
    "user@domain.technology",  # 10 chars TLD
]
for email in test_emails:
    valid = bool(strict_pattern.match(email))
    print(f" {email}: {valid}")

# Named groups: same structure as email_pattern, but parts are read by name
named_pattern = r"^(?P<username>[a-zA-Z0-9._%+-]+)@(?P<domain>[a-zA-Z0-9.-]+)\.(?P<tld>[a-zA-Z]{2,})$"
named_re = re.compile(named_pattern)
match2 = named_re.match("alice@example.com")
if match2:
    print("\nNamed groups:")
    print(f" Username: {match2.group('username')}")
    print(f" Domain: {match2.group('domain')}")
    print(f" TLD: {match2.group('tld')}")
# Email validation patterns
import re
# Email patterns
# Basic email pattern: local part, "@", dotted domain, then a TLD of 2+ letters
basic_email = r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$"
emails = [
    "user@example.com",
    "first.last@domain.co.uk",
    "user+tag@example.org",
    "invalid@",
    "@invalid.com",
    "no-at-sign.com",
    "user@domain",
    "user@domain.c",
]
print("Basic email validation:")
pattern = re.compile(basic_email)
for email in emails:
    valid = bool(pattern.match(email))
    print(f" {email}: {valid}")

# Extract email parts (three capture groups: username, domain, TLD)
email_pattern = r"^([a-zA-Z0-9._%+-]+)@([a-zA-Z0-9.-]+)\.([a-zA-Z]{2,})$"
extract_pattern = re.compile(email_pattern)
print("\nExtract email parts:")
# NOTE(review): the sample address was missing in the source (bare
# `test_email =`); this value is a stand-in so the example runs.
test_email = "john.doe@example.com"
match = extract_pattern.match(test_email)
if match:
    print(f" Email: {test_email}")
    print(f" Username: {match.group(1)}")
    print(f" Domain: {match.group(2)}")
    print(f" TLD: {match.group(3)}")

# Find all emails in text (no ^/$ anchors so matches can occur anywhere)
text = """
Contact us at support@example.com or sales@company.org.
For urgent matters, email admin@service.net immediately.
"""
find_pattern = re.compile(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}")
found_emails = find_pattern.findall(text)
print("\nEmails found in text:")
for email in found_emails:
    print(f" {email}")

# More strict pattern (TLD limited to 2-6 letters)
strict_email = r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6}$"
strict_pattern = re.compile(strict_email)
print("\nStrict validation:")
test_emails = [
    "user@domain.com",
    "user@domain.co",
    "user@domain.technology",  # 10 chars TLD
]
for email in test_emails:
    valid = bool(strict_pattern.match(email))
    print(f" {email}: {valid}")

# Named groups: same structure as email_pattern, but parts are read by name
named_pattern = r"^(?P<username>[a-zA-Z0-9._%+-]+)@(?P<domain>[a-zA-Z0-9.-]+)\.(?P<tld>[a-zA-Z]{2,})$"
named_re = re.compile(named_pattern)
match2 = named_re.match("alice@example.com")
if match2:
    print("\nNamed groups:")
    print(f" Username: {match2.group('username')}")
    print(f" Domain: {match2.group('domain')}")
    print(f" TLD: {match2.group('tld')}")
# Email validation patterns
import re
# Email patterns
# Basic email pattern: local part, "@", dotted domain, then a TLD of 2+ letters
basic_email = r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$"
emails = [
    "user@example.com",
    "first.last@domain.co.uk",
    "user+tag@example.org",
    "invalid@",
    "@invalid.com",
    "no-at-sign.com",
    "user@domain",
    "user@domain.c",
]
print("Basic email validation:")
pattern = re.compile(basic_email)
for email in emails:
    valid = bool(pattern.match(email))
    print(f" {email}: {valid}")

# Extract email parts (three capture groups: username, domain, TLD)
email_pattern = r"^([a-zA-Z0-9._%+-]+)@([a-zA-Z0-9.-]+)\.([a-zA-Z]{2,})$"
extract_pattern = re.compile(email_pattern)
print("\nExtract email parts:")
# NOTE(review): the sample address was missing in the source (bare
# `test_email =`); this value is a stand-in so the example runs.
test_email = "john.doe@example.com"
match = extract_pattern.match(test_email)
if match:
    print(f" Email: {test_email}")
    print(f" Username: {match.group(1)}")
    print(f" Domain: {match.group(2)}")
    print(f" TLD: {match.group(3)}")

# Find all emails in text (no ^/$ anchors so matches can occur anywhere)
text = """
Contact us at support@example.com or sales@company.org.
For urgent matters, email admin@service.net immediately.
"""
find_pattern = re.compile(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}")
found_emails = find_pattern.findall(text)
print("\nEmails found in text:")
for email in found_emails:
    print(f" {email}")

# More strict pattern (TLD limited to 2-6 letters)
strict_email = r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6}$"
strict_pattern = re.compile(strict_email)
print("\nStrict validation:")
test_emails = [
    "user@domain.com",
    "user@domain.co",
    "user@domain.technology",  # 10 chars TLD
]
for email in test_emails:
    valid = bool(strict_pattern.match(email))
    print(f" {email}: {valid}")

# Named groups: same structure as email_pattern, but parts are read by name
named_pattern = r"^(?P<username>[a-zA-Z0-9._%+-]+)@(?P<domain>[a-zA-Z0-9.-]+)\.(?P<tld>[a-zA-Z]{2,})$"
named_re = re.compile(named_pattern)
match2 = named_re.match("alice@example.com")
if match2:
    print("\nNamed groups:")
    print(f" Username: {match2.group('username')}")
    print(f" Domain: {match2.group('domain')}")
    print(f" TLD: {match2.group('tld')}")
phone.py
# Phone number patterns
import re
# Phone patterns
# US phone formats, each checked separately with fullmatch (whole string must match)
patterns = [
    r"\(\d{3}\)\s\d{3}-\d{4}",  # (123) 456-7890
    r"\d{3}-\d{3}-\d{4}",  # 123-456-7890
    r"\d{10}",  # 1234567890
]
phones = [
    "(555) 123-4567",
    "555-123-4567",
    "5551234567",
    "(555)123-4567",  # no space
    "555.123.4567",
    "invalid",
]
print("Phone validation:")
for i, pattern_str in enumerate(patterns, 1):
    print(f"\nPattern {i}:")
    p = re.compile(pattern_str)
    for phone in phones:
        valid = bool(p.fullmatch(phone))
        print(f" {phone}: {valid}")

# Combined pattern (any format)
# The dash after a bare area code is optional so that the plain 10-digit
# form (format 3 above) is accepted as well as the dashed form.
any_format = r"^(\(\d{3}\)\s?|\d{3}-?)?\d{3}-?\d{4}$"
any_pattern = re.compile(any_format)
print("\nCombined pattern:")
for phone in phones:
    valid = bool(any_pattern.match(phone))
    print(f" {phone}: {valid}")

# Extract phone parts (groups: area code, exchange, line number)
extract_pattern = r"\((\d{3})\)\s(\d{3})-(\d{4})"
extract = re.compile(extract_pattern)
test_phone = "(555) 123-4567"
match = extract.match(test_phone)
if match:
    print(f"\nExtracted parts from {test_phone}:")
    print(f" Area code: {match.group(1)}")
    print(f" Exchange: {match.group(2)}")
    print(f" Number: {match.group(3)}")

# Find all phones in text
text = """
Call us at (555) 123-4567 or 555-987-6543.
Emergency: (999) 911-0000
"""
find_pattern = re.compile(r"\(?\d{3}\)?[-\s]?\d{3}-\d{4}")
found = find_pattern.findall(text)
print("\nPhones found in text:")
for phone in found:
    print(f" {phone}")

# International format (basic): optional "+", country code, then up to
# three digit groups separated by an optional dash or whitespace
intl_pattern = r"^\+?\d{1,3}[-\s]?\(?\d{1,4}\)?[-\s]?\d{1,4}[-\s]?\d{1,9}$"
intl_p = re.compile(intl_pattern)
intl_phones = [
    "+1 (555) 123-4567",
    "+44 20 7123 4567",
    "+81 3-1234-5678",
]
print("\nInternational phones:")
for phone in intl_phones:
    valid = bool(intl_p.match(phone))
    print(f" {phone}: {valid}")
# Format phone number
def format_phone(digits):
    """Convert 10 digits to (XXX) XXX-XXXX format.

    Args:
        digits: String expected to contain exactly ten ASCII digits.

    Returns:
        The formatted number, or None when the input is not exactly ten
        ASCII digits.
    """
    # str.isdigit() also accepts non-ASCII digit characters (superscripts,
    # other scripts), so compare against "0".."9" explicitly.
    if len(digits) == 10 and all("0" <= ch <= "9" for ch in digits):
        return f"({digits[:3]}) {digits[3:6]}-{digits[6:]}"
    return None


print("\nFormat phone:")
raw = "5551234567"
formatted = format_phone(raw)
print(f" {raw} -> {formatted}")
url.py
# URL patterns
import re
# URL patterns
# Basic URL pattern: http/https scheme, dotted host ending in a 2+ letter TLD,
# then anything
url_pattern = r"^https?://[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}.*$"
urls = [
    "https://example.com",
    "http://www.example.com",
    "https://sub.example.co.uk",
    "https://example.com/path",
    "https://example.com/path?key=value",
    "ftp://example.com",  # wrong protocol
    "example.com",  # missing protocol
    "https://localhost",  # no TLD
]
print("URL validation:")
pattern = re.compile(url_pattern)
for url in urls:
    valid = bool(pattern.match(url))
    print(f" {url}: {valid}")

# Extract URL parts (groups: protocol, domain, optional path)
# The pattern previously had a literal space before "$", which required the
# URL to end with a space and so never matched test_url.
extract_pattern = r"^(https?)://([a-zA-Z0-9.-]+)(/.*)?$"
extract_p = re.compile(extract_pattern)
test_url = "https://www.example.com/path/to/page"
match = extract_p.match(test_url)
if match:
    print(f"\nExtracted parts from {test_url}:")
    print(f" Protocol: {match.group(1)}")
    print(f" Domain: {match.group(2)}")
    print(f" Path: {match.group(3) if match.group(3) else '/'}")
# More detailed extraction (groups: protocol, host, optional port, optional path)
detail_pattern = r"^(https?)://([^:/]+)(?::(\d+))?(/.*)?$"
detail_p = re.compile(detail_pattern)
test_urls = [
    "https://example.com:8080/path",
    "http://localhost:3000/api",
    "https://example.com/page",
]
print("\nDetailed URL parsing:")
for url in test_urls:
    m = detail_p.match(url)
    if m:
        print(f" {url}")
        print(f" Protocol: {m.group(1)}")
        print(f" Host: {m.group(2)}")
        # Optional groups are None when absent; show a readable fallback
        print(f" Port: {m.group(3) if m.group(3) else 'default'}")
        print(f" Path: {m.group(4) if m.group(4) else '/'}")

# Find all URLs in text (a match runs until the next whitespace character)
text = """
Visit https://example.com for more info.
Check out http://test.org/page and https://another.site/path?q=search
"""
find_pattern = re.compile(r"https?://[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}[^\s]*")
found = find_pattern.findall(text)
print("\nURLs found in text:")
for url in found:
    print(f" {url}")

# Query parameters: each "?key=value" / "&key=value" becomes a (key, value) tuple
query_pattern = r"[?&]([^=]+)=([^&]+)"
url_with_query = "https://example.com/search?q=regex&lang=python&page=1"
query_p = re.compile(query_pattern)
params = query_p.findall(url_with_query)
print(f"\nQuery parameters from: {url_with_query}")
for key, value in params:
    print(f" {key} = {value}")

# Named groups for URL parsing
named_pattern = r"^(?P<protocol>https?)://(?P<domain>[^:/]+)(?::(?P<port>\d+))?(?P<path>/.*)?$"
named_p = re.compile(named_pattern)
match2 = named_p.match("https://example.com:8080/api/v1")
if match2:
    print("\nNamed groups:")
    print(f" Protocol: {match2.group('protocol')}")
    print(f" Domain: {match2.group('domain')}")
    print(f" Port: {match2.group('port')}")
    print(f" Path: {match2.group('path')}")
extraction.py
# Text extraction with regex
import re
# Extract hashtags
def extract_hashtags(text):
    """Extract all hashtags from text.

    Returns:
        A list of every "#" followed by one or more word characters, in
        order of appearance (the "#" is included in each item).
    """
    return re.findall(r"#\w+", text)
# Extract mentions
def extract_mentions(text):
    """Extract all @mentions from text.

    Returns:
        A list of every "@" followed by one or more word characters, in
        order of appearance (the "@" is included in each item).
    """
    # The lookbehind rejects an "@" that directly follows a word character,
    # so the domain part of an address like john@example.com is not
    # reported as a mention.
    return re.findall(r"(?<!\w)@\w+", text)
# Extract numbers
def extract_numbers(text):
    """Extract all numbers (including decimals and negatives).

    Returns:
        A list of floats, one per match, in order of appearance.
    """
    # Every match contains at least one digit (the pattern requires \d+), so
    # no filtering for empty or "-" matches is needed before converting.
    matches = re.findall(r"-?\d+\.?\d*", text)
    return [float(m) for m in matches]
# Extract dates
def extract_dates(text):
    """Extract dates in YYYY-MM-DD format.

    Only the digit layout (4-2-2 separated by hyphens) is checked; month and
    day values are not validated.

    Returns:
        A list of the matching substrings, in order of appearance.
    """
    return re.findall(r"\d{4}-\d{2}-\d{2}", text)
# Main test
# NOTE(review): the source had lost all indentation; everything below is
# assumed to belong to the __main__ guard - confirm against the original.
if __name__ == "__main__":
    # Social media text
    tweet = """
Loving #python and #regex! Thanks @copilot for the help.
Check out #programming tips at https://example.com
Mentions: @user1 @user2 #coding
"""
    print("Social media extraction:")
    print(f"Hashtags: {extract_hashtags(tweet)}")
    print(f"Mentions: {extract_mentions(tweet)}")

    # Numbers
    data_text = "Prices: $19.99, $5.50, and $100. Temperature: -5.5°C"
    print("\nNumbers extraction:")
    print(f"Numbers: {extract_numbers(data_text)}")

    # Dates
    log_text = """
2025-01-29: Error occurred
2025-01-30: Fixed bug
2025-02-01: Deployed
"""
    print("\nDates extraction:")
    print(f"Dates: {extract_dates(log_text)}")

    # IP addresses (shape only: four dot-separated groups of 1-3 digits)
    server_log = "Requests from 192.168.1.1, 10.0.0.5, and 172.16.0.10"
    ips = re.findall(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", server_log)
    print("\nIP addresses:")
    for ip in ips:
        print(f" {ip}")

    # Extract quoted strings (the group captures the text between the quotes)
    quote_text = 'He said "Hello" and she replied "Hi there!"'
    quotes = re.findall(r'"([^"]+)"', quote_text)
    print("\nQuoted strings:")
    for quote in quotes:
        print(f" {quote}")

    # Key-value pairs
    config = "name=John age=30 city=NYC email=john@example.com"
    kv_pairs = re.findall(r"(\w+)=(\S+)", config)
    print("\nKey-value pairs:")
    for key, value in kv_pairs:
        print(f" {key} = {value}")

    # HTML tags (simple)
    html = "<div>Content</div><span>Text</span>"
    # Using backreference \1 to match closing tag
    tags = re.findall(r"<(\w+)>([^<]+)</\1>", html)
    print("\nHTML content:")
    for tag, content in tags:
        print(f" <{tag}>: {content}")

    # Email addresses
    text_with_emails = "Contact alice@example.com or bob@test.org for info"
    emails = re.findall(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}", text_with_emails)
    print("\nEmail addresses:")
    for email in emails:
        print(f" {email}")

    # Extract words (the hyphen is not a word character, so "quick-brown"
    # yields two words)
    sentence = "The quick-brown fox jumps over the lazy dog."
    words = re.findall(r"\b\w+\b", sentence)
    print(f"\nWords: {words}")

    # Extract capitalized words
    caps = re.findall(r"\b[A-Z]\w*\b", "Python, Java, and JavaScript are Languages")
    print(f"Capitalized words: {caps}")
replacement.py
# String replacement with regex
import re
# Basic replacement
# Simple sub: every occurrence of the pattern is replaced
text1 = "Hello World, Hello Python"
result1 = re.sub(r"Hello", "Hi", text1)
print(f"Original: {text1}")
print(f"Replaced: {result1}")

# Replace with pattern
text2 = "Call 555-1234 or 555-5678"
result2 = re.sub(r"\d{3}-\d{4}", "XXX-XXXX", text2)
print("\nMask phone numbers:")
print(f"Original: {text2}")
print(f"Masked: {result2}")

# Replace with groups: \1, \2, \3 refer to the captured year, month, day
text3 = "2025-01-29"
result3 = re.sub(r"(\d{4})-(\d{2})-(\d{2})", r"\2/\3/\1", text3)
print("\nReformat date:")
print(f"Original (YYYY-MM-DD): {text3}")
print(f"Reformatted (MM/DD/YYYY): {result3}")

# Remove extra whitespace
# NOTE(review): the sample presumably contained runs of spaces that were
# collapsed in the source; restored here so the demo shows an effect.
text4 = "Too    many     spaces"
result4 = re.sub(r"\s+", " ", text4)
print("\nNormalize whitespace:")
print(f"Original: '{text4}'")
print(f"Normalized: '{result4}'")

# Remove HTML tags
html = "<p>Hello <b>World</b></p>"
result5 = re.sub(r"<[^>]+>", "", html)
print("\nRemove HTML:")
print(f"Original: {html}")
print(f"Clean: {result5}")

# Censor profanity (example); \b keeps the match to whole words
text6 = "This is bad and terrible"
result6 = re.sub(r"\b(bad|terrible)\b", "***", text6)
print("\nCensor words:")
print(f"Original: {text6}")
print(f"Censored: {result6}")

# Format currency
# The optional fractional part is matched together with the integer part;
# matching bare \d+ would prefix the decimals too ("$1234.$56").
text7 = "Price: 1234.56"
result7 = re.sub(r"(\d+(?:\.\d+)?)", r"$\1", text7)
print("\nAdd currency:")
print(f"Original: {text7}")
print(f"Formatted: {result7}")
# Replacement with function
def uppercase_match(match):
    """Convert matched word to uppercase.

    Args:
        match: The match object that re.sub passes for each occurrence.

    Returns:
        The whole matched text (group 0), upper-cased.
    """
    return match.group(0).upper()


text8 = "one two three"
# re.sub calls uppercase_match once per match and inserts its return value
result8 = re.sub(r"\w+", uppercase_match, text8)
print("\nCustom replacement (function):")
print(f"Original: {text8}")
print(f"Uppercase: {result8}")
# Advanced: swap first and last name
# Each "First Last" pair becomes "Last, First"; \1 and \2 are the two words
names = "John Doe, Jane Smith, Bob Johnson"
result9 = re.sub(r"(\w+)\s(\w+)", r"\2, \1", names)
print("\nSwap names:")
print(f"Original: {names}")
print(f"Swapped: {result9}")

# Count replacements: subn returns (new_string, number_of_substitutions)
text10 = "apple apple banana apple"
result10, count = re.subn(r"apple", "orange", text10)
print("\nReplace with count:")
print(f"Original: {text10}")
print(f"Replaced: {result10}")
print(f"Replacements: {count}")

# Replace only first N occurrences
text11 = "a a a a a"
result11 = re.sub(r"a", "b", text11, count=3)
print("\nReplace first 3:")
print(f"Original: {text11}")
print(f"Result: {result11}")

# Named groups in replacement: \g<name> refers to a group by its name
text12 = "John Doe"
pattern = r"(?P<first>\w+)\s(?P<last>\w+)"
result12 = re.sub(pattern, r"\g<last>, \g<first>", text12)
print("\nNamed group replacement:")
print(f"Original: {text12}")
print(f"Result: {result12}")
splitting.py
# String splitting with regex
import re
# Basic splitting
# A single literal delimiter needs no regex: str.split is enough
csv1 = "apple,banana,cherry"
parts1 = csv1.split(",")
print("Split by comma:", f" {parts1}", sep="\n")

# A run of one or more whitespace characters acts as one separator
text1 = "one two three four"
whitespace_re = re.compile(r"\s+")
parts2 = whitespace_re.split(text1)
print("\nSplit by whitespace:", f" {parts2}", sep="\n")

# Any run of ";", "," or ":" separates items
text2 = "apple;banana,cherry:orange"
delimiter_re = re.compile(r"[;,:]+")
parts3 = delimiter_re.split(text2)
print("\nSplit by multiple delimiters:", f" {parts3}", sep="\n")

# At most two splits are performed; the rest of the string stays in one piece
text3 = "one,two,three,four,five"
comma_re = re.compile(r",")
parts4 = comma_re.split(text3, 2)
print("\nSplit with maxsplit (2):", f" {parts4}", sep="\n")

# A capturing group around the delimiter keeps the delimiters in the result
text4 = "one,two;three"
kept_delimiter_re = re.compile(r"([,;])")
parts5 = kept_delimiter_re.split(text4)
print("\nSplit preserving delimiters:", f" {parts5}", sep="\n")

# Split on a hyphen or an underscore
text5 = "hello-world_test"
dash_underscore_re = re.compile(r"[-_]")
parts6 = dash_underscore_re.split(text5)
print("\nSplit by hyphens and underscores:", f" {parts6}", sep="\n")
# Split sentences
paragraph = "First sentence. Second sentence! Third question?"
sentences = re.split(r"[.!?]\s*", paragraph)
print("\nSplit sentences:")
for i, sent in enumerate(sentences, 1):
    if sent:  # skip empty strings (the final terminator leaves one at the end)
        print(f" {i}: {sent}")

# Split keeping empty strings
text6 = "a,,b,,,c"
parts7 = text6.split(",")  # regular split
parts8 = re.split(r",", text6)  # regex split
print("\nRegular split:")
print(f" {parts7}")
print("Regex split (same behavior):")
print(f" {parts8}")

# Split path (the regex needs an escaped backslash to match one literally)
path = r"C:\Users\John\Documents\file.txt"
path_parts = re.split(r"\\", path)
print("\nSplit Windows path:")
print(f" {path_parts}")

# Split by digits
text7 = "abc123def456ghi"
parts9 = re.split(r"\d+", text7)
print("\nSplit by digits:")
print(f" {parts9}")

# Compiled pattern for reuse
pattern = re.compile(r"\s*,\s*")  # comma with optional spaces
text8 = "a, b,c ,d , e"
parts10 = pattern.split(text8)
print("\nSplit CSV with spaces:")
print(f" {parts10}")

# Split complex: key=value pairs
config = "name=John;age=30;city=NYC"
pairs = config.split(";")
print("\nParse config:")
for pair in pairs:
    # Split on the first "=" only, so a value that itself contains "="
    # does not break the two-name unpacking.
    key, value = pair.split("=", 1)
    print(f" {key} -> {value}")

# Split with capturing groups
text9 = "a1b2c3"
parts11 = re.split(r"(\d)", text9)
print("\nSplit with captured delimiters:")
print(f" {parts11}")

# Split by lookahead (keep delimiter): the lookahead consumes no characters,
# so each capital letter stays with the piece that follows it
text10 = "HelloWorld"
parts12 = re.split(r"(?=[A-Z])", text10)
print("\nSplit before capitals:")
print(f" {parts12}")

# Split emails
emails = "alice@example.com, bob@test.org; charlie@demo.net"
email_list = re.split(r"[,;]\s*", emails)
print("\nSplit email list:")
for email in email_list:
    print(f" {email}")
Pattern Building Tips
- Start simple, add complexity
- Test with edge cases
- Use raw strings (r"...")
- Use groups for extraction
- Balance strictness vs flexibility
email_pattern
Pattern for validating and extracting email addresses
phone_pattern
Patterns for various phone number formats including international
url_pattern
Pattern for matching web URLs with http/https
text_extraction
Extracting hashtags, mentions, numbers, and dates from text
regex_replacement
Using re.sub() to transform text with patterns
regex_splitting
Using re.split() to divide text on complex delimiters
Exercise: extraction.py
Extract all emails, URLs, and hashtags from a sample social media post