Regular Expression Patterns - Python Programming

Real applications need to validate user input, extract data from logs, and parse structured text. This page provides ready-to-use regex patterns for common tasks: email validation, phone number formatting, URL extraction, and data parsing. These patterns serve as building blocks for text processing pipelines.

Splitting with Patterns

email.py

# Email validation patterns

import re

# Basic email validation
# Shape: local part, "@", domain, ".", then a TLD of two or more letters;
# anchored at both ends so the whole string has to fit.
basic_email = r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$"
pattern = re.compile(basic_email)

# Three addresses the pattern accepts, followed by five it rejects.
emails = [
    "user@example.com",
    "first.last@domain.co.uk",
    "user+tag@example.org",
    "invalid@",
    "@invalid.com",
    "no-at-sign.com",
    "user@domain",
    "user@domain.c",
]

print("Basic email validation:")
for email in emails:
    valid = pattern.match(email) is not None
    print(f"  {email}: {valid}")

# Extract email parts
# Capture groups: (1) username, (2) the domain up to its final dot,
# (3) the TLD after that dot.
email_pattern = r"^([a-zA-Z0-9._%+-]+)@([a-zA-Z0-9.-]+)\.([a-zA-Z]{2,})$"
extract_pattern = re.compile(email_pattern)

print("\nExtract email parts:")
# Sample address to take apart; any address matching email_pattern works.
test_email = "john.doe@example.com"
match = extract_pattern.match(test_email)

# match is None when the address does not fit the pattern.
if match:
    print(f"  Email: {test_email}")
    print(f"  Username: {match.group(1)}")
    print(f"  Domain: {match.group(2)}")
    print(f"  TLD: {match.group(3)}")

# Find all emails in text
# Unanchored variant of the email pattern so it can match inside prose.
find_pattern = re.compile(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}")

text = """
Contact us at support@example.com or sales@company.org.
For urgent matters, email admin@service.net immediately.
"""
found_emails = find_pattern.findall(text)

print("\nEmails found in text:")
for email in found_emails:
    print(f"  {email}")

# Stricter pattern: the TLD must be between 2 and 6 letters long
strict_email = r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6}$"
strict_pattern = re.compile(strict_email)

test_emails = [
    "user@domain.com",
    "user@domain.co",
    "user@domain.technology",  # 10 chars TLD
]

print("\nStrict validation:")
for email in test_emails:
    valid = strict_pattern.match(email) is not None
    print(f"  {email}: {valid}")

# Named groups: the same three parts, addressed by name instead of index
named_pattern = r"^(?P<username>[a-zA-Z0-9._%+-]+)@(?P<domain>[a-zA-Z0-9.-]+)\.(?P<tld>[a-zA-Z]{2,})$"
named_re = re.compile(named_pattern)

match2 = named_re.match("alice@example.com")
if match2 is not None:
    print("\nNamed groups:")
    print(f"  Username: {match2['username']}")
    print(f"  Domain: {match2['domain']}")
    print(f"  TLD: {match2['tld']}")

# Email validation patterns

import re

# Basic email validation
# Shape: local part, "@", domain, ".", then a TLD of two or more letters;
# anchored at both ends so the whole string has to fit.
basic_email = r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$"
pattern = re.compile(basic_email)

# Three addresses the pattern accepts, followed by five it rejects.
emails = [
    "user@example.com",
    "first.last@domain.co.uk",
    "user+tag@example.org",
    "invalid@",
    "@invalid.com",
    "no-at-sign.com",
    "user@domain",
    "user@domain.c",
]

print("Basic email validation:")
for email in emails:
    valid = pattern.match(email) is not None
    print(f"  {email}: {valid}")

# Extract email parts
# Capture groups: (1) username, (2) the domain up to its final dot,
# (3) the TLD after that dot.
email_pattern = r"^([a-zA-Z0-9._%+-]+)@([a-zA-Z0-9.-]+)\.([a-zA-Z]{2,})$"
extract_pattern = re.compile(email_pattern)

print("\nExtract email parts:")
# Sample address to take apart; any address matching email_pattern works.
test_email = "john.doe@example.com"
match = extract_pattern.match(test_email)

# match is None when the address does not fit the pattern.
if match:
    print(f"  Email: {test_email}")
    print(f"  Username: {match.group(1)}")
    print(f"  Domain: {match.group(2)}")
    print(f"  TLD: {match.group(3)}")

# Find all emails in text
# Unanchored variant of the email pattern so it can match inside prose.
find_pattern = re.compile(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}")

text = """
Contact us at support@example.com or sales@company.org.
For urgent matters, email admin@service.net immediately.
"""
found_emails = find_pattern.findall(text)

print("\nEmails found in text:")
for email in found_emails:
    print(f"  {email}")

# Stricter pattern: the TLD must be between 2 and 6 letters long
strict_email = r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6}$"
strict_pattern = re.compile(strict_email)

test_emails = [
    "user@domain.com",
    "user@domain.co",
    "user@domain.technology",  # 10 chars TLD
]

print("\nStrict validation:")
for email in test_emails:
    valid = strict_pattern.match(email) is not None
    print(f"  {email}: {valid}")

# Named groups: the same three parts, addressed by name instead of index
named_pattern = r"^(?P<username>[a-zA-Z0-9._%+-]+)@(?P<domain>[a-zA-Z0-9.-]+)\.(?P<tld>[a-zA-Z]{2,})$"
named_re = re.compile(named_pattern)

match2 = named_re.match("alice@example.com")
if match2 is not None:
    print("\nNamed groups:")
    print(f"  Username: {match2['username']}")
    print(f"  Domain: {match2['domain']}")
    print(f"  TLD: {match2['tld']}")

# Email validation patterns

import re

# Basic email validation
# Shape: local part, "@", domain, ".", then a TLD of two or more letters;
# anchored at both ends so the whole string has to fit.
basic_email = r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$"
pattern = re.compile(basic_email)

# Three addresses the pattern accepts, followed by five it rejects.
emails = [
    "user@example.com",
    "first.last@domain.co.uk",
    "user+tag@example.org",
    "invalid@",
    "@invalid.com",
    "no-at-sign.com",
    "user@domain",
    "user@domain.c",
]

print("Basic email validation:")
for email in emails:
    valid = pattern.match(email) is not None
    print(f"  {email}: {valid}")

# Extract email parts
# Capture groups: (1) username, (2) the domain up to its final dot,
# (3) the TLD after that dot.
email_pattern = r"^([a-zA-Z0-9._%+-]+)@([a-zA-Z0-9.-]+)\.([a-zA-Z]{2,})$"
extract_pattern = re.compile(email_pattern)

print("\nExtract email parts:")
# Sample address to take apart; any address matching email_pattern works.
test_email = "john.doe@example.com"
match = extract_pattern.match(test_email)

# match is None when the address does not fit the pattern.
if match:
    print(f"  Email: {test_email}")
    print(f"  Username: {match.group(1)}")
    print(f"  Domain: {match.group(2)}")
    print(f"  TLD: {match.group(3)}")

# Find all emails in text
# Unanchored variant of the email pattern so it can match inside prose.
find_pattern = re.compile(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}")

text = """
Contact us at support@example.com or sales@company.org.
For urgent matters, email admin@service.net immediately.
"""
found_emails = find_pattern.findall(text)

print("\nEmails found in text:")
for email in found_emails:
    print(f"  {email}")

# Stricter pattern: the TLD must be between 2 and 6 letters long
strict_email = r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6}$"
strict_pattern = re.compile(strict_email)

test_emails = [
    "user@domain.com",
    "user@domain.co",
    "user@domain.technology",  # 10 chars TLD
]

print("\nStrict validation:")
for email in test_emails:
    valid = strict_pattern.match(email) is not None
    print(f"  {email}: {valid}")

# Named groups: the same three parts, addressed by name instead of index
named_pattern = r"^(?P<username>[a-zA-Z0-9._%+-]+)@(?P<domain>[a-zA-Z0-9.-]+)\.(?P<tld>[a-zA-Z]{2,})$"
named_re = re.compile(named_pattern)

match2 = named_re.match("alice@example.com")
if match2 is not None:
    print("\nNamed groups:")
    print(f"  Username: {match2['username']}")
    print(f"  Domain: {match2['domain']}")
    print(f"  TLD: {match2['tld']}")

phone.py

# Phone number patterns

import re

# Phone patterns
# US phone formats; each is checked with fullmatch, so the entire string
# must fit the format.
patterns = [
    r"\(\d{3}\)\s\d{3}-\d{4}",  # (123) 456-7890
    r"\d{3}-\d{3}-\d{4}",        # 123-456-7890
    r"\d{10}",                   # 1234567890
]

phones = [
    "(555) 123-4567",
    "555-123-4567",
    "5551234567",
    "(555)123-4567",  # no space
    "555.123.4567",
    "invalid",
]

print("Phone validation:")
for i, pattern_str in enumerate(patterns, 1):
    print(f"\nPattern {i}:")
    p = re.compile(pattern_str)
    for phone in phones:
        valid = bool(p.fullmatch(phone))
        print(f"  {phone}: {valid}")

# Combined pattern (any format)
# Optional area code written as "(XXX)", "(XXX) ", "XXX-" or bare "XXX",
# followed by XXX-XXXX or XXXXXXX. The dash after a bare area code is
# optional so that plain 10-digit numbers such as 5551234567 validate too.
any_format = r"^(\(\d{3}\)\s?|\d{3}-?)?\d{3}-?\d{4}$"
any_pattern = re.compile(any_format)

print("\nCombined pattern:")
for phone in phones:
    valid = bool(any_pattern.match(phone))
    print(f"  {phone}: {valid}")

# Extract phone parts
# Captures area code, exchange and line number from the "(XXX) XXX-XXXX" layout.
extract_pattern = r"\((\d{3})\)\s(\d{3})-(\d{4})"
extract = re.compile(extract_pattern)

test_phone = "(555) 123-4567"
match = extract.match(test_phone)

if match is not None:
    print(f"\nExtracted parts from {test_phone}:")
    print(f"  Area code: {match[1]}")
    print(f"  Exchange: {match[2]}")
    print(f"  Number: {match[3]}")

# Find all phones in text
# Parentheses around the area code and the first separator are optional.
find_pattern = re.compile(r"\(?\d{3}\)?[-\s]?\d{3}-\d{4}")

text = """
Call us at (555) 123-4567 or 555-987-6543.
Emergency: (999) 911-0000
"""
found = find_pattern.findall(text)

print("\nPhones found in text:")
for phone in found:
    print(f"  {phone}")

# International format (basic)
# Optional "+", a 1-3 digit country code, then three digit groups separated
# by an optional dash or whitespace; the first group may be parenthesised.
intl_pattern = r"^\+?\d{1,3}[-\s]?\(?\d{1,4}\)?[-\s]?\d{1,4}[-\s]?\d{1,9}$"
intl_p = re.compile(intl_pattern)

intl_phones = [
    "+1 (555) 123-4567",
    "+44 20 7123 4567",
    "+81 3-1234-5678",
]

print("\nInternational phones:")
for phone in intl_phones:
    valid = intl_p.match(phone) is not None
    print(f"  {phone}: {valid}")

# Format phone number
def format_phone(digits):
    """Render a 10-digit string in (XXX) XXX-XXXX form.

    Returns the formatted string, or None when ``digits`` is not exactly
    ten characters long or contains a character for which
    ``str.isdigit`` is false.
    """
    if len(digits) != 10 or not digits.isdigit():
        return None
    area, exchange, line = digits[:3], digits[3:6], digits[6:]
    return f"({area}) {exchange}-{line}"

# Demo: format a bare 10-digit number and show the before/after pair.
raw = "5551234567"
formatted = format_phone(raw)
print("\nFormat phone:")
print(f"  {raw} -> {formatted}")

url.py

# URL patterns

import re

# URL validation
# Requires an http or https scheme and a dotted host ending in a TLD of two
# or more letters; anything may follow the host.
url_pattern = r"^https?://[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}.*$"
pattern = re.compile(url_pattern)

urls = [
    "https://example.com",
    "http://www.example.com",
    "https://sub.example.co.uk",
    "https://example.com/path",
    "https://example.com/path?key=value",
    "ftp://example.com",  # wrong protocol
    "example.com",        # missing protocol
    "https://localhost",  # no TLD
]

print("URL validation:")
for url in urls:
    valid = pattern.match(url) is not None
    print(f"  {url}: {valid}")

# Extract URL parts
# Capture groups: (1) scheme, (2) host, (3) optional path starting at the
# first "/". No literal space may sit before "$", otherwise the pattern can
# only match URLs that end in a space.
extract_pattern = r"^(https?)://([a-zA-Z0-9.-]+)(/.*)?$"
extract_p = re.compile(extract_pattern)

test_url = "https://www.example.com/path/to/page"
match = extract_p.match(test_url)

if match:
    print(f"\nExtracted parts from {test_url}:")
    print(f"  Protocol: {match.group(1)}")
    print(f"  Domain: {match.group(2)}")
    # Group 3 is None when the URL has no path; show "/" in that case.
    print(f"  Path: {match.group(3) if match.group(3) else '/'}")

# More detailed extraction
# Capture groups: (1) scheme, (2) host up to the first ":" or "/",
# (3) optional port digits, (4) optional path.
detail_pattern = r"^(https?)://([^:/]+)(?::(\d+))?(/.*)?$"
detail_p = re.compile(detail_pattern)

test_urls = [
    "https://example.com:8080/path",
    "http://localhost:3000/api",
    "https://example.com/page",
]

print("\nDetailed URL parsing:")
for url in test_urls:
    m = detail_p.match(url)
    if not m:
        continue
    print(f"  {url}")
    print(f"    Protocol: {m[1]}")
    print(f"    Host: {m[2]}")
    # Absent optional groups come back as None; fall back to a label.
    print(f"    Port: {m[3] or 'default'}")
    print(f"    Path: {m[4] or '/'}")

# Find all URLs in text
# Unanchored: scheme, dotted host, then every following non-whitespace character.
find_pattern = re.compile(r"https?://[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}[^\s]*")

text = """
Visit https://example.com for more info.
Check out http://test.org/page and https://another.site/path?q=search
"""
found = find_pattern.findall(text)

print("\nURLs found in text:")
for url in found:
    print(f"  {url}")

# Query parameters
# Each match starts at "?" or "&" and yields a (key, value) tuple.
query_pattern = r"[?&]([^=]+)=([^&]+)"
query_p = re.compile(query_pattern)

url_with_query = "https://example.com/search?q=regex&lang=python&page=1"
params = query_p.findall(url_with_query)

print(f"\nQuery parameters from: {url_with_query}")
for key, value in params:
    print(f"  {key} = {value}")

# Named groups for URL parsing
# Same shape as the detailed pattern, with each part addressable by name.
named_pattern = r"^(?P<protocol>https?)://(?P<domain>[^:/]+)(?::(?P<port>\d+))?(?P<path>/.*)?$"
named_p = re.compile(named_pattern)

match2 = named_p.match("https://example.com:8080/api/v1")
if match2 is not None:
    print("\nNamed groups:")
    print(f"  Protocol: {match2['protocol']}")
    print(f"  Domain: {match2['domain']}")
    print(f"  Port: {match2['port']}")
    print(f"  Path: {match2['path']}")

extraction.py

# Text extraction with regex

import re

# Extract hashtags
def extract_hashtags(text):
    """Return every "#" followed by word characters, in order of appearance.

    Each returned item keeps its leading "#". An empty list means no match.
    """
    hashtag_re = re.compile(r"#\w+")
    return hashtag_re.findall(text)

# Extract mentions
def extract_mentions(text):
    """Return every "@" followed by word characters, in order of appearance.

    Each returned item keeps its leading "@". The "@" does not have to start
    a word, so the tail of an email address is picked up as well.
    """
    mention_re = re.compile(r"@\w+")
    return mention_re.findall(text)

# Extract numbers
def extract_numbers(text):
    """Return the numbers found in ``text`` as a list of floats.

    A number is an optional "-", one or more digits, an optional ".", and
    any further digits. A "-" directly in front of digits is always read as
    a sign, so "2025-01" produces 2025.0 and -1.0.
    """
    values = []
    for found in re.finditer(r"-?\d+\.?\d*", text):
        literal = found.group(0)
        # Skip anything float() could not parse (empty text or a bare sign).
        if not literal or literal == '-':
            continue
        values.append(float(literal))
    return values

# Extract dates
def extract_dates(text):
    """Return every substring shaped like YYYY-MM-DD, in order of appearance.

    Only the digit layout is checked (4-2-2 digits joined by dashes); month
    and day values are not validated.
    """
    date_re = re.compile(r"\d{4}-\d{2}-\d{2}")
    return date_re.findall(text)

# Main test
if __name__ == "__main__":
    # Hashtags and @mentions from a social-media style post
    tweet = """
    Loving #python and #regex! Thanks @copilot for the help.
    Check out #programming tips at https://example.com
    Mentions: @user1 @user2 #coding
    """

    print("Social media extraction:")
    print(f"Hashtags: {extract_hashtags(tweet)}")
    print(f"Mentions: {extract_mentions(tweet)}")

    # Prices and a negative temperature
    data_text = "Prices: $19.99, $5.50, and $100. Temperature: -5.5°C"
    print("\nNumbers extraction:")
    print(f"Numbers: {extract_numbers(data_text)}")

    # ISO-style dates at the start of log lines
    log_text = """
    2025-01-29: Error occurred
    2025-01-30: Fixed bug
    2025-02-01: Deployed
    """
    print("\nDates extraction:")
    print(f"Dates: {extract_dates(log_text)}")

    # IP addresses: four dot-separated groups of 1-3 digits (range not checked)
    server_log = "Requests from 192.168.1.1, 10.0.0.5, and 172.16.0.10"
    ip_re = re.compile(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}")
    ips = ip_re.findall(server_log)
    print("\nIP addresses:")
    for ip in ips:
        print(f"  {ip}")

    # Quoted strings: the group captures the text between the double quotes
    quote_text = 'He said "Hello" and she replied "Hi there!"'
    quote_re = re.compile(r'"([^"]+)"')
    quotes = quote_re.findall(quote_text)
    print("\nQuoted strings:")
    for quote in quotes:
        print(f"  {quote}")

    # Key-value pairs: word characters, "=", then a run of non-whitespace
    config = "name=John age=30 city=NYC email=john@example.com"
    kv_re = re.compile(r"(\w+)=(\S+)")
    kv_pairs = kv_re.findall(config)
    print("\nKey-value pairs:")
    for key, value in kv_pairs:
        print(f"  {key} = {value}")

    # HTML tags (simple)
    html = "<div>Content</div><span>Text</span>"
    # The backreference \1 forces the closing tag to repeat the opening tag's name
    tag_re = re.compile(r"<(\w+)>([^<]+)</\1>")
    tags = tag_re.findall(html)
    print("\nHTML content:")
    for tag, content in tags:
        print(f"  <{tag}>: {content}")

    # Email addresses embedded in a sentence
    text_with_emails = "Contact alice@example.com or bob@test.org for info"
    email_re = re.compile(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}")
    emails = email_re.findall(text_with_emails)
    print("\nEmail addresses:")
    for email in emails:
        print(f"  {email}")

    # Words: runs of word characters, so "quick-brown" yields two words
    sentence = "The quick-brown fox jumps over the lazy dog."
    words = re.compile(r"\b\w+\b").findall(sentence)
    print(f"\nWords: {words}")

    # Capitalized words: words whose first character is an uppercase ASCII letter
    caps = re.compile(r"\b[A-Z]\w*\b").findall("Python, Java, and JavaScript are Languages")
    print(f"Capitalized words: {caps}")

replacement.py

# String replacement with regex

import re

# Basic replacement
# Literal substitution: every "Hello" becomes "Hi"
text1 = "Hello World, Hello Python"
result1 = re.compile(r"Hello").sub("Hi", text1)
print(f"Original: {text1}")
print(f"Replaced: {result1}")

# Pattern substitution: mask every XXX-XXXX phone number
text2 = "Call 555-1234 or 555-5678"
phone_re = re.compile(r"\d{3}-\d{4}")
result2 = phone_re.sub("XXX-XXXX", text2)
print("\nMask phone numbers:")
print(f"Original: {text2}")
print(f"Masked: {result2}")

# Group references: reorder the captured year, month and day
text3 = "2025-01-29"
date_re = re.compile(r"(\d{4})-(\d{2})-(\d{2})")
result3 = date_re.sub(r"\2/\3/\1", text3)
print("\nReformat date:")
print(f"Original (YYYY-MM-DD): {text3}")
print(f"Reformatted (MM/DD/YYYY): {result3}")

# Collapse every run of whitespace into a single space
text4 = "Too    many     spaces"
result4 = re.compile(r"\s+").sub(" ", text4)
print("\nNormalize whitespace:")
print(f"Original: '{text4}'")
print(f"Normalized: '{result4}'")

# Remove HTML tags
# Deletes everything from "<" to the next ">" and keeps the text between tags.
html = "<p>Hello <b>World</b></p>"
tag_re = re.compile(r"<[^>]+>")
result5 = tag_re.sub("", html)
print("\nRemove HTML:")
print(f"Original: {html}")
print(f"Clean: {result5}")

# Censor profanity (example)
# Word boundaries keep the listed words from matching inside longer words.
text6 = "This is bad and terrible"
censor_re = re.compile(r"\b(bad|terrible)\b")
result6 = censor_re.sub("***", text6)
print("\nCensor words:")
print(f"Original: {text6}")
print(f"Censored: {result6}")

# Format currency
# Match the whole decimal number (digits plus an optional ".digits" part) so
# that only one "$" is inserted; a bare \d+ would treat the integer and
# fractional digits as two separate numbers.
text7 = "Price: 1234.56"
result7 = re.sub(r"(\d+(?:\.\d+)?)", r"$\1", text7)
print("\nAdd currency:")
print(f"Original: {text7}")
print(f"Formatted: {result7}")

# Replacement with function
def uppercase_match(match):
    """Replacement callback for re.sub: return the whole matched text upper-cased."""
    matched_text = match.group(0)
    return matched_text.upper()

# Demo: pass the callback as the replacement so each word is upper-cased.
text8 = "one two three"
word_re = re.compile(r"\w+")
result8 = word_re.sub(uppercase_match, text8)
print("\nCustom replacement (function):")
print(f"Original: {text8}")
print(f"Uppercase: {result8}")

# Advanced: swap first and last name
# Each "first last" pair is rewritten as "last, first".
names = "John Doe, Jane Smith, Bob Johnson"
pair_re = re.compile(r"(\w+)\s(\w+)")
result9 = pair_re.sub(r"\2, \1", names)
print("\nSwap names:")
print(f"Original: {names}")
print(f"Swapped: {result9}")

# Count replacements
# subn returns the new string together with the number of substitutions made.
text10 = "apple apple banana apple"
result10, count = re.compile(r"apple").subn("orange", text10)
print(f"\nReplace with count:")
print(f"Original: {text10}")
print(f"Replaced: {result10}")
print(f"Replacements: {count}")

# Replace only first N occurrences
# count=3 stops after the third substitution; later matches stay untouched.
text11 = "a a a a a"
result11 = re.compile(r"a").sub("b", text11, count=3)
print(f"\nReplace first 3:")
print(f"Original: {text11}")
print(f"Result: {result11}")

# Named groups in replacement
# \g<name> in the replacement template refers to a named group of the pattern.
pattern = r"(?P<first>\w+)\s(?P<last>\w+)"
text12 = "John Doe"
result12 = re.sub(pattern, r"\g<last>, \g<first>", text12)
print(f"\nNamed group replacement:")
print(f"Original: {text12}")
print(f"Result: {result12}")

splitting.py

# String splitting with regex

import re

# Basic splitting
# A single fixed delimiter needs no regex: str.split is enough
csv1 = "apple,banana,cherry"
parts1 = csv1.split(",")
print("Split by comma:")
print(f"  {parts1}")

# Any run of whitespace counts as one separator
text1 = "one two  three   four"
whitespace_re = re.compile(r"\s+")
parts2 = whitespace_re.split(text1)
print("\nSplit by whitespace:")
print(f"  {parts2}")

# A character class accepts several delimiters at once
text2 = "apple;banana,cherry:orange"
delimiter_re = re.compile(r"[;,:]+")
parts3 = delimiter_re.split(text2)
print("\nSplit by multiple delimiters:")
print(f"  {parts3}")

# Split with limit (maxsplit)
# Only the first two commas split; the remainder stays in the last item.
text3 = "one,two,three,four,five"
parts4 = re.compile(r",").split(text3, maxsplit=2)
print("\nSplit with maxsplit (2):")
print(f"  {parts4}")

# Split preserving delimiters (use groups)
# A capturing group makes re.split return the delimiters between the pieces.
text4 = "one,two;three"
parts5 = re.compile(r"([,;])").split(text4)
print("\nSplit preserving delimiters:")
print(f"  {parts5}")

# Split on hyphens and underscores
text5 = "hello-world_test"
parts6 = re.compile(r"[-_]").split(text5)
print("\nSplit by hyphens and underscores:")
print(f"  {parts6}")

# Split sentences
# Split after ".", "!" or "?" plus any whitespace that follows.
paragraph = "First sentence. Second sentence! Third question?"
sentences = re.compile(r"[.!?]\s*").split(paragraph)
print("\nSplit sentences:")
for i, sent in enumerate(sentences, 1):
    if not sent:  # the final terminator leaves a trailing empty string
        continue
    print(f"  {i}: {sent}")

# Split keeping empty strings
# Adjacent delimiters yield empty items with both str.split and re.split.
text6 = "a,,b,,,c"
parts7 = text6.split(",")  # regular split
parts8 = re.compile(r",").split(text6)  # regex split
print("\nRegular split:")
print(f"  {parts7}")
print("Regex split (same behavior):")
print(f"  {parts8}")

# Split path
# In the regex a literal backslash has to be written as an escaped pair.
path = r"C:\Users\John\Documents\file.txt"
backslash_re = re.compile(r"\\")
path_parts = backslash_re.split(path)
print("\nSplit Windows path:")
print(f"  {path_parts}")

# Split by digits
# Every run of digits acts as one separator.
text7 = "abc123def456ghi"
parts9 = re.compile(r"\d+").split(text7)
print("\nSplit by digits:")
print(f"  {parts9}")

# Compiled pattern for reuse
text8 = "a, b,c ,d , e"
pattern = re.compile(r"\s*,\s*")  # comma with optional spaces
parts10 = pattern.split(text8)
print("\nSplit CSV with spaces:")
print(f"  {parts10}")

# Split complex: key=value pairs
# Two plain splits: first on ";" into pairs, then each pair on "=".
config = "name=John;age=30;city=NYC"
pairs = config.split(";")
print("\nParse config:")
for pair in pairs:
    key, value = pair.split("=")
    print(f"  {key} -> {value}")

# Split with capturing groups
# The captured digit is kept in the result between the surrounding pieces.
text9 = "a1b2c3"
digit_re = re.compile(r"(\d)")
parts11 = digit_re.split(text9)
print("\nSplit with captured delimiters:")
print(f"  {parts11}")

# Split by lookahead (keep delimiter)
# (?=[A-Z]) is a zero-width match just before each capital, so the capital
# stays with the piece that follows it. (?<!^) rules out position 0;
# without it a leading capital produces an empty first item on Python 3.7+,
# where re.split also splits on empty matches.
text10 = "HelloWorld"
parts12 = re.split(r"(?<!^)(?=[A-Z])", text10)
print("\nSplit before capitals:")
print(f"  {parts12}")

# Split emails
# Addresses are separated by "," or ";" with optional whitespace after it.
emails = "alice@example.com, bob@test.org; charlie@demo.net"
separator_re = re.compile(r"[,;]\s*")
email_list = separator_re.split(emails)
print("\nSplit email list:")
for email in email_list:
    print(f"  {email}")

Pattern Building Tips

Start simple, add complexity
Test with edge cases
Use raw strings (r"...")
Use groups for extraction
Balance strictness vs flexibility

email_pattern: Pattern for validating and extracting email addresses.

phone_pattern: Patterns for various phone number formats, including international ones.

url_pattern: Pattern for matching web URLs that use http or https.

text_extraction: Extracting hashtags, mentions, numbers, and dates from text.

regex_replacement: Using re.sub() to transform text with patterns.

regex_splitting: Using re.split() to divide text on complex delimiters.

Exercise: extraction.py

Extract all emails, URLs, and hashtags from a sample social media post