String Processing
Regular Expressions Introduction
Validating email addresses, parsing log files, and extracting data from text are tedious with basic string methods. Regular expressions provide a powerful pattern language for matching, searching, and manipulating text. Python's re module makes these patterns accessible for validation, extraction, and text transformation.
Groups
literal.py
# Literal matching
#
# Shows how a plain-text ("literal") pattern behaves with the main re entry
# points: re.match, re.fullmatch, re.search, a compiled pattern, the
# re.IGNORECASE flag, re.finditer, and the attributes of a match object.
import re

# Basic pattern matching
# Literal match: re.match only tests the pattern at the start of the string.
text1 = "hello"  # subject string named by the printed labels below
match1 = re.match(r"hello", text1)
print(f"'hello' matches 'hello': {match1 is not None}")
match2 = re.match(r"world", text1)
print(f"'hello' matches 'world': {match2 is not None}")

# Full match vs partial match: fullmatch must consume the whole string,
# search succeeds if the pattern occurs anywhere in it.
print("\nFull match vs search:")
print(f"Full match 'hello' in 'hello world': {re.fullmatch(r'hello', 'hello world') is not None}")
print(f"Search 'hello' in 'hello world': {re.search(r'hello', 'hello world') is not None}")

# Compiled pattern: compile once, then call methods on the pattern object.
pattern = re.compile(r"hello")
match3 = pattern.search("hello world")
print(f"\nCompiled pattern found: {match3 is not None}")

# Case sensitivity: matching is case-sensitive unless a flag says otherwise.
match4 = re.match(r"hello", "Hello")
print(f"\n'Hello' matches 'hello': {match4 is not None}")

# Case insensitive
match5 = re.match(r"hello", "Hello", re.IGNORECASE)
print(f"Case insensitive match: {match5 is not None}")

# Multiple occurrences: finditer yields one match object per occurrence.
text2 = "hello hello hello"
matches = re.finditer(r"hello", text2)
print("\nFind all occurrences:")
for match in matches:
    print(f" Found at index: {match.start()}")

# Match object details
text3 = "hello world"
match6 = re.search(r"hello", text3)
if match6:
    print("\nMatch details:")
    print(f" Matched text: {match6.group()}")
    print(f" Start: {match6.start()}, End: {match6.end()}")
    print(f" Span: {match6.span()}")
# Literal matching
#
# Shows how a plain-text ("literal") pattern behaves with the main re entry
# points: re.match, re.fullmatch, re.search, a compiled pattern, the
# re.IGNORECASE flag, re.finditer, and the attributes of a match object.
import re

# Basic pattern matching
# Literal match: re.match only tests the pattern at the start of the string.
text1 = "hello"  # subject string named by the printed labels below
match1 = re.match(r"hello", text1)
print(f"'hello' matches 'hello': {match1 is not None}")
match2 = re.match(r"world", text1)
print(f"'hello' matches 'world': {match2 is not None}")

# Full match vs partial match: fullmatch must consume the whole string,
# search succeeds if the pattern occurs anywhere in it.
print("\nFull match vs search:")
print(f"Full match 'hello' in 'hello world': {re.fullmatch(r'hello', 'hello world') is not None}")
print(f"Search 'hello' in 'hello world': {re.search(r'hello', 'hello world') is not None}")

# Compiled pattern: compile once, then call methods on the pattern object.
pattern = re.compile(r"hello")
match3 = pattern.search("hello world")
print(f"\nCompiled pattern found: {match3 is not None}")

# Case sensitivity: matching is case-sensitive unless a flag says otherwise.
match4 = re.match(r"hello", "Hello")
print(f"\n'Hello' matches 'hello': {match4 is not None}")

# Case insensitive
match5 = re.match(r"hello", "Hello", re.IGNORECASE)
print(f"Case insensitive match: {match5 is not None}")

# Multiple occurrences: finditer yields one match object per occurrence.
text2 = "hello hello hello"
matches = re.finditer(r"hello", text2)
print("\nFind all occurrences:")
for match in matches:
    print(f" Found at index: {match.start()}")

# Match object details
text3 = "hello world"
match6 = re.search(r"hello", text3)
if match6:
    print("\nMatch details:")
    print(f" Matched text: {match6.group()}")
    print(f" Start: {match6.start()}, End: {match6.end()}")
    print(f" Span: {match6.span()}")
# Literal matching
#
# Shows how a plain-text ("literal") pattern behaves with the main re entry
# points: re.match, re.fullmatch, re.search, a compiled pattern, the
# re.IGNORECASE flag, re.finditer, and the attributes of a match object.
import re

# Basic pattern matching
# Literal match: re.match only tests the pattern at the start of the string.
text1 = "hello"  # subject string named by the printed labels below
match1 = re.match(r"hello", text1)
print(f"'hello' matches 'hello': {match1 is not None}")
match2 = re.match(r"world", text1)
print(f"'hello' matches 'world': {match2 is not None}")

# Full match vs partial match: fullmatch must consume the whole string,
# search succeeds if the pattern occurs anywhere in it.
print("\nFull match vs search:")
print(f"Full match 'hello' in 'hello world': {re.fullmatch(r'hello', 'hello world') is not None}")
print(f"Search 'hello' in 'hello world': {re.search(r'hello', 'hello world') is not None}")

# Compiled pattern: compile once, then call methods on the pattern object.
pattern = re.compile(r"hello")
match3 = pattern.search("hello world")
print(f"\nCompiled pattern found: {match3 is not None}")

# Case sensitivity: matching is case-sensitive unless a flag says otherwise.
match4 = re.match(r"hello", "Hello")
print(f"\n'Hello' matches 'hello': {match4 is not None}")

# Case insensitive
match5 = re.match(r"hello", "Hello", re.IGNORECASE)
print(f"Case insensitive match: {match5 is not None}")

# Multiple occurrences: finditer yields one match object per occurrence.
text2 = "hello hello hello"
matches = re.finditer(r"hello", text2)
print("\nFind all occurrences:")
for match in matches:
    print(f" Found at index: {match.start()}")

# Match object details
text3 = "hello world"
match6 = re.search(r"hello", text3)
if match6:
    print("\nMatch details:")
    print(f" Matched text: {match6.group()}")
    print(f" Start: {match6.start()}, End: {match6.end()}")
    print(f" Span: {match6.span()}")
character_class.py
# Character classes
#
# Demonstrates single-character matching with re.fullmatch: explicit sets
# ([abc]), ranges ([a-z]), negated sets ([^0-9]), the predefined classes
# \d \w \s \D, the dot wildcard, and escaping a metacharacter.
# Every line prints "<label>: <bool>" so the output is self-describing.
import re

# Character class [abc]
# Match one of specific characters
print("a matches [abc]:", re.fullmatch(r"[abc]", "a") is not None)
print("b matches [abc]:", re.fullmatch(r"[abc]", "b") is not None)
print("d matches [abc]:", re.fullmatch(r"[abc]", "d") is not None)

# Range
print("\nRanges:")
print("5 matches [0-9]:", re.fullmatch(r"[0-9]", "5") is not None)
print("m matches [a-z]:", re.fullmatch(r"[a-z]", "m") is not None)
print("M matches [A-Z]:", re.fullmatch(r"[A-Z]", "M") is not None)
print("M matches [a-z]:", re.fullmatch(r"[a-z]", "M") is not None)

# Multiple ranges
print("\nMultiple ranges:")
print("a matches [a-zA-Z]:", re.fullmatch(r"[a-zA-Z]", "a") is not None)
print("5 matches [a-zA-Z]:", re.fullmatch(r"[a-zA-Z]", "5") is not None)
print("5 matches [a-zA-Z0-9]:", re.fullmatch(r"[a-zA-Z0-9]", "5") is not None)

# Negation [^...]
print("\nNegation:")
print("a matches [^0-9]:", re.fullmatch(r"[^0-9]", "a") is not None)
print("5 matches [^0-9]:", re.fullmatch(r"[^0-9]", "5") is not None)

# Predefined character classes
# Labels are raw strings so the backslashes are printed literally.
print("\nPredefined classes:")
print(r"5 matches \d:", re.fullmatch(r"\d", "5") is not None)
print(r"a matches \d:", re.fullmatch(r"\d", "a") is not None)
print(r"a matches \w:", re.fullmatch(r"\w", "a") is not None)
print(r"space matches \s:", re.fullmatch(r"\s", " ") is not None)
print(r"a matches \D:", re.fullmatch(r"\D", "a") is not None)
print(r"5 matches \D:", re.fullmatch(r"\D", "5") is not None)

# Dot . matches any character
print("\nDot (any char):")
print("a matches .:", re.fullmatch(r".", "a") is not None)
print("5 matches .:", re.fullmatch(r".", "5") is not None)
print("space matches .:", re.fullmatch(r".", " ") is not None)

# Escape special chars
# Search once and reuse the match for both the found-flag and its position.
print("\nEscape special chars:")
dot_match = re.search(r"\.", "hello.world")
print(r"\.:", dot_match is not None)
print(r"literal dot at:", dot_match.start())
quantifiers.py
# Quantifiers
#
# Table-driven tour of the regex repetition operators. Each case prints its
# label followed by whether the pattern matches the *entire* subject string;
# the closing section contrasts greedy and non-greedy matching via search.
import re


def _report(label, pattern, subject):
    # Emit "<label> <bool>", where the bool is the re.fullmatch outcome.
    print(label, re.fullmatch(pattern, subject) is not None)


# (heading, ((label, pattern, subject), ...)) in the order they are printed.
_FULLMATCH_SECTIONS = (
    (
        # * (0 or more)
        "Asterisk * (0 or more):",
        (
            ("'' matches a*:", r"a*", ""),
            ("'a' matches a*:", r"a*", "a"),
            ("'aaa' matches a*:", r"a*", "aaa"),
            ("'b' matches a*:", r"a*", "b"),
        ),
    ),
    (
        # + (1 or more)
        "\nPlus + (1 or more):",
        (
            ("'' matches a+:", r"a+", ""),
            ("'a' matches a+:", r"a+", "a"),
            ("'aaa' matches a+:", r"a+", "aaa"),
        ),
    ),
    (
        # ? (0 or 1)
        "\nQuestion ? (0 or 1):",
        (
            ("'' matches a?:", r"a?", ""),
            ("'a' matches a?:", r"a?", "a"),
            ("'aa' matches a?:", r"a?", "aa"),
        ),
    ),
    (
        # {n} (exactly n)
        "\n{n} (exactly n):",
        (
            ("'aa' matches a{2}:", r"a{2}", "aa"),
            ("'aaa' matches a{2}:", r"a{2}", "aaa"),
            ("'a' matches a{2}:", r"a{2}", "a"),
        ),
    ),
    (
        # {n,} (n or more)
        "\n{n,} (n or more):",
        (
            ("'aa' matches a{2,}:", r"a{2,}", "aa"),
            ("'aaa' matches a{2,}:", r"a{2,}", "aaa"),
            ("'a' matches a{2,}:", r"a{2,}", "a"),
        ),
    ),
    (
        # {n,m} (between n and m)
        "\n{n,m} (between n and m):",
        (
            ("'aa' matches a{2,4}:", r"a{2,4}", "aa"),
            ("'aaa' matches a{2,4}:", r"a{2,4}", "aaa"),
            ("'aaaa' matches a{2,4}:", r"a{2,4}", "aaaa"),
            ("'a' matches a{2,4}:", r"a{2,4}", "a"),
            ("'aaaaa' matches a{2,4}:", r"a{2,4}", "aaaaa"),
        ),
    ),
    (
        # Practical: digits
        "\nPractical - validate numbers:",
        (
            (r"'123' matches \d+:", r"\d+", "123"),
            (r"'12345' matches \d{5}:", r"\d{5}", "12345"),
            (r"'123' matches \d{2,4}:", r"\d{2,4}", "123"),
        ),
    ),
)

for _heading, _cases in _FULLMATCH_SECTIONS:
    print(_heading)
    for _label, _pattern, _subject in _cases:
        _report(_label, _pattern, _subject)

# Greedy vs non-greedy
# `.*` grabs as much as it can; `.*?` stops at the first closing bracket.
print("\nGreedy vs non-greedy:")
text = "<tag>content</tag>"
greedy = re.compile(r"<.*>").search(text)
non_greedy = re.compile(r"<.*?>").search(text)
print(f"Greedy: {greedy[0]}")
print(f"Non-greedy: {non_greedy[0]}")
anchors.py
# Anchors
#
# Demonstrates position-matching constructs that consume no characters:
# ^ (start), $ (end), \b (word boundary), and how re.MULTILINE makes ^
# match at the start of every line instead of only the start of the string.
import re

# Anchors ^ and $
# ^ (start of string)
print("Start anchor ^:")
print("'hello' matches ^hello:", re.match(r"^hello", "hello") is not None)
print("'hello world' matches ^hello:", re.match(r"^hello", "hello world") is not None)
print("'world hello' matches ^hello:", re.match(r"^hello", "world hello") is not None)

# $ (end of string)
print("\nEnd anchor $:")
print("'hello' matches hello$:", re.search(r"hello$", "hello") is not None)
print("'hello world' matches world$:", re.search(r"world$", "hello world") is not None)
print("'world hello' matches world$:", re.search(r"world$", "world hello") is not None)

# Both anchors
print("\nBoth anchors ^...$:")
print("'hello' matches ^hello$:", re.fullmatch(r"^hello$", "hello") is not None)
print("'hello world' matches ^hello$:", re.fullmatch(r"^hello$", "hello world") is not None)

# \b (word boundary)
# Not a raw string: the leading \n must stay a real newline (blank line
# before the heading, like every other section), so the backslash of \b is
# escaped explicitly instead.
print("\nWord boundary \\b:")
text = "hello world"
print(f"Text: '{text}'")
print(r"\bhello\b found:", re.search(r"\bhello\b", text) is not None)
print(r"\bworld\b found:", re.search(r"\bworld\b", text) is not None)
print(r"\bhello\b in 'helloworld':", re.search(r"\bhello\b", "helloworld") is not None)

# Practical examples
print("\nPractical validation:")
# Must start with letter
print("'abc123' starts with letter:", re.match(r"^[a-zA-Z]", "abc123") is not None)
print("'123abc' starts with letter:", re.match(r"^[a-zA-Z]", "123abc") is not None)
# Must end with digit
print(r"'abc123' ends with digit:", re.search(r"\d$", "abc123") is not None)
print(r"'abc' ends with digit:", re.search(r"\d$", "abc") is not None)
# Exact length
print(r"'12345' is exactly 5 digits:", re.fullmatch(r"^\d{5}$", "12345") is not None)
print(r"'1234' is exactly 5 digits:", re.fullmatch(r"^\d{5}$", "1234") is not None)

# Multiple words: \b\w+\b picks out each run of word characters.
text2 = "The quick brown fox"
words = re.findall(r"\b\w+\b", text2)
print(f"\nWords in '{text2}':")
for word in words:
    print(f" {word}")

# Line anchors with multiline
text3 = """line1
line2
line3"""
print("\nMultiline mode:")
matches = re.findall(r"^line", text3, re.MULTILINE)
print(f"Lines starting with 'line': {matches}")
groups.py
# Groups and capturing
#
# Demonstrates parenthesised groups: numbered capture groups, iterating over
# several matches, named groups (?P<name>...), non-capturing groups (?:...),
# how re.findall returns tuples when a pattern has several groups, and
# backreferences (\1) to an earlier group.
import re

# Groups with ()
# Basic grouping: group(0) is the whole match, group(1..n) the captures.
date = "2025-01-29"
pattern = re.compile(r"(\d{4})-(\d{2})-(\d{2})")
match = pattern.match(date)
if match:
    print(f"Full match: {match.group(0)}")
    print(f"Year: {match.group(1)}")
    print(f"Month: {match.group(2)}")
    print(f"Day: {match.group(3)}")

# Email parsing: split on the single '@' into username and domain.
email = "user@example.com"
email_pattern = re.compile(r"([^@]+)@([^@]+)")
email_match = email_pattern.match(email)
if email_match:
    print("\nEmail parts:")
    print(f"Username: {email_match.group(1)}")
    print(f"Domain: {email_match.group(2)}")

# Phone number: literal parentheses are escaped, capture groups are not.
phone = "(555) 123-4567"
phone_pattern = re.compile(r"\((\d{3})\)\s(\d{3})-(\d{4})")
phone_match = phone_pattern.match(phone)
if phone_match:
    print("\nPhone parts:")
    print(f"Area: {phone_match.group(1)}")
    print(f"Exchange: {phone_match.group(2)}")
    print(f"Number: {phone_match.group(3)}")

# Multiple matches
text = "Call me at 555-1234 or 555-5678"
num_pattern = re.compile(r"(\d{3})-(\d{4})")
num_matches = num_pattern.finditer(text)
print("\nAll phone numbers:")
# Distinct loop name so the date `match` above is not overwritten.
for num_match in num_matches:
    print(f" {num_match.group(0)} (Area: {num_match.group(1)}, Num: {num_match.group(2)})")

# Named groups
url = "https://example.com"
url_pattern = re.compile(r"(?P<protocol>https?)://(?P<domain>.+)")
url_match = url_pattern.match(url)
if url_match:
    print("\nURL parts (named groups):")
    print(f"Protocol: {url_match.group('protocol')}")
    print(f"Domain: {url_match.group('domain')}")

# Non-capturing group (?:...)
text2 = "color: red; colour: blue"
# Match both spellings but don't capture the 'u': (?:u)? groups the optional
# letter without creating a capture, so findall returns only the colour name.
pattern2 = re.compile(r"colo(?:u)?r:\s(\w+)")
matches2 = pattern2.findall(text2)
print(f"\nColors: {matches2}")

# Groups with findall: several groups -> a list of tuples, one per match.
text3 = "2025-01-29 and 2024-12-25"
dates = re.findall(r"(\d{4})-(\d{2})-(\d{2})", text3)
print("\nAll dates (as tuples):")
for year, month, day in dates:
    print(f" {year}/{month}/{day}")

# Backreferences
text4 = "hello hello"
# \1 refers to first group
duplicate = re.search(r"(\w+)\s\1", text4)
if duplicate:
    print(f"\nDuplicate word found: {duplicate.group(1)}")
Common Functions
re.match(): Match at start
re.search(): Find pattern anywhere
re.findall(): Find all matches
re.sub(): Replace matches
literal_match
Basic exact text matching with re.match(), re.search(), and re.fullmatch()
character_class
Matching sets of characters with [abc], [a-z], and predefined classes like \d and \w
quantifiers
Specifying repetition with *, +, ?, and {n,m}
anchors
Matching positions with ^ (start), $ (end), and \b (word boundary)
groups
Capturing and extracting parts of matches with parentheses
Exercise: practical.py
Build validators for usernames, emails, phones, and URLs using regex