String Processing
String Methods
Text data often needs cleaning, parsing, and transformation before use. Python strings have built-in methods for splitting CSV data, removing whitespace, validating user input, and searching for patterns. These methods form the foundation of text processing in Python.
Prefix and Suffix Checking
split_join.py
# Split and join
# Demonstrates str.split() for breaking a string into a list and
# str.join() for assembling a list of strings back into one string.

# Split string into list
# NOTE(review): the sample literal was missing in the source (bare
# `text =`); any comma-separated string works for this demonstration.
text = "apple,banana,cherry"
fruits = text.split(",")
print("Split by comma:", fruits)

sentence = "The quick brown fox"
words = sentence.split()  # Default: split by whitespace
print("Split by space:", words)

multiline = "Line1\nLine2\nLine3"
lines = multiline.split("\n")
print("Split by newline:", lines)

# Split with limit
data = "a:b:c:d:e"
parts = data.split(":", 2)  # Split into max 3 parts; the rest stays joined
print("Limited split:", parts)

# Join list into string
# Join with separator (the separator is the string join() is called on)
fruits_joined = ", ".join(fruits)
print("\nJoined:", fruits_joined)

# Join with different separator
path = "/".join(["home", "user", "documents", "file.txt"])
print("Path:", path)

# Join words with space
words_joined = " ".join(words)
print("Sentence:", words_joined)

# Join empty separator
chars = ["H", "e", "l", "l", "o"]
word = "".join(chars)
print("Word:", word)
# Split and join
# Demonstrates str.split() for breaking a string into a list and
# str.join() for assembling a list of strings back into one string.

# Split string into list
# NOTE(review): the sample literal was missing in the source (bare
# `text =`); any comma-separated string works for this demonstration.
text = "apple,banana,cherry"
fruits = text.split(",")
print("Split by comma:", fruits)

sentence = "The quick brown fox"
words = sentence.split()  # Default: split by whitespace
print("Split by space:", words)

multiline = "Line1\nLine2\nLine3"
lines = multiline.split("\n")
print("Split by newline:", lines)

# Split with limit
data = "a:b:c:d:e"
parts = data.split(":", 2)  # Split into max 3 parts; the rest stays joined
print("Limited split:", parts)

# Join list into string
# Join with separator (the separator is the string join() is called on)
fruits_joined = ", ".join(fruits)
print("\nJoined:", fruits_joined)

# Join with different separator
path = "/".join(["home", "user", "documents", "file.txt"])
print("Path:", path)

# Join words with space
words_joined = " ".join(words)
print("Sentence:", words_joined)

# Join empty separator
chars = ["H", "e", "l", "l", "o"]
word = "".join(chars)
print("Word:", word)
# Split and join
# Demonstrates str.split() for breaking a string into a list and
# str.join() for assembling a list of strings back into one string.

# Split string into list
# NOTE(review): the sample literal was missing in the source (bare
# `text =`); any comma-separated string works for this demonstration.
text = "apple,banana,cherry"
fruits = text.split(",")
print("Split by comma:", fruits)

sentence = "The quick brown fox"
words = sentence.split()  # Default: split by whitespace
print("Split by space:", words)

multiline = "Line1\nLine2\nLine3"
lines = multiline.split("\n")
print("Split by newline:", lines)

# Split with limit
data = "a:b:c:d:e"
parts = data.split(":", 2)  # Split into max 3 parts; the rest stays joined
print("Limited split:", parts)

# Join list into string
# Join with separator (the separator is the string join() is called on)
fruits_joined = ", ".join(fruits)
print("\nJoined:", fruits_joined)

# Join with different separator
path = "/".join(["home", "user", "documents", "file.txt"])
print("Path:", path)

# Join words with space
words_joined = " ".join(words)
print("Sentence:", words_joined)

# Join empty separator
chars = ["H", "e", "l", "l", "o"]
word = "".join(chars)
print("Word:", word)
strip_replace.py
# Strip and replace
# Walks through the strip family, str.replace(), and the case-conversion
# methods. Each call returns a new string; the originals are untouched.

# Strip whitespace: with no argument the strip family trims whitespace.
text1 = " Hello World "
print(f"Original: {text1!r}")
print(f"strip(): {text1.strip()!r}")
print(f"lstrip(): {text1.lstrip()!r}")
print(f"rstrip(): {text1.rstrip()!r}")

# Strip specific characters: the argument is the set of characters to trim.
text2 = "***Hello***"
without_stars = text2.strip("*")
print("\nStrip asterisks: " + without_stars)
text3 = "...Hello..."
without_dots = text3.strip(".")
print("Strip dots: " + without_dots)

# Replace substring
sentence = "I love Java"
updated = sentence.replace("Java", "Python")
print(f"\nReplace: {updated}")

# Replace all occurrences (the default when no count is given)
text4 = "Hello Hello Hello"
replaced = text4.replace("Hello", "Hi")
print(f"Replace all: {replaced}")

# Replace with count limit: only the first two matches are swapped
text5 = "one one one one"
replaced_limit = text5.replace("one", "two", 2)
print(f"Replace 2: {replaced_limit}")

# Remove substring (replace with empty)
text6 = "Hello, World!"
no_comma = text6.replace(",", "")
print(f"Remove comma: {no_comma}")

# Case conversion: pair each label with its converted text, then print.
text = "Hello World"
case_variants = (
    ("\nUpper", text.upper()),
    ("Lower", text.lower()),
    ("Title", text.title()),
    ("Capitalize", text.capitalize()),
    ("Swapcase", text.swapcase()),
)
for label, converted in case_variants:
    print(f"{label}: {converted}")
find_index.py
# Find and index
# Demonstrates locating substrings with find()/rfind()/index(), counting
# them with count(), and a small practical use on an email address.

# Find substring
text = "Hello World Hello"

# find() returns index or -1
pos1 = text.find("World")
print(f"find('World'): {pos1}")
pos2 = text.find("Python")
print(f"find('Python'): {pos2}")  # -1 not found

# Find from specific position
pos3 = text.find("Hello", 1)  # Start search at index 1
print(f"find('Hello', 1): {pos3}")  # Finds second occurrence

# rfind() searches from right
pos4 = text.rfind("Hello")
print(f"rfind('Hello'): {pos4}")  # Last occurrence

# Index (raises exception if not found)
try:
    pos5 = text.index("World")
    print(f"\nindex('World'): {pos5}")
    # This will raise ValueError, so pos6 is never actually bound
    pos6 = text.index("Python")
except ValueError as e:
    print(f"Error: {e}")

# Count occurrences
count1 = text.count("Hello")
print(f"\ncount('Hello'): {count1}")
count2 = text.count("o")
print(f"count('o'): {count2}")
count3 = text.count("xyz")
print(f"count('xyz'): {count3}")

# Practical: check if substring exists
email = "user@example.com"
if "@" in email and "." in email:
    print(f"\n'{email}' looks like an email")

# Using find: look the position up once and reuse it for the slice
at_pos = email.find("@")
if at_pos != -1:
    domain = email[at_pos + 1:]
    print(f"Domain: {domain}")
validation.py
# String validation methods
# Demonstrates the str.is*() predicates. Each returns a bool and never
# raises, which makes them convenient for checking user input.

# Digit checking
age_str = "25"
invalid_age = "25.5"
print(f"'{age_str}'.isdigit(): {age_str.isdigit()}")
print(f"'{invalid_age}'.isdigit(): {invalid_age.isdigit()}")

# Validate numeric input
# isdigit() is False for a decimal point, a leading sign, and the empty
# string, so only "123" passes below.
inputs = ["123", "abc", "12.34", "-5", ""]
print("\nDigit validation:")
for inp in inputs:
    if inp.isdigit():
        print(f" '{inp}' is valid positive integer")
    else:
        print(f" '{inp}' is NOT a digit string")

# Alpha checking
name = "John"
mixed = "John123"
print(f"\n'{name}'.isalpha(): {name.isalpha()}")
print(f"'{mixed}'.isalpha(): {mixed.isalpha()}")

# Check if valid name (letters only)
names = ["Alice", "Bob123", "Charlie-", "Diana"]
print("\nValid names (letters only):")
for n in names:
    if n.isalpha():
        print(f" {n}")

# Alphanumeric checking
username = "user123"
invalid_user = "user@123"
print(f"\n'{username}'.isalnum(): {username.isalnum()}")
print(f"'{invalid_user}'.isalnum(): {invalid_user.isalnum()}")

# Case checking
text1 = "HELLO"
text2 = "hello"
text3 = "Hello"
print(f"\n'{text1}'.isupper(): {text1.isupper()}")
print(f"'{text2}'.islower(): {text2.islower()}")
print(f"'{text3}'.istitle(): {text3.istitle()}")

# Space checking
# isspace() needs at least one character, so the empty string is False.
spaces = " "
empty = ""
text = "Hello"
print(f"\n'{spaces}'.isspace(): {spaces.isspace()}")
print(f"'{empty}'.isspace(): {empty.isspace()}")
print(f"'{text}'.isspace(): {text.isspace()}")
# Practical: validate username
def validate_username(username):
    """Check a username against the length and character rules.

    Rules, applied in order: 3-20 characters long, letters and digits
    only, and not made up entirely of digits.

    Args:
        username: The candidate username string.

    Returns:
        A ``(valid, message)`` tuple. ``valid`` is True only when every
        rule passes; ``message`` describes the first rule that failed, or
        reads "Valid username" on success.
    """
    if not 3 <= len(username) <= 20:
        return False, "Length must be 3-20 characters"
    if not username.isalnum():
        return False, "Only letters and numbers allowed"
    # Checked after isalnum(), so this only fires for all-digit names.
    if username.isdigit():
        return False, "Cannot be all digits"
    return True, "Valid username"
# Run validate_username() over a mix of valid and invalid sample names
# and print a check or cross mark with the returned message for each.
test_users = [
    "alice",
    "bob123",
    "a",
    "user@name",
    "12345",
    "VeryLongUsernameThatExceedsLimit",
]
print("\nUsername validation:")
for user in test_users:
    valid, message = validate_username(user)
    status = "✓" if valid else "✗"
    print(f" {status} '{user}': {message}")
startswith_endswith.py
# Startswith and endswith
# Demonstrates prefix/suffix tests. Both methods accept either a single
# string or a tuple of alternatives.

# Check prefix
filename = "document.txt"
if filename.startswith("doc"):
    print(f"'{filename}' starts with 'doc'")
if filename.startswith("image"):
    print("This won't print")
else:
    print(f"'{filename}' doesn't start with 'image'")

# Check multiple prefixes: a tuple matches if any one prefix matches
url = "https://example.com"
if url.startswith(("http://", "https://")):
    print(f"'{url}' is a valid URL")

# Check suffix
files = ["report.pdf", "data.csv", "image.png", "script.py"]
print("\nPython files:")
for file in files:
    if file.endswith(".py"):
        print(f" {file}")

print("\nData files:")
for file in files:
    if file.endswith((".csv", ".json", ".xml")):
        print(f" {file}")
# Practical: file type detection
def get_file_type(filename):
    """Determine file type by extension.

    The comparison is case-insensitive, so "PHOTO.JPG" and "photo.jpg"
    are classified the same way.

    Args:
        filename: File name (or path) whose suffix is inspected.

    Returns:
        One of "Image", "Text", "Code", "Document", or "Unknown" when the
        suffix matches none of the known extensions.
    """
    # Lower-case once so every endswith() test ignores extension case.
    lowered = filename.lower()
    if lowered.endswith((".jpg", ".png", ".gif")):
        return "Image"
    if lowered.endswith((".txt", ".md", ".log")):
        return "Text"
    if lowered.endswith((".py", ".java", ".js")):
        return "Code"
    if lowered.endswith((".pdf", ".doc", ".docx")):
        return "Document"
    return "Unknown"
# Classify a handful of sample file names with get_file_type().
test_files = ["photo.jpg", "script.py", "README.md", "report.pdf", "data.db"]
print("\nFile types:")
for file in test_files:
    print(f" {file}: {get_file_type(file)}")
# Remove extension
def remove_extension(filename):
    """Remove file extension.

    Only the last dot-separated suffix is dropped, so "archive.tar.gz"
    becomes "archive.tar". A name with no dot, or whose only dot is the
    leading character (e.g. ".bashrc"), is returned unchanged.

    Args:
        filename: The file name to trim.

    Returns:
        The name without its final extension.
    """
    dot_pos = filename.rfind(".")
    # rfind() gives -1 when there is no dot and 0 for a leading-dot name;
    # in both cases there is no extension to strip.
    if dot_pos > 0:
        return filename[:dot_pos]
    return filename
# Show each sample file name next to its extension-less form.
print("\nWithout extensions:")
for file in test_files:
    print(f" {file} → {remove_extension(file)}")
Characteristics
- Immutable: Methods return new strings
- Chainable: Can chain method calls
- Unicode-aware: Work with international text
- Many methods: Rich standard library
split_join
Breaking strings into lists and combining lists into strings
strip_replace
Removing whitespace and substituting substrings
find_search
Finding substrings and their positions in strings
validation
Checking string content with isdigit(), isalpha(), and related methods
prefix_suffix
Checking if strings start or end with specific substrings
Exercise: practical.py
Build a text processing pipeline that cleans user input, parses CSV data, and formats phone numbers