Text data often needs cleaning, parsing, and transformation before use. Python strings have built-in methods for splitting CSV data, removing whitespace, validating user input, and searching for patterns. These methods form the foundation of text processing in Python.

Prefix and Suffix Checking

split_join.py
# Split and join
# Demonstrates str.split() (string -> list) and str.join() (list -> string).


# Split string into list
# NOTE(review): the sample value was missing here (a bare `text =`, which is
# a syntax error). This comma-separated stand-in makes the script runnable;
# confirm against the original lesson material.
text = "apple,banana,cherry"
fruits = text.split(",")
print("Split by comma:", fruits)

sentence = "The quick brown fox"
words = sentence.split()  # Default: split by whitespace
print("Split by space:", words)

multiline = "Line1\nLine2\nLine3"
lines = multiline.split("\n")
print("Split by newline:", lines)

# Split with limit
data = "a:b:c:d:e"
# maxsplit=2 performs at most two cuts, so the tail "c:d:e" stays intact.
parts = data.split(":", 2)  # Split into max 3 parts
print("Limited split:", parts)

# Join list into string
# Join with separator (the separator string is the receiver of .join())
fruits_joined = ", ".join(fruits)
print("\nJoined:", fruits_joined)

# Join with different separator
path = "/".join(["home", "user", "documents", "file.txt"])
print("Path:", path)

# Join words with space
words_joined = " ".join(words)
print("Sentence:", words_joined)

# Join empty separator (plain concatenation of the items)
chars = ["H", "e", "l", "l", "o"]
word = "".join(chars)
print("Word:", word)

# Split and join
# Demonstrates str.split() (string -> list) and str.join() (list -> string).


# Split string into list
# NOTE(review): the sample value was missing here (a bare `text =`, which is
# a syntax error). This comma-separated stand-in makes the script runnable;
# confirm against the original lesson material.
text = "apple,banana,cherry"
fruits = text.split(",")
print("Split by comma:", fruits)

sentence = "The quick brown fox"
words = sentence.split()  # Default: split by whitespace
print("Split by space:", words)

multiline = "Line1\nLine2\nLine3"
lines = multiline.split("\n")
print("Split by newline:", lines)

# Split with limit
data = "a:b:c:d:e"
# maxsplit=2 performs at most two cuts, so the tail "c:d:e" stays intact.
parts = data.split(":", 2)  # Split into max 3 parts
print("Limited split:", parts)

# Join list into string
# Join with separator (the separator string is the receiver of .join())
fruits_joined = ", ".join(fruits)
print("\nJoined:", fruits_joined)

# Join with different separator
path = "/".join(["home", "user", "documents", "file.txt"])
print("Path:", path)

# Join words with space
words_joined = " ".join(words)
print("Sentence:", words_joined)

# Join empty separator (plain concatenation of the items)
chars = ["H", "e", "l", "l", "o"]
word = "".join(chars)
print("Word:", word)

# Split and join
# Demonstrates str.split() (string -> list) and str.join() (list -> string).


# Split string into list
# NOTE(review): the sample value was missing here (a bare `text =`, which is
# a syntax error). This comma-separated stand-in makes the script runnable;
# confirm against the original lesson material.
text = "apple,banana,cherry"
fruits = text.split(",")
print("Split by comma:", fruits)

sentence = "The quick brown fox"
words = sentence.split()  # Default: split by whitespace
print("Split by space:", words)

multiline = "Line1\nLine2\nLine3"
lines = multiline.split("\n")
print("Split by newline:", lines)

# Split with limit
data = "a:b:c:d:e"
# maxsplit=2 performs at most two cuts, so the tail "c:d:e" stays intact.
parts = data.split(":", 2)  # Split into max 3 parts
print("Limited split:", parts)

# Join list into string
# Join with separator (the separator string is the receiver of .join())
fruits_joined = ", ".join(fruits)
print("\nJoined:", fruits_joined)

# Join with different separator
path = "/".join(["home", "user", "documents", "file.txt"])
print("Path:", path)

# Join words with space
words_joined = " ".join(words)
print("Sentence:", words_joined)

# Join empty separator (plain concatenation of the items)
chars = ["H", "e", "l", "l", "o"]
word = "".join(chars)
print("Word:", word)

strip_replace.py
# Strip and replace
# Walks through str.strip()/lstrip()/rstrip(), str.replace(), and the
# case-conversion methods. Every call returns a new string.


# Trimming whitespace: repr() makes the surrounding spaces visible.
text1 = "   Hello World   "
for label, shown in (
    ("Original:", text1),
    ("strip():", text1.strip()),
    ("lstrip():", text1.lstrip()),
    ("rstrip():", text1.rstrip()),
):
    print(label, repr(shown))

# Trimming a chosen character from both ends
text2 = "***Hello***"
text3 = "...Hello..."
without_stars = text2.strip("*")
without_dots = text3.strip(".")
print("\nStrip asterisks:", without_stars)
print("Strip dots:", without_dots)

# Substitution: inputs first, then results, then the report
sentence = "I love Java"
text4 = "Hello Hello Hello"
text5 = "one one one one"
text6 = "Hello, World!"

updated = sentence.replace("Java", "Python")
replaced = text4.replace("Hello", "Hi")          # every occurrence
replaced_limit = text5.replace("one", "two", 2)  # only the first two
no_comma = text6.replace(",", "")                # empty replacement deletes

print("\nReplace:", updated)
print("Replace all:", replaced)
print("Replace 2:", replaced_limit)
print("Remove comma:", no_comma)

# Case conversion, reported in table order
text = "Hello World"
case_views = {
    "\nUpper:": text.upper(),
    "Lower:": text.lower(),
    "Title:": text.title(),
    "Capitalize:": text.capitalize(),
    "Swapcase:": text.swapcase(),
}
for label, shown in case_views.items():
    print(label, shown)

find_index.py
# Find and index
# Locating substrings with find()/rfind()/index() and counting them
# with count().


text = "Hello World Hello"

# str.find() gives the index of the first match, or -1 when there is none.
pos1 = text.find("World")
pos2 = text.find("Python")    # absent, so -1
pos3 = text.find("Hello", 1)  # starting at index 1 skips the leading "Hello"
# str.rfind() reports the last match instead of the first.
pos4 = text.rfind("Hello")

print(f"find('World'): {pos1}")
print(f"find('Python'): {pos2}")
print(f"find('Hello', 1): {pos3}")
print(f"rfind('Hello'): {pos4}")

# str.index() behaves like find() but raises instead of returning -1.
try:
    pos5 = text.index("World")
    print(f"\nindex('World'): {pos5}")

    pos6 = text.index("Python")  # not present: ValueError is raised here
except ValueError as err:
    print(f"Error: {err}")

# Occurrence counts
count1 = text.count("Hello")
count2 = text.count("o")
count3 = text.count("xyz")
print(f"\ncount('Hello'): {count1}")
print(f"count('o'): {count2}")
print(f"count('xyz'): {count3}")

# Practical: membership test with `in`
email = "user@example.com"

if all(marker in email for marker in ("@", ".")):
    print(f"\n'{email}' looks like an email")

# Practical: slice out everything after the "@"
at_pos = email.find("@")
if at_pos != -1:
    domain = email[at_pos + 1:]
    print(f"Domain: {domain}")

validation.py
# String validation methods
# Shows the str.is*() predicates: isdigit, isalpha, isalnum, isupper,
# islower, istitle and isspace.


# Digit checking
age_str, invalid_age = "25", "25.5"

for candidate in (age_str, invalid_age):
    print(f"'{candidate}'.isdigit(): {candidate.isdigit()}")

# Validate numeric input
inputs = ["123", "abc", "12.34", "-5", ""]

print("\nDigit validation:")
for inp in inputs:
    report = (
        f"  '{inp}' is valid positive integer"
        if inp.isdigit()
        else f"  '{inp}' is NOT a digit string"
    )
    print(report)

# Alpha checking
name, mixed = "John", "John123"

print(f"\n'{name}'.isalpha(): {name.isalpha()}")
print(f"'{mixed}'.isalpha(): {mixed.isalpha()}")

# Keep only the names made purely of letters
names = ["Alice", "Bob123", "Charlie-", "Diana"]

print("\nValid names (letters only):")
for n in filter(str.isalpha, names):
    print(f"  {n}")

# Alphanumeric checking
username, invalid_user = "user123", "user@123"

print(f"\n'{username}'.isalnum(): {username.isalnum()}")
print(f"'{invalid_user}'.isalnum(): {invalid_user.isalnum()}")

# Case checking
text1, text2, text3 = "HELLO", "hello", "Hello"

print(f"\n'{text1}'.isupper(): {text1.isupper()}")
print(f"'{text2}'.islower(): {text2.islower()}")
print(f"'{text3}'.istitle(): {text3.istitle()}")

# Space checking (the empty string is included for comparison)
spaces, empty, text = "   ", "", "Hello"

print(f"\n'{spaces}'.isspace(): {spaces.isspace()}")
print(f"'{empty}'.isspace(): {empty.isspace()}")
print(f"'{text}'.isspace(): {text.isspace()}")

# Practical: validate username
def validate_username(username):
    """Check a username and explain the verdict.

    Rules are applied in this order, and the first failure wins:
    length between 3 and 20 inclusive, ``str.isalnum()`` is true,
    ``str.isdigit()`` is false.

    Returns:
        A ``(is_valid, message)`` tuple.
    """
    if not 3 <= len(username) <= 20:
        return False, "Length must be 3-20 characters"
    if not username.isalnum():
        return False, "Only letters and numbers allowed"
    if username.isdigit():
        return False, "Cannot be all digits"
    return True, "Valid username"


# Sample usernames: two valid ones, then one per failure mode.
test_users = [
    "alice",
    "bob123",
    "a",
    "user@name",
    "12345",
    "VeryLongUsernameThatExceedsLimit",
]

print("\nUsername validation:")
for user in test_users:
    valid, message = validate_username(user)
    if valid:
        status = "✓"
    else:
        status = "✗"
    print(f"  {status} '{user}': {message}")

startswith_endswith.py
# Startswith and endswith
# Prefix/suffix tests; both methods also accept a tuple of alternatives.


# Check prefix
filename = "document.txt"

if filename.startswith("doc"):
    print(f"'{filename}' starts with 'doc'")

if not filename.startswith("image"):
    print(f"'{filename}' doesn't start with 'image'")
else:
    print("This won't print")

# Check multiple prefixes: a tuple matches if any member matches
url = "https://example.com"
web_schemes = ("http://", "https://")
if url.startswith(web_schemes):
    print(f"'{url}' is a valid URL")

# Check suffix
files = ["report.pdf", "data.csv", "image.png", "script.py"]

print("\nPython files:")
for file in files:
    if not file.endswith(".py"):
        continue
    print(f"  {file}")

print("\nData files:")
data_suffixes = (".csv", ".json", ".xml")
for file in files:
    if not file.endswith(data_suffixes):
        continue
    print(f"  {file}")

# Practical: file type detection
def get_file_type(filename):
    """Classify a file by its extension.

    Matching is case-insensitive, so ``"PHOTO.JPG"`` and ``"photo.jpg"``
    are both reported as ``"Image"``.

    Args:
        filename: File name (or path) whose ending is inspected.

    Returns:
        One of ``"Image"``, ``"Text"``, ``"Code"``, ``"Document"``, or
        ``"Unknown"`` when no listed extension matches.
    """
    # Each row pairs a tuple of suffixes with its label; str.endswith()
    # accepts the tuple directly, so one call tests the whole group.
    categories = (
        ((".jpg", ".png", ".gif"), "Image"),
        ((".txt", ".md", ".log"), "Text"),
        ((".py", ".java", ".js"), "Code"),
        ((".pdf", ".doc", ".docx"), "Document"),
    )
    # Normalize once so upper- or mixed-case extensions still match.
    lowered = filename.lower()
    for suffixes, label in categories:
        if lowered.endswith(suffixes):
            return label
    return "Unknown"


# Sample names: one per known category plus an unrecognized extension.
test_files = [
    "photo.jpg",
    "script.py",
    "README.md",
    "report.pdf",
    "data.db",
]

print("\nFile types:")
for file in test_files:
    kind = get_file_type(file)
    print(f"  {file}: {kind}")

# Remove extension
def remove_extension(filename):
    """Return ``filename`` without its final extension.

    Only the last ``.suffix`` is dropped (``"archive.tar.gz"`` becomes
    ``"archive.tar"``). A name with no dot, or whose only dot is the first
    character (a dotfile such as ``".bashrc"``), is returned unchanged.

    This works on bare file names; for full paths whose directories may
    contain dots, ``os.path.splitext`` is the safer tool.

    Args:
        filename: Bare file name to trim.

    Returns:
        The name up to, but not including, the last dot.
    """
    dot_pos = filename.rfind(".")
    # -1 means no dot at all; 0 means the dot is the leading character of a
    # dotfile, not an extension separator. Either way there is nothing to cut.
    if dot_pos > 0:
        return filename[:dot_pos]
    return filename


# Show each sample name next to its extension-less form.
print("\nWithout extensions:")
for file in test_files:
    stem = remove_extension(file)
    print(f"  {file} → {stem}")

Characteristics

  • Immutable: Methods return new strings
  • Chainable: Can chain method calls
  • Unicode-aware: Work with international text
  • Many methods: Rich standard library
split_join — Breaking strings into lists and combining lists into strings
strip_replace — Removing whitespace and substituting substrings
find_search — Finding substrings and their positions in strings
validation — Checking string content with isdigit(), isalpha(), and related methods
prefix_suffix — Checking if strings start or end with specific substrings

Exercise: practical.py

Build a text processing pipeline that cleans user input, parses CSV data, and formats phone numbers