Regular Expressions in Python

Manish Patel

IMPORT regex

import re

REGEX VISUALIZER

REGEX OPERATIONS

regex operations

findall() function

  • This is a function of the built-in ‘re’ module, which handles regular expressions.
re.findall(pattern, string, flags=0)
  • Pattern is the regular expression.
  • String is the input string provided by the user.
  • Flags are used to modify the standard pattern behavior.
  • The string is scanned from left to right, and all non-overlapping matches of the pattern are returned in the order found. However, the form of the result depends on the pattern.
  • If the pattern has no capturing groups, the findall() function returns a list of strings that match the whole pattern.
  • If the pattern has one capturing group, the findall() function returns a list of strings that match the group.
  • If the pattern has multiple capturing groups, the findall() function returns a list of tuples of strings, one tuple per match, holding the text matched by each group.
  • It’s important to note that the non-capturing groups do not affect the form of the return result.

findall

PATTERN

import re

# The pattern has no capturing groups, so findall() returns the full text of
# every match as a list of strings.
s = 'I am python'
pattern = "py"
matches = re.compile(pattern).findall(s)
print(matches)
['py']

METACHARACTERS

metacharacters

[]

import re

txt = "The rain in Spain"
# A set with a range: collect every lowercase letter from a through m, in the
# order the letters appear in the text.
x = [hit.group() for hit in re.finditer("[a-m]", txt)]
print(x)
['h', 'e', 'a', 'i', 'i', 'a', 'i']
import re

txt = "The rain in Spain"
# A leading "^" negates the set: match every character that is NOT a
# lowercase letter a-m and NOT a space. Inside [...] the members are simply
# listed one after another; commas and quotes would be matched literally, so
# the space is written directly.
x = re.findall(r"[^a-m ]", txt)
print(x)
['T', 'r', 'n', 'n', 'S', 'p', 'n']

\

import re

txt = "That will be 59 dollars"
# Find all digit characters. The pattern is a raw string so the backslash
# reaches the regex engine untouched instead of being read as an (invalid)
# Python string escape.
x = re.findall(r"\d", txt)
print(x)
['5', '9']

.

import re

txt = "hello planet hem9o he o"
# Each "." stands for exactly one arbitrary character, so this looks for
# "he", then any two characters, then "o".
two_between = re.compile("he..o")
x = two_between.findall(txt)
print(x)
['hello', 'hem9o']

^ Caret (Start With)

import re

txt = "hello54 hello65 planet"
# Check if the string starts with 'hello6'. "^" anchors the pattern to the
# very beginning of the string, so the later word "hello65" cannot satisfy
# it and the result is an empty list.
x = re.findall(r"^hello6", txt)
if x:
    print("Yes, the string starts with 'hello6'")
    print(x)
else:
    print("No match")
No match

$ Dollar (end with)

import re

txt = "hello planet"
# "$" anchors the pattern to the end of the string.
x = re.findall("planet$", txt)
print("Yes, the string ends with 'planet'" if x else "No match")
Yes, the string ends with 'planet'

*

import re

txt = "hello planet"
# "*" lets the preceding "." repeat zero or more times: find "he", then any
# run of characters (possibly empty), then "o".
star_pattern = re.compile("he.*o")
x = star_pattern.findall(txt)
print(x)
['hello']

+

import re

txt = "hello planet"
# "+" requires the preceding "." to occur at least once: find "he", then one
# or more characters, then "o".
x = [hit.group(0) for hit in re.finditer("he.+o", txt)]
print(x)
['hello']

?

import re

txt = "hello planet"
# "?" makes the preceding "." optional: find "he", then at most one
# character, then "o". "hello" has two characters in between, so nothing
# matches here.
x = re.compile("he.?o").findall(txt)
print(x)
[]

|

import re

txt = "The rain in Spain falls mainly in the plain!"
# "|" is alternation: the pattern matches wherever either "falls" or "stays"
# occurs in the string.
x = re.findall("falls|stays", txt)
print(x)
print("Yes, there is at least one match!" if x else "No match")
['falls']
Yes, there is at least one match!

() Group

import re

email = "john@example.com"
pattern = r"([a-z]+)@([a-z]+)\.com"
# Apply the pattern and extract the capture groups.
match = re.match(pattern, email)
if match:
    # group() with no argument is the whole match; group(1) and group(2) are
    # the first and second parenthesised groups. Every access to the match
    # happens inside this branch because re.match() returns None when the
    # string does not match.
    print(match.group())
    username = match.group(1)
    domain = match.group(2)
    print("Username:", username)
    print("Domain:", domain)
else:
    print("Email address is not valid")
john@example.com
Username: john
Domain: example

SPECIAL SEQUENCES

special sequences

CHARACTER CLASS

characterclass

\A

Returns a match if the specified characters are at the beginning of the string

import re

txt = "The rain in Spain"
# Check if the string starts with "The". \A matches only at the very start
# of the string; the raw string keeps the backslash intact for the regex
# engine.
x = re.findall(r"\AThe", txt)
print(x)
if x:
    print("Yes, there is a match!")
else:
    print("No match")
['The']
Yes, there is a match!

\b

Returns a match where the specified characters are at the beginning or at the end of a word (the “r” in the beginning ensures the string is being treated as a “raw string”)

import re

txt = "The rain in Spain"
# Check if "ain" is present at the end of a WORD. \b is a zero-width word
# boundary, so "ain" must be followed by a non-word character or the end of
# the string.
x = re.findall(r"ain\b", txt)
print(x)
if x:
    print("Yes, there is at least one match!")
else:
    print("No match")
['ain', 'ain']
Yes, there is at least one match!

\B

Returns a match where the specified characters are present, but NOT at the beginning (or at the end) of a word (the “r” in the beginning is making sure that the string is being treated as a “raw string”)

import re

txt = "The rain in Spain"
# Check if "ain" is present, but NOT at the beginning of a word. \B matches
# only where there is no word boundary, so a word character must come
# immediately before "ain".
x = re.findall(r"\Bain", txt)
print(x)
if x:
    print("Yes, there is at least one match!")
else:
    print("No match")
['ain', 'ain']
Yes, there is at least one match!

\d

Returns a match where the string contains digits (numbers from 0-9)

import re

txt = "The rain in Spain3"
# Check if the string contains any digits (0-9). The pattern is a raw string
# so the backslash is passed to the regex engine unchanged.
x = re.findall(r"\d", txt)
print(x)
if x:
    print("Yes, there is at least one match!")
else:
    print("No match")
['3']
Yes, there is at least one match!

\D

Returns a match for every character in the string that is NOT a digit

import re

txt = "The rain in 33 Spain"
# Return a match at every non-digit character (spaces included). The pattern
# is a raw string so the backslash is passed to the regex engine unchanged.
x = re.findall(r"\D", txt)
print(x)
if x:
    print("Yes, there is at least one match!")
else:
    print("No match")
['T', 'h', 'e', ' ', 'r', 'a', 'i', 'n', ' ', 'i', 'n', ' ', ' ', 'S', 'p', 'a', 'i', 'n']
Yes, there is at least one match!

\s

Returns a match where the string contains a white space character

import re

txt = "The rain in Spain"
# Return a match at every white-space character. The pattern is a raw string
# so the backslash is passed to the regex engine unchanged.
x = re.findall(r"\s", txt)
print(x)
if x:
    print("Yes, there is at least one match!")
else:
    print("No match")
[' ', ' ', ' ']
Yes, there is at least one match!

\S

Returns a match for every character in the string that is NOT a white space character

import re

txt = "The rain in Spain"
# Return a match at every NON white-space character. The pattern is a raw
# string so the backslash is passed to the regex engine unchanged.
x = re.findall(r"\S", txt)
print(x)
if x:
    print("Yes, there is at least one match!")
else:
    print("No match")
['T', 'h', 'e', 'r', 'a', 'i', 'n', 'i', 'n', 'S', 'p', 'a', 'i', 'n']
Yes, there is at least one match!

\w

Returns a match where the string contains any word characters (letters a–z and A–Z, digits 0–9, and the underscore _ character)

import re

txt = "The rain in Spain 35 _"
# Return a match at every word character: here the letters, the digits and
# the underscore are matched, while the spaces are skipped. The pattern is a
# raw string so the backslash is passed to the regex engine unchanged.
x = re.findall(r"\w", txt)
print(x)
if x:
    print("Yes, there is at least one match!")
else:
    print("No match")
['T', 'h', 'e', 'r', 'a', 'i', 'n', 'i', 'n', 'S', 'p', 'a', 'i', 'n', '3', '5', '_']
Yes, there is at least one match!

\W

Returns a match for every character in the string that is NOT a word character

import re

txt = "The rain in % Spain"
# Return a match at every NON word character, i.e. anything that is not a
# letter, digit or underscore (such as "!", "?", "%" or white space). The
# pattern is a raw string so the backslash is passed through unchanged.
x = re.findall(r"\W", txt)
print(x)
if x:
    print("Yes, there is at least one match!")
else:
    print("No match")
[' ', ' ', ' ', '%', ' ']
Yes, there is at least one match!

\Z

Returns a match if the specified characters are at the end of the string

import re

txt = "The rain in Spain"
# Check if the string ends with "Spain". \Z matches only at the very end of
# the string; the raw string keeps the backslash intact for the regex
# engine.
x = re.findall(r"Spain\Z", txt)
print(x)
if x:
    print("Yes, there is a match!")
else:
    print("No match")
['Spain']
Yes, there is a match!

Sets

This is a set of characters enclosed in square brackets [] with a special meaning.

[arn]

This will return a match where one of the specified characters (a, r, or n) are present.

import re

txt = "The rain in Spain"
# A set matches any ONE of the characters listed in it: here a, r or n.
x = re.compile("[arn]").findall(txt)
print(x)
print("Yes, there is at least one match!" if x else "No match")
['r', 'a', 'n', 'n', 'a', 'n']
Yes, there is at least one match!

[a-n]

This will return a match for any lower case character, alphabetically between a and n.

import re

txt = "The rain in Spain"
# A range inside a set: any single lowercase letter from a through n.
x = [hit.group() for hit in re.finditer("[a-n]", txt)]
print(x)
verdict = "Yes, there is at least one match!" if x else "No match"
print(verdict)
['h', 'e', 'a', 'i', 'n', 'i', 'n', 'a', 'i', 'n']
Yes, there is at least one match!

[^arn]

This will return a match for any character EXCEPT a, r, and n.

import re

txt = "The rain in Spain"
# A leading "^" inverts the set: every character other than a, r and n
# matches, spaces included.
not_arn = re.compile("[^arn]")
x = not_arn.findall(txt)
print(x)
print("Yes, there is at least one match!" if x else "No match")
['T', 'h', 'e', ' ', 'i', ' ', 'i', ' ', 'S', 'p', 'i']
Yes, there is at least one match!

[0123]

This will return a match where any of the specified digits (0, 1, 2, or 3) are present.

import re

txt = "The rain3 in Spain"
# A set of literal digits: match any single 0, 1, 2 or 3.
# (The range form [0-9] would accept every digit instead.)
x = re.findall("[0123]", txt)
print(x)
message = "Yes, there is at least one match!" if x else "No match"
print(message)
['3']
Yes, there is at least one match!

[0-5][0-9]

This will return a match for any two-digit number from 00 to 59.

import re

txt = "8 times before 11:45 AM"
# Two sets back to back: a first digit 0-5 followed by any digit 0-9, i.e.
# the two-digit numbers 00 through 59.
minute_like = re.compile("[0-5][0-9]")
x = minute_like.findall(txt)
print(x)
['11', '45']

[a-zA-Z]

This will return a match for any character alphabetically between a and z, lower case OR upper case.

import re

txt = "8 times before 11:45 AM"
# Two ranges in one set: any single letter, lower case a-z or upper case
# A-Z.
x = [hit.group() for hit in re.finditer("[a-zA-Z]", txt)]
print(x)
['t', 'i', 'm', 'e', 's', 'b', 'e', 'f', 'o', 'r', 'e', 'A', 'M']

+

In sets, +, *, ., |, (), $ and {} have no special meaning. So, [+] means: return a match for any + character in the string.

import re

txt = "8 times before+ 11:45 AM"
# Inside a set "+" is an ordinary character, so this finds literal plus
# signs.
x = re.compile("[+]").findall(txt)
print(x)
['+']

PATTERN WITH SINGLE GROUP

import re

s = "black, blue and brown"
# One capturing group: for each match only the text captured by the group
# (the part of the word after "bl") ends up in the result.
pattern = r'bl(\w+)'
matches = [hit.group(1) for hit in re.finditer(pattern, s)]
print(matches)
['ack', 'ue']

Pattern with multiple groups

import re

s = "black, blue and brown"
# Two capturing groups (the outer one wraps the whole word, the inner one the
# part after "bl"): each match contributes one tuple with both captures.
pattern = r'(bl(\w+))'
matches = [hit.groups(default='') for hit in re.finditer(pattern, s)]
print(matches)
[('black', 'ack'), ('blue', 'ue')]

Using regular expression flag

import re

s = "Black, blue and brown"
pattern = r'(bl(\w+))'
# re.IGNORECASE lets the lowercase "bl" in the pattern match the capitalised
# "Bl" in "Black" as well.
case_blind = re.compile(pattern, re.IGNORECASE)
matches = case_blind.findall(s)
print(matches)
[('Black', 'ack'), ('blue', 'ue')]

finditer() function

  • This function matches a pattern in a string and returns an iterator yielding Match objects for all non-overlapping matches.
  • Syntax:
    • re.finditer(pattern, string, flags=0)
  • Pattern is the regular expression.
  • String is the input string provided by the user.
  • Flag is optional and by default is zero. It accepts one or more RegEx flags. The flags parameter changes how the RegEx engine matches the pattern.

finditer

EXAMPLE

import re

s = 'Readability counts.'
pattern = r'[aeoui]'
# finditer() is lazy: it hands out one Match object per vowel as the loop
# asks for it, each carrying the matched text and its span in the string.
matches = re.compile(pattern).finditer(s)
for match in matches:
    print(match)
<re.Match object; span=(1, 2), match='e'>
<re.Match object; span=(2, 3), match='a'>
<re.Match object; span=(4, 5), match='a'>
<re.Match object; span=(6, 7), match='i'>
<re.Match object; span=(8, 9), match='i'>
<re.Match object; span=(13, 14), match='o'>
<re.Match object; span=(14, 15), match='u'>

search() Function

The search() function scans the string from left to right and finds the first location where the pattern produces a match. It returns a Match object if the search was successful or None otherwise. re.search(pattern, string, flags=0)

  • Pattern is the regular expression.
  • String is the input string provided by the user.
  • Flags are used to modify the standard behavior of the pattern.

image.png

EXAMPLE

import re

s = 'Python 3 was released on Dec 3, 2008'
# One or more digits. The pattern is a raw string so the backslash reaches
# the regex engine untouched instead of being read as an (invalid) Python
# string escape.
pattern = r'\d+'
# search() stops at the first location where the pattern matches.
match = re.search(pattern, s)
if match is not None:
    print(match.group())
else:
    print('No match found')
3

First word match

import re

s = 'CPython, IronPython, or Cython'
# The pattern has two capturing groups, numbered by their opening
# parenthesis:
#   group 1  ((\w+)thon)  - captures the whole word.
#   group 2  (\w+)        - captures the characters at the beginning of the
#                           word.
pattern = r'\b((\w+)thon)\b'
match = re.compile(pattern).search(s)
if match:
    print(match.groups())
('CPython', 'CPy')

fullmatch() function

  • This function will return a Match object if the whole string matches a regular expression’s search pattern, or None otherwise.

  • Syntax: re.fullmatch(pattern, string, flags=0)

  • Pattern is the regular expression.

  • String is the input string provided by the user.

  • Flag is optional and by default is zero. It accepts one or more RegEx flags. The flags parameter changes how the RegEx engine matches the pattern.

EMAIL VALIDATION

email valid

import re

email = 'no-reply@pythontutorial.net'
# Local part, "@", domain, then a dot and a top-level domain of two or more
# letters. Inside [...] a "|" is a literal character rather than
# alternation, so the TLD set is written [A-Za-z]; otherwise an address
# whose TLD contains "|" would be accepted.
pattern = r'[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}'
# fullmatch() succeeds only when the entire string matches the pattern.
match = re.fullmatch(pattern, email)
if match is not None:
    print(f'The email "{match.group()}" is valid')
else:
    print(f'The email "{email}" is not valid')
The email "no-reply@pythontutorial.net" is valid

Match() Function

  • The match function of the re module allows you to search for a pattern at the beginning of the string.
  • Syntax:
    re.match(pattern, string, flags=0)
  • Pattern is the regular expression.
  • String is the input string provided by the user.
  • Flags are used to modify the standard behavior of the pattern.

match

rematch

import re

s = '3 pieces cost 5 USD'
# match() only tries the pattern at the beginning of the string, so this
# asks whether the string starts with a digit.
pattern = r'\d{1}'
match = re.compile(pattern).match(s)
if match:
    print(f'The string starts with a digit {match.group()}')
The string starts with a digit 3

sub()

re.sub(pattern, repl, string, count=0, flags=0)
  • Pattern is a regular expression or Pattern object.
  • Repl is the replacement.
  • String is the input string provided by the user.
  • Count specifies the maximum number of matches that the sub() function should replace. If you pass zero or skip it, the sub() function will replace all the matches.
  • Flags is one or more RegEx flags that modify the standard pattern behaviour.

sub

To turn the phone number (212)-456-7890 into 2124567890

import re

phone_no = '(212)-456-7890'
# Every non-digit character. The pattern is a raw string so the backslash
# reaches the regex engine untouched instead of being read as an (invalid)
# Python string escape.
pattern = r'\D'
# Replacing each non-digit with the empty string leaves only the digits.
result = re.sub(pattern, '', phone_no)
print(result)
2124567890

escape() function

  • This function will return a copy of the string in which the characters that have a special meaning in regular expressions are escaped with a backslash. This is useful if you want to match an arbitrary literal string that may have regular expression metacharacters in it.
  • Syntax:
    re.escape(string)

escape

import re

# escape() puts a backslash in front of the characters that would otherwise
# act as regex syntax, so each string can be used as a literal pattern.
for literal in ("This is Awesome even 1 AM",
                "I Asked what is this [a-9], he said \t ^WoW"):
    print(re.escape(literal))
This\ is\ Awesome\ even\ 1\ AM
I\ Asked\ what\ is\ this\ \[a\-9\],\ he\ said\ \    \ \^WoW

Compile() Function

  • This function will compile the regular expressions into pattern objects, which have methods for various operations such as searching for pattern matches or performing string substitutions.
  • Syntax:
    re.compile(pattern, flags=0)

compile

EXAMPLE

import re

# Compile the expression once; the resulting pattern object offers the
# matching operations as methods.
p = re.compile('[a-e]')
# Pattern.findall() scans the string and returns every match in a list.
found = p.findall("Aye, said Mr. Gibenson Stark")
print(found)
['e', 'a', 'd', 'b', 'e', 'a']

Split() Function

  • It splits a string by the matches of a regular expression.
  • Syntax:
    re.split(pattern, string, maxsplit=0, flags=0)
  • Pattern is the regular expression.
  • String is the input string provided by the user.
  • Flag is optional and by default is zero. It accepts one or more RegEx flags. The flags parameter changes how the RegEx engine matches the pattern.
  • maxsplit is the maximum number of splits to perform; the default of zero means no limit. Generally, if maxsplit is one, the resulting list will have at most two elements. If maxsplit is two, the resulting list will have at most three elements, and so on.

image.png

EXAMPLE

import re

s = 'A! B. C D'
# \W+ matches each run of non-word characters (punctuation and spaces); the
# string is cut at every such run.
pattern = r'\W+'
l = re.compile(pattern).split(s)
print(l)
['A', 'B', 'C', 'D']

FLAGS

FLAGS

1. re.A or re.ASCII (ASCII-only matching):

import re

# Under re.ASCII (short form re.A) \w only accepts ASCII word characters, so
# the Chinese characters in the text are not picked up.
pattern = re.compile(r'\w+', re.A)
text = "Hello, 你好"
matches = [hit.group() for hit in pattern.finditer(text)]
print(matches)  # Output: ['Hello']
['Hello']

2. re.I or re.IGNORECASE (case-insensitive matching):

import re

# re.IGNORECASE (short form re.I) makes letters match regardless of case, so
# the lowercase pattern finds the capitalised word.
pattern = re.compile(r'hello', re.I)
text = "Hello, World!"
match = pattern.search(text)
found = match.group()
print(found)  # Output: 'Hello'
Hello

3. re.M or re.MULTILINE (multi-line matching):

import re

# With re.MULTILINE (short form re.M) "^" matches at the start of every
# line, not just at the start of the whole string.
pattern = re.compile(r'^\w+', re.M)
text = "Line 1\nLine 2\nLine 3"
matches = [hit.group() for hit in pattern.finditer(text)]
print(matches)  # Output: ['Line', 'Line', 'Line']
['Line', 'Line', 'Line']

4. re.S or re.DOTALL (dot matches any character including newline):

import re

# With re.DOTALL (short form re.S) "." also matches the newline character,
# so ".+" runs across all three lines.
pattern = re.compile(r'.+', re.S)
text = "Line 1\nLine 2\nLine 3"
match = pattern.search(text)
whole_text = match.group()
print(whole_text)  # Output: 'Line 1\nLine 2\nLine 3'
Line 1
Line 2
Line 3

5. re.X or re.VERBOSE (allow comments in regex):

import re

# re.VERBOSE (short form re.X) lets the pattern be laid out over several
# lines: whitespace in the pattern is ignored and "#" starts a comment.
pattern = re.compile(r'''
    \d+     # Match one or more digits
    \s*     # Match zero or more whitespace characters
    [a-zA-Z]+  # Match one or more letters
''', re.X)
text = "123   ABC"
match = pattern.match(text)
matched_text = match.group()
print(matched_text)  # Output: '123   ABC'
123   ABC