目录
正则的目的
re模块是一个标准库,无需安装
官方文档:https://docs.python.org/3/library/re.html
- >>> import re
- >>> result = re.search("sanchuang","hello world,this is sanchuang") # 前面写匹配字段,后面写要匹配的字符串
- >>> result
- <_sre.SRE_Match object; span=(20, 29), match='sanchuang'> # 这是一个match对象
- >>> result = re.search("sanchuang1","hello world,this is sanchuang") # 这里没有匹配到字符串
- >>> result
- >>> print(result)
- None
-
- >>> result = re.search("san.*$","hello world,this is sanchuang") # 匹配"san"开头的往后所有的字符串
- >>> result
- <_sre.SRE_Match object; span=(20, 29), match='sanchuang'>
-
- >>> result = re.match("san.*$","hello world,this is sanchuang")
- >>> result
- >>> result = re.match("hello","hello world,this is sanchuang")
- >>> result
- <_sre.SRE_Match object; span=(0, 5), match='hello'>
-
- # 如果匹配到了就会返回一个match对象;如果没有匹配上那就返回一个None
- """
- """
- # match只能从字符串开头查找,开始的部分没有,那就匹配不上。也就是说只匹配字符串的开始
-
- >>> result = re.search("sanchuang","hello world,this is sanchuang sanchuang") # 这个匹配是一个字符一个字符匹配的
- >>> result
- <_sre.SRE_Match object; span=(20, 29), match='sanchuang'>
- >>> dir(result)
- ['__class__', '__copy__', '__deepcopy__', '__delattr__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', 'end', 'endpos', 'expand', 'group', 'groupdict', 'groups', 'lastgroup', 'lastindex', 'pos', 're', 'regs', 'span', 'start', 'string']
- >>> result.start()
- 20
- >>> result.end() # 匹配的是一个左闭右开区间
- 29
- >>> result.group() # 显示匹配的结果
- 'sanchuang'
如上代码所示:
作用:找到多个匹配
- """
- >>> msg = "i love pythonpython1python2"
- >>> re.findall("python",msg)
- ['python', 'python', 'python']
- >>> re.finditer("python",msg) # finditer出来的是一个迭代器
- >>> for i in re.finditer("python", msg):
- ... print(i)
- ...
- <_sre.SRE_Match object; span=(7, 13), match='python'>
- <_sre.SRE_Match object; span=(13, 19), match='python'>
- <_sre.SRE_Match object; span=(20, 26), match='python'>
- """
re.sub('匹配正则' , '替换内容' , 'string')
- >>> msg = "i love python1 pythonyy python123 pythontt"
-
- # \d,表示一个数字字符,相当于[0-9]
- >>> re.sub("python\d","**",msg) # 第一参数表示你要匹配的字符串,第二参数表示你要替换的字符串,第三个参数表示你要进行替换操作的字符串
- 'i love ** pythonyy **23 pythontt'
编译正则的特点:
- >>> msg = "i love python1 pythonyy python123 pythontt"
- >>> reg = re.compile("python[0-9]")
- >>> reg.findall(msg)
- ['python1', 'python1']
最简单的正则表达式是那些仅包含简单的字母数字字符的表达式,复杂的正则可以实现强大的匹配
- import re
- ret = re.findall("[Pp]y","Python3 pth ppython") # [Pp]只占一个字符的位置
- print(ret)
-
- ret1 = re.findall("[Pp]yf","Python3 pth ppython") # 没有匹配上就是一个空列表
- print(ret1)
- # 范围按ASCII编码排序,前面字符的编码一定要比后面字符的编码小
- ret2 = re.findall("[0-9a-z]y","Python3 pth ppython") # [0-9a-z]只占一个字符的位置
- print(ret2)
-
- #####result
- ['Py', 'py']
- []
- ['py']
- ret = re.findall("[^A-z]y","Python3 1yth ppython")
- print(ret)
-
- #######result
- ['1y']
- msg = 'xkjsdfjlaskdf lksdjf aslkfjdsfd'
- print(re.findall("sk|lk|as", msg))
-
- #####result
- ['as', 'lk', 'as', 'lk']
占位符,表示除了换行符以外的任意一个字符
- ret = re.findall("p.thon","Python python pgthon pthon p thon p\nthon")
- print(ret)
-
- #######result
- ['python', 'pgthon', 'p thon']
- # \A 匹配字符串的开始
- # \b 词边界
- # \B 非词边界
- # \w 匹配单词字符(字母、数字、下划线;对str模式,中文等Unicode文字也算单词字符)
- # \W 匹配非单词字符
- # \s 匹配空白字符,如:换行符、制表符
- # \S 匹配非空白字符
- # \d 匹配数字
- # \D 匹配非数字
- import re
- # 正则表达式前面加r:抑制转义
- # 原样交给正则表达式引擎去匹配
- ret = re.findall(r'\bword', "word abcword 中word #word word123") # python写好之后交给正则表达式的独立引擎去执行
- # abcword 会被认为是一个单词
- # 123word ,123是单词字符
- print(ret)
-
- ##### result
- ['word', 'word', 'word'] # 相应匹配的是word #word word123
# 匹配开始:^ # 匹配结束:$
- msg = """Python
- python
- python123
- python345
- """ # 这里实际上是"Python\npython\npython123\npython345\n"
- ret = re.findall("^[Pp]ython",msg)
- print(ret)
-
- #####result
- ['Python']
匹配标志(flags)放在findall的第三个参数这里,多个标志用 | 组合:re.M 多行模式(^和$匹配每一行的开头和结尾),re.I 忽略大小写,re.S 让 . 也能匹配换行符
- import re
-
- msg = """Python
- python
- python123
- python345
- """ # 这里实际上是"Python\npython\npython123\npython345\n"
-
- ret = re.findall("^[Pp]ython",msg,re.M,)
- print(ret)
-
- ret = re.findall("^python",msg,re.M|re.I)
- print(ret)
-
- ret = re.findall("python.*",msg,re.I)
- print(ret)
-
- ret = re.findall("python.*",msg,re.S|re.I)
- print(ret)
-
- ####### result
- ['Python', 'python', 'python', 'python']
- ['Python', 'python', 'python', 'python']
- ['Python', 'python', 'python123', 'python345']
- ['Python\npython\npython123\npython345\n']

- # ? 匹配前一项0或者1次
- ret = re.findall("py?","py p python pyython")
- print(ret)
-
- ######result
- ['py', 'p', 'py', 'py']
- # + 匹配前一项一次以上
- ret = re.findall("py+","py p python pyython")
- print(ret)
-
- ####result
- ['py', 'py', 'pyy']
-
- # * 匹配前一项任意次
- ret = re.findall("py*","py p python pyython")
- print(ret)
-
- #### result
- ['py', 'p', 'py', 'pyy']
- # {n,m} 匹配前一项n到m次,{n,} {,m}
- ret = re.findall("py{2,4}","pyyy py pyyyypyypy")
- print(ret)
-
-
- ###### result
- ['pyyy', 'pyyyy', 'pyy']
思考:当我们匹配第一个字段和第三个字段的时候有"pyyy"和"pyyyy",那么为什么不能匹配到"pyy"就退出呢?
那是因为python的默认模式是贪婪模式。
- import re
- msg = "<div>testdiv>bb<div>tedst2div>"
- ret = re.findall("<div>.*div>",msg) # 贪婪模式
- print(ret)
- ret = re.findall("<div>.*?div>",msg) # 非贪婪模式 + * {n,m} 后面接?都是非贪婪模式
- print(ret)
-
- ######result
- ['<div>testdiv>bb<div>tedst2div>']
- ['<div>testdiv>', '<div>tedst2div>']
当使用分组时,除了可以获得整个匹配,还能够单独获得每一个分组匹配到的内容;使用 () 进行分组
捕获分组和非捕获分组
捕获分组(正则表达式)
分组匹配上之后会把匹配上的数据放到内存中,并给定一个从1开始的索引
- ret = re.search(r"(\d{3})-(\d{3})-(\d{3})","abc123-456-789-aaa")
- print(ret.group())
- print(ret.group(1))
- print(ret.group(2))
- print(ret.group(3))
-
- ####result
- 123-456-789
- 123
- 456
- 789
非捕获分组(?:正则表达式)
只分组,不捕获,不会分配内存和下标去保存
- ret = re.search(r"(\d{3})-(?:\d{3})-(\d{3})","abc123-456-789-aaa")
- print(ret.group())
- print(ret.group(1))
- print(ret.group(2))
-
- #######result
- 123-456-789
- 123
- 789
使用()分组,用\1, \2引用前面捕获的分组(注意:Python正则中\0不表示整个匹配,而是八进制转义;在re.sub的替换串里用\g<0>引用整个匹配)
- msg = "aa bb aa bb"
- print(re.search(r"(\w+)\s(\w+)\s\1\s\2", msg)) # "\1"代表第一个分组
-
- msg = "aa bb aa cc"
- print(re.search(r"(\w+)\s(\w+)\s\1\s\2", msg)) # "\1"代表第一个分组
-
- ##### result
- <_sre.SRE_Match object; span=(0, 11), match='aa bb aa bb'>
- None
- # 使用findall 有捕获分组,只会显示捕获分组的内容
- msg = "aa bb aa bb"
- print(re.findall(r"(\w+)\s(\w+)\s\1\s\2", msg)) # "\1"代表第一个分组
-
- msg = "comyy@xx.comcom123@qq.comaa@126.combbb@163.comcc@abc.com"
- # 把里面123 126 163邮箱捞出来
- # aa@126.com bbb@163.com
- print(re.findall(r"(?:\.com)?(\w+@(?:123|126|163).com)",msg))
-
-
- #####result
- [('aa', 'bb')]
- ['aa@126.com', 'bbb@163.com']
作用:不占用匹配宽度,只用来确定位置
- s = "sc1 hello sc2 hello"
-
- # 匹配后面是sc2的hello
- print(re.findall(r"\w+ hello(?= sc2)",s))
- # 匹配后面不是sc2的hello
- print(re.findall(r"\w+ hello(?! sc2)",s))
-
- # 匹配前面是sc2 的hello
- print(re.findall(r"(?<=sc2 )hello",s))
- # 匹配前面不是sc2 的hello
- print(re.findall(r"(?<!sc2 )hello",s))
-
-
- #####result
- ['sc1 hello']
- ['sc2 hello']
- ['hello']
- ['hello']
- #####截取出下面内容的IP地址
- msg = """
- 1: lo: <LOOPBACK,UP,LOWER_UP> mtu 65536 qdisc noqueue state UNKNOWN group default qlen 1000
- link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
- inet 127.0.0.1/8 scope host lo
- valid_lft forever preferred_lft forever
- inet6 ::1/128 scope host
- valid_lft forever preferred_lft forever
- 2: ens33: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc pfifo_fast state UP group default qlen 1000
- link/ether 00:0c:29:16:6a:88 brd ff:ff:ff:ff:ff:ff
- inet 192.168.29.128/24 brd 192.168.29.255 scope global ens33
- valid_lft forever preferred_lft forever
- inet6 fe80::20c:29ff:fe16:6a88/64 scope link
- valid_lft forever preferred_lft forever
- 3: ens37: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc pfifo_fast state UP group default qlen 1000
- link/ether 00:0c:29:16:6a:92 brd ff:ff:ff:ff:ff:ff
- inet 192.168.29.71/24 brd 192.168.29.255 scope global ens37
- valid_lft forever preferred_lft forever
- inet6 fe80::20c:29ff:fe16:6a92/64 scope link
- valid_lft forever preferred_lft forever
- """
- print(re.findall(r"(?<=inet ).*(?=/24)",msg)[0])
-
-
- ##### result
- 192.168.29.128