很少用正则表达式(为什么?不知道,但很有用),每次用到总是要重新查,然后重新试验后使用。
整理Python正则表达式帮助内容,学习和理解,蓝色标记及空白有待完善和补充。
This module exports the following functions, 函数 | s = 'abca' | |||
标识 | 英文描述 | 中文描述 | 用法示例 | |
match | Match a regular expression pattern to the beginning of a string. | 从字符串开始(第一个字符)匹配模式串 | match(pattern, string, flags=0) | re.match('a',s) >>> <re.Match object; span=(0, 1), match='a'> |
fullmatch | Match a regular expression pattern to all of a string. | 判断整个字符串是否与模式串完全匹配 | fullmatch(pattern, string, flags=0) | re.fullmatch('abca',s) >>> <re.Match object; span=(0, 4), match='abca'> re.fullmatch('abc',s) >>> None |
search | Search a string for the presence of a pattern. | 在字符串任意位置搜索模式串,返回第一个匹配 | search(pattern, string, flags=0) | re.search('a',s) >>> <re.Match object; span=(0, 1), match='a'> re.search('bc',s) >>> <re.Match object; span=(1, 3), match='bc'> re.search('d',s) >>> None |
sub | Substitute occurrences of a pattern found in a string. | 替换字符串中匹配的模式串,默认全部 count设定匹配替换的个数 | sub(pattern, repl, string, count=0, flags=0) | re.sub('a','A',s) >>> AbcA re.sub('a','A',s,1) >>> Abca re.sub('abc','ABC',s) >>> ABCa re.sub('d','A',s) >>> abca |
subn | Same as sub, but also return the number of substitutions made. | 同sub,同时返回实际替换的次数,返回元组 (新字符串, 替换次数) | subn(pattern, repl, string, count=0, flags=0) | re.subn('a','A',s) >>> ('AbcA', 2) re.subn('a','A',s)[1] >>> 2 re.subn('d','A',s) >>> ('abca', 0) |
split | Split a string by the occurrences of a pattern. | 按模式拆分字符串,匹配的模式串显示为空,返回列表 maxsplit设定匹配和拆分的个数 | split(pattern, string, maxsplit=0, flags=0) | re.split('a',s) >>> ['', 'bc', ''] re.split('a',s,1) >>> ['', 'bca'] |
findall | Find all occurrences of a pattern in a string. | 按模式返回字符串中所有匹配 | findall(pattern, string, flags=0) | re.findall('a',s) >>> ['a', 'a'] |
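finditer | Return an iterator yielding a Match object for each match. | 同findall,但返回逐个产生match对象的迭代器(可用list()转换为列表) | finditer(pattern, string, flags=0) | re.finditer('a',s) >>> <callable_iterator object at 0x...> list(re.finditer('a',s)) >>> [<re.Match object; span=(0, 1), match='a'>, <re.Match object; span=(3, 4), match='a'>] |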
compile | Compile a pattern into a Pattern object. | 创建模式串对象 | compile(pattern, flags=0) | mo = re.compile('a') re.findall(mo,s) >>> ['a', 'a'] |
purge | Clear the regular expression cache. | 清空正则表达式缓存 | purge() | re 模块会缓存最近编译过的模式串,purge() 用于清空该缓存;一般无需手动调用 |
escape | Backslash all non-alphanumerics in a string. | 转义,为非字母数字字符添加反斜杠(Python 3.7 起只转义在正则表达式中有特殊含义的字符) | escape(pattern) | re.escape('\\') >>> '\\\\' re.escape('a') >>> 'a' re.escape('1') >>> '1' re.escape('#') >>> '\\#' |
The special characters are,特殊字符 | s = 'abcdaabc \n reg' | |||
标识 | 英文描述 | 中文描述 | 用法示例 | |
. | Matches any character except a newline. | 匹配除\n换行字符外所有字符串,返回list | re.findall('.',s) >>> ['a', 'b', 'c', 'd', 'a', 'a', 'b', 'c', ' ', ' ', 'r', 'e', 'g'] | |
^ | Matches the start of the string. | 从字符串开始匹配 | re.findall('^a',s) >>> ['a'] re.findall('^b',s) >>> [] | |
$ | Matches the end of the string or just before the newline at the end of the string. | 匹配字符串结尾(或字符串末尾换行符之前的位置) | re.findall('g$',s) >>> ['g'] re.findall('a$',s) >>> [] | |
* | Matches 0 or more (greedy) repetitions of the preceding RE. | 匹配0或更多重复字符(*前一字符或字符段) | re.findall('a*',s) # 匹配包含非a 或 1个a 或 多个a的子字符串 >>> ['a', '', '', '', 'aa', '', '', '', '', '', '', '', '', ''] re.findall('abcc*',s) >>> ['abc', 'abc'] | |
+ | Matches 1 or more (greedy) repetitions of the preceding RE. | 匹配1或更多重复字符(+前一字符或字符段) | re.findall('a+',s) # 匹配包含a 或 多个a的子字符串 >>> ['a', 'aa'] re.findall('abcc+',s) >>> [] | |
? | Matches 0 or 1 (greedy) of the preceding RE. | 匹配0或1个字符(?前一字符或字符段) | re.findall('a?',s) # 匹配1个a 或 非a的子字符串 >>> ['a', '', '', '', 'a', 'a', '', '', '', '', '', '', '', '', ''] re.findall('abcc?',s) >>> ['abc', 'abc'] | |
*?,+?,?? | Non-greedy versions of the previous three special characters. | 以?结束贪婪匹配(只匹配最少字符) | re.findall('a*?b',s) >>> ['ab', 'aab'] re.findall('a+?b',s) >>> ['ab', 'aab'] re.findall('a??b',s) >>> ['ab', 'ab'] | |
{m,n} | Matches from m to n repetitions of the preceding RE. | 匹配一个字符m到n次重复(前一字符或字符段) | re.findall('abc{0,0}',s) >>> ['ab', 'ab'] re.findall('abc{0,1}',s) >>> ['abc', 'abc'] | |
{m,n}? | Non-greedy version of the above. | 以?结束贪婪(只匹配最少字符),匹配指定长度 | re.findall('abc{0,1}?',s) >>> ['ab', 'ab'] re.findall('abc{0,1}?d',s) >>>['abcd'] | |
\\ | Either escapes special characters or signals a special sequence. | 匹配特殊字符或特殊序列 | re.findall('\\n',s) >>> ['\n'] | |
[] | Indicates a set of characters.A "^" as the first character indicates a complementing set. | 将[]中的字符串中的每个字符分别匹配,不包含特殊字符 第一个字符^ 非,不匹配^后的每个字符的模式 | re.findall('[a*b\n]',s) >>> ['a', 'b', 'a', 'a', 'b', '\n'] re.findall('[^a*b\n]',s) >>> ['c', 'd', 'c', ' ', ' ', 'r', 'e', 'g'] | |
| | A|B, creates an RE that will match either A or B. | or,匹配多种不同模式串 | re.findall('a*?b|\\n',s) >>> ['ab', 'aab', '\n'] | |
(...) | Matches the RE inside the parentheses.The contents can be retrieved or matched later in the string. | 按()中的模式串进行匹配并分组(捕获),()之外的部分仍参与匹配,但不会被捕获返回 | re.findall('(a*)(bc+)(d?)',s) >>> [('a', 'bc', 'd'), ('aa', 'bc', '')] | |
(?aiLmsux) | The letters set the corresponding flags defined below. | |||
(?:...) | Non-grouping version of regular parentheses. | |||
(?P<name>...) | The substring matched by the group is accessible by name. | 通过名称访问组匹配模式串 | re.search('(?P<TEST>.*)',s).groupdict() >>> {'TEST': 'abcdaabc '} | |
(?P=name) | Matches the text matched earlier by the group named name. | 匹配与前面名为name的组所匹配内容相同的文本 | re.search(r'(?P<abc>\w*)(?P=abc)',s).groupdict() >>> {'abc': ''} | |
(?#...) | A comment; ignored. | 注释 | re.search('(\w*)',s).group() >>> abcdaabc re.search('(?#Comment:\w*)',s).group() >>> | |
(?=...) | Matches if ... matches next, but doesn't consume the string. | 前向肯定断言:当前位置之后能匹配...时才匹配,断言本身不消耗字符 | re.findall('(?=a)\w*',s) >>> ['abcdaabc'] re.findall('(?<=a)\w*',s) # 对比:后向断言(?<=a)的结果不包含a本身 >>> ['bcdaabc'] | |
(?!...) | Matches if ... doesn't match next. | 前向否定断言:当前位置之后不能匹配...时才匹配,断言本身不消耗字符 | re.findall('(?! )\w*',s) >>> ['abcdaabc', '', 'reg', ''] re.findall('(?<! )\w*',s) # 对比:后向否定断言 >>> ['abcdaabc', '', '', 'eg', ''] | |
(?<=...) | Matches if preceded by ... (must be fixed length). | 后向肯定断言:当前位置之前能匹配...时才匹配(...必须为固定长度),断言本身不消耗字符 | re.findall('(?<= )',s) # preceded by ' '(blank) >>> ['', ''] re.findall('(?<= )\w*',s) # preceded by ' '(blank) and followed by 数字、字母、下划线 >>> ['', 'reg'] | |
(?<!...) | Matches if not preceded by ... (must be fixed length). | 后向否定断言:当前位置之前不能匹配...时才匹配(...必须为固定长度),断言本身不消耗字符 | re.findall('(?<! )',s) # not preceded by ' '(blank) >>> ['', '', '', '', '', '', '', '', '', '', '', '', ''] re.findall('(?<! )\w*',s) # not preceded by ' '(blank) and followed by 数字、字母、下划线 >>> ['abcdaabc', '', '', 'eg', ''] | |
(?(id/name)yes|no) | Matches yes pattern if the group with id/name matched,the (optional) no pattern otherwise. |
The special sequences consist of "\\" and a character from the list below. If the ordinary character is not on the list, then the resulting RE will match the second character. | s = '0123456789 this is regular tist express!' | |||
标识 | 英文描述 | 中文描述 | 用法示例 | |
\number | Matches the contents of the group of the same number. | ()对字符串分组,在分组中使用\num表示引用第num个模式 | s = '122211' re.findall(r'(\d{1})(\d{1})(\d{1})',s) #分3组,每组1个数字 >>> [('1', '2', '2'), ('2', '1', '1')] re.findall(r'(\d{1})(\d{1})(\1)',s) #分3组,每组1个数字,第3组内容与第1组一致 >>> [('2', '2', '2')] | |
\A | Matches only at the start of the string. | 匹配以模式开始的字符串,^ | re.findall(r'\A.',s) >>> ['0'] | |
\Z | Matches only at the end of the string. | 匹配以模式结束的字符串,$ | re.findall(r'.\Z',s) >>> ['!'] | |
\b | Matches the empty string, but only at the start or end of a word. | 匹配单词边界(零宽度,即\w与\W之间或字符串首尾的位置,不消耗字符) | re.findall(r'is\b',s) >>> ['is', 'is'] re.findall(r'\bis\b',s) >>> ['is'] re.findall(r'.is\b',s) >>> ['his', ' is'] | |
\B | Matches the empty string, but not at the start or end of a word. | 匹配非单词边界(零宽度,即不处于单词开头或结尾的位置,不消耗字符) | re.findall(r'.is\B.',s) >>> ['tist'] | |
\d | Matches any decimal digit; equivalent to the set [0-9] in bytes patterns or string patterns with the ASCII flag. In string patterns without the ASCII flag, it will match the whole range of Unicode digits. | 匹配任意数字,等价于 [0-9] | re.findall(r'\d',s) >>> ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9'] | |
\D | Matches any non-digit character; equivalent to [^\d]. | 匹配任意非数字 | re.findall(r'\D',s) >>> [' ', 't', 'h', 'i', 's', ' ', 'i', 's', ' ', 'r', 'e', 'g', 'u', 'l', 'a', 'r', ' ', 't', 'i', 's', 't', ' ', 'e', 'x', 'p', 'r', 'e', 's', 's', '!'] | |
\s | Matches any whitespace character; equivalent to [ \t\n\r\f\v] in bytes patterns or string patterns with the ASCII flag. In string patterns without the ASCII flag, it will match the whole range of Unicode whitespace characters. | 匹配任意空白字符,等价于 [ \t\n\r\f\v] | re.findall(r'\s',s) >>> [' ', ' ', ' ', ' ', ' '] re.findall(r'\s[a-z]*',s) >>> [' this', ' is', ' regular', ' tist', ' express'] | |
\S | Matches any non-whitespace character; equivalent to [^\s]. | 匹配任意非空字符 | re.findall(r'\S',s) >>> ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 't', 'h', 'i', 's', 'i', 's', 'r', 'e', 'g', 'u', 'l', 'a', 'r', 't', 'i', 's', 't', 'e', 'x', 'p', 'r', 'e', 's', 's', '!'] | |
\w | Matches any alphanumeric character; equivalent to [a-zA-Z0-9_]in bytes patterns or string patterns with the ASCII flag.In string patterns without the ASCII flag, it will match the range of Unicode alphanumeric characters (letters plus digits plus underscore).With LOCALE, it will match the set [0-9_] plus characters defined as letters for the current locale. | 匹配字母数字及下划线 | re.findall(r'\s\w*',s) >>> [' this', ' is', ' regular', ' tist', ' express'] re.findall(r'\w',s) >>> ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 't', 'h', 'i', 's', 'i', 's', 'r', 'e', 'g', 'u', 'l', 'a', 'r', 't', 'i', 's', 't', 'e', 'x', 'p', 'r', 'e', 's', 's'] | |
\W | Matches the complement of \w. | 匹配非字母数字及下划线 | re.findall(r'\W',s) >>> [' ', ' ', ' ', ' ', ' ', '!'] re.findall(r'\S\W',s) >>> ['9 ', 's ', 's ', 'r ', 't ', 's!'] | |
\\ | Matches a literal backslash. | 匹配反斜杠 | s = 'abc\\' re.findall(r'\\',s) >>> ['\\'] | |
说明 | ||||
Greedy means that it will match as many repetitions as possible. | 贪婪意味着尽可能多地匹配重复 | |||
pattern | 模式串,要匹配的正则表达式 | |||
flag | 标志位,控制正则表达式的匹配方式 | 1. re.I(re.IGNORECASE): 忽略大小写 2. re.M(re.MULTILINE): 多行模式,改变'^'和'$'的行为 3. re.S(re.DOTALL): 点任意匹配模式,使'.'也能匹配换行符 4. re.L(re.LOCALE): 使 \w \W \b \B 及忽略大小写匹配取决于当前区域设定(Python 3 中仅适用于bytes模式串) 5. re.U(re.UNICODE): 使预定字符类 \w \W \b \B \s \S \d \D 取决于unicode定义的字符属性(Python 3 中str模式串的默认行为) 6. re.X(re.VERBOSE): 详细模式。这个模式下正则表达式可以是多行,忽略空白字符,并可以加入注释 | ||
\ | 使用转义字符'\'时,由于普通字符串会先对反斜杠进行转义,因此要使用 r'\b' 格式(增加 r 标识的原始字符串),以满足正则表达式的模式匹配 |
- import re
-
- s = '61052420220129128X'
-
- idcard = re.findall(r'(?P<省>\d{2})(?P<市>\d{2})(?P<县>\d{2})(?P<年>\d{4})(?P<月>\d{2})(?P<日>\d{2})(?P<派出所>\d{2})(?P<性别>\d{1})(?P<校验码>\d{1}|\D{1})',s)
- print("FindAll: ",idcard)
-
- idcard = re.search(r'(?P<省>\d{2})(?P<市>\d{2})(?P<县>\d{2})(?P<年>\d{4})(?P<月>\d{2})(?P<日>\d{2})(?P<派出所>\d{2})(?P<性别>\d{1})(?P<校验码>\d{1}|\D{1})',s).groupdict()
- print("Search1: ",idcard)
-
- print("Search2: ")
- for i in idcard.items():
- print(i[0],": ",i[1])
-
- >>>>>>>>>>>>>>>>>>>>>
- FindAll: [('61', '05', '24', '2022', '01', '29', '12', '8', 'X')]
-
- Search1: {'省': '61', '市': '05', '县': '24', '年': '2022', '月': '01', '日': '29', '派出所': '12', '性别': '8', '校验码': 'X'}
-
- Search2:
- 省 : 61
- 市 : 05
- 县 : 24
- 年 : 2022
- 月 : 01
- 日 : 29
- 派出所 : 12
- 性别 : 8
- 校验码 : X
-
参考: