import requests
from re import findall
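# 1. Regular expression for a year-month-day date (YYYY-MM-DD):
# (1|2)[0-9][0-9][0-9]-(0[1-9]|1[012]|[1-9])-(0[1-9]|1[0-9]|2[0-9]|3[01]|[1-9])
# 2. Five- and seven-character verse in "Dream of the Red Chamber" (《红楼梦》).
# The text is taken from the web page of Chapter 1;
# the URL passed to get() below is the address of that chapter.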
def _quatrain_pattern(length):
    """Return a regex matching four consecutive verse lines.

    Each verse line is exactly ``length`` Chinese characters
    (U+4E00..U+9FA5).  The first line must be followed by a comma or
    question mark, the second and third by a comma, question mark or
    full stop, and the fourth by a question mark or full stop.  Both
    the ASCII and the full-width form of each mark are accepted.
    """
    line = r'[\u4e00-\u9fa5]{%d}' % length
    return (line + r'[,，?？]'
            + line + r'[,，?？。]'
            + line + r'[,，?？。]'
            + line + r'[?？。]')


# Download the page for Chapter 1.  A timeout keeps the script from hanging
# on a stalled connection, and raise_for_status() stops it from parsing an
# HTTP error page as if it were the chapter text.
response = requests.get(
    'https://www.xyyuedu.com/gdmz/sidamingzhu/hlmeng/21651.html',
    timeout=30,
)
response.raise_for_status()

# Decode the raw bytes as GBK directly.  Going through
# .text.encode("ISO-8859-1") only works when requests happened to fall back
# to ISO-8859-1 and raises UnicodeEncodeError otherwise.
string = response.content.decode('gbk')  # string holds the page content

# Capture the text in front of every line break.
# NOTE(review): this pattern was previously split across two physical lines
# inside one raw-string literal (a SyntaxError); the break is now written as
# an explicit "\n".  Its delimiters look like they may have been lost (HTML
# tags such as a paragraph marker?) -- confirm against the page markup.
strings = findall(r'((?:.|\n)*?)\n', string)

# Keep only runs of Chinese characters plus the punctuation directly after
# them; markup, Latin text and whitespace are discarded.  The pieces are
# joined with "\n" so punctuation at the start of one piece is never glued
# onto the Chinese text that ended the previous piece.
str1 = findall(r'[\u4e00-\u9fa5]+[?？。,，:：\-;；]*', '\n'.join(strings))
str2 = ''.join(str1)
print(str2)

# Five-character verse (four lines of five characters each).
five = _quatrain_pattern(5)
fiveCharacter = findall(five, str2)
print(fiveCharacter)

# Seven-character verse (four lines of seven characters each).
seven = _quatrain_pattern(7)
sevenCharacter = findall(seven, str2)
print(sevenCharacter)