网页元素的解析:本章重点介绍如何解析A标签。
网页元素示例(浏览器截取,供参考)
<div class="related-tags">
<span>相关主题推荐:</span>
a<a href='http://www.csdn.net/tag/标签' target="_blank">标签</a>
<a href="http://www.csdn.net/tag/java" target="_blank">java</a>
<a href="http://www.csdn.net/tag/团购" target="_blank">团购</a>
<a href="http://www.csdn.net/tag/体育" target="_blank">体育</a>
<a href="http://www.csdn.net/tag/搜狐" icon='a' target="_blank">搜狐</a>
<img src="/i/eg_tulip2.jpg" alt="上海鲜花港 - 郁金香"><img src="/i/eg_tulip.jpg" alt="上海鲜花港 - 郁金香" /> </div>
网页元素分为对称标签和非对称标签。
对称标签如<a>xxx</a>,非对称标签如<img/>。
(1)因此此处定义为isSingleNode,是否为单节点标签
/**
 * A single HTML element extracted from a page: its tag name, its attributes
 * and, for paired tags, the text found between the opening and closing tag.
 *
 * @author leng
 */
public class WebNode implements Serializable {

    /** Text between the opening and closing tag; stays {@code null} until set. */
    private String content;

    /** Attribute name/value pairs, kept sorted by attribute name. */
    private TreeMap<String, String> params = new TreeMap<>();

    /** Tag name this node was created for, e.g. {@code a} or {@code img}. */
    private String labelType;

    /** {@code true} for a stand-alone tag that has no closing counterpart. */
    private boolean isSingleNode;

    // ----- content -----

    /**
     * @return the text between the opening and closing tag, or {@code null} if none was set
     */
    public String getContent() {
        return this.content;
    }

    /**
     * @param content the text between the opening and closing tag
     */
    public void setContent(String content) {
        this.content = content;
    }

    // ----- attributes -----

    /**
     * @return the attribute map currently held by this node (the live instance, not a copy)
     */
    public TreeMap<String, String> getParams() {
        return this.params;
    }

    /**
     * Replaces the attribute map held by this node.
     *
     * @param params the new attribute map; stored as-is without copying
     */
    public void setParams(TreeMap<String, String> params) {
        this.params = params;
    }

    // ----- tag name -----

    /**
     * @return the tag name, or {@code null} if none was set
     */
    public String getLabelType() {
        return this.labelType;
    }

    /**
     * @param labelType the tag name
     */
    public void setLabelType(String labelType) {
        this.labelType = labelType;
    }

    // ----- single/paired flag -----

    /**
     * @return {@code true} if this node is a stand-alone tag without a closing tag
     */
    public boolean isSingleNode() {
        return this.isSingleNode;
    }

    /**
     * @param isSingleNode {@code true} to mark this node as a stand-alone tag
     */
    public void setSingleNode(boolean isSingleNode) {
        this.isSingleNode = isSingleNode;
    }
}
(2)字符串的操作类,辅助作用
/**
 * String helper methods: emptiness checks, repetition, splitting into
 * collections and maps, and a few small clean-up utilities.
 *
 * @author leng
 */
public class StrUtil {

    /**
     * Tells whether a string is {@code null} or has length zero.
     *
     * @param string the string to test
     * @return {@code true} if {@code string} is {@code null} or {@code ""}
     */
    public static boolean isEmpty(String string) {
        return null == string || string.isEmpty();
    }

    /**
     * Tells whether a string is neither {@code null} nor of length zero.
     *
     * @param string the string to test
     * @return {@code true} if {@code string} contains at least one character
     */
    public static boolean isNotEmpty(String string) {
        return !isEmpty(string);
    }

    /**
     * Repeats {@code string} {@code n} times, joining the copies with {@code seg}.
     *
     * @param string the string to repeat
     * @param n      number of repetitions; zero or negative yields {@code ""}
     * @param seg    separator placed between copies; {@code null} is treated as {@code ""}
     * @return the joined repetitions, without a trailing separator
     */
    public static String repeatString(String string, int n, String seg) {
        if (n <= 0) {
            // Nothing to repeat; also avoids computing a negative substring bound.
            return "";
        }
        String separator = (null == seg) ? "" : seg;
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < n; i++) {
            if (i > 0) {
                sb.append(separator);
            }
            sb.append(string);
        }
        return sb.toString();
    }

    /**
     * Repeats the decimal form of {@code num} {@code n} times, joined with {@code seg}.
     *
     * @param num the number to repeat
     * @param n   number of repetitions; zero or negative yields {@code ""}
     * @param seg separator placed between copies; {@code null} is treated as {@code ""}
     * @return the joined repetitions
     */
    public static String repeatString(int num, int n, String seg) {
        return repeatString(String.valueOf(num), n, seg);
    }

    /**
     * Repeats the decimal form of {@code num} {@code n} times, joined with a comma.
     *
     * @param num the number to repeat
     * @param n   number of repetitions; zero or negative yields {@code ""}
     * @return the comma-joined repetitions
     */
    public static String repeatString(int num, int n) {
        return repeatString(String.valueOf(num), n, ",");
    }

    /**
     * Joins the non-null elements of an {@code Integer} array with commas.
     *
     * @param intArray the values to join; may be {@code null}
     * @return the comma-separated values, or {@code null} when the array is
     *         {@code null} or contains no non-null element
     */
    public static String IntArrayToString(Integer[] intArray) {
        if (null == intArray) {
            return null;
        }
        StringBuilder sb = new StringBuilder();
        for (Integer a : intArray) {
            if (null != a) {
                sb.append(",").append(a);
            }
        }
        // Every appended value is preceded by a comma, so drop the first one.
        return sb.length() > 0 ? sb.substring(1) : null;
    }

    /**
     * Tells whether {@code str} occurs in {@code array}.
     *
     * @param array      the strings to search; {@code null} elements are skipped
     * @param str        the string to look for
     * @param ignoreCase whether the comparison ignores case
     * @return {@code true} if a matching element exists; {@code false} otherwise,
     *         including when {@code array} or {@code str} is {@code null}
     */
    public static boolean isExist(String[] array, String str, boolean ignoreCase) {
        if (null == array || null == str) {
            return false;
        }
        for (String s : array) {
            if (null == s) {
                continue;
            }
            boolean matches = ignoreCase ? s.equalsIgnoreCase(str) : s.equals(str);
            if (matches) {
                return true;
            }
        }
        return false;
    }

    /**
     * Splits {@code string} on {@code splitStr} and converts the pieces to integers.
     * Empty pieces and pieces that are not valid integers are skipped.
     *
     * @param string   the text to split
     * @param splitStr the delimiter, interpreted as a regular expression by {@link String#split(String)}
     * @return the parsed integers in order of appearance, or {@code null} when
     *         {@code string} is {@code null} or empty
     */
    public static List<Integer> parseList(String string, String splitStr) {
        if (isEmpty(string)) {
            return null;
        }
        String[] parts = string.split(splitStr);
        List<Integer> list = new ArrayList<>(parts.length);
        for (String part : parts) {
            if (isEmpty(part)) {
                continue;
            }
            try {
                list.add(Integer.parseInt(part));
            } catch (NumberFormatException ignored) {
                // Best-effort parsing: pieces that are not integers are dropped.
            }
        }
        return list;
    }

    /**
     * Splits {@code string} on commas and converts the pieces to integers.
     *
     * @param string the text to split
     * @return the parsed integers, or {@code null} when {@code string} is {@code null} or empty
     */
    public static List<Integer> parseList(String string) {
        return parseList(string, ",");
    }

    /**
     * Splits {@code string} on {@code splitStr} and collects the integer pieces into a set.
     * Empty pieces and pieces that are not valid integers are skipped.
     *
     * @param string   the text to split
     * @param splitStr the delimiter, interpreted as a regular expression by {@link String#split(String)}
     * @return the distinct parsed integers, or {@code null} when {@code string} is
     *         {@code null} or empty
     */
    public static Set<Integer> parseSet(String string, String splitStr) {
        List<Integer> parsed = parseList(string, splitStr);
        return null == parsed ? null : new HashSet<>(parsed);
    }

    /**
     * Splits {@code string} on commas and collects the integer pieces into a set.
     *
     * @param string the text to split
     * @return the distinct parsed integers, or {@code null} when {@code string} is {@code null} or empty
     */
    public static Set<Integer> parseSet(String string) {
        return parseSet(string, ",");
    }

    /**
     * Splits {@code string} on {@code splitStr} and collects the pieces into a set.
     *
     * @param string   the text to split
     * @param splitStr the delimiter, interpreted as a regular expression by {@link String#split(String)}
     * @return the distinct pieces, or {@code null} when {@code string} is {@code null} or empty
     */
    public static Set<String> parseSetString(String string, String splitStr) {
        if (isEmpty(string)) {
            return null;
        }
        Set<String> set = new HashSet<>();
        for (String part : string.split(splitStr)) {
            set.add(part);
        }
        return set;
    }

    /**
     * Converts a delimited string into a map. For example, with
     * {@code string="1-2,3-4,5-6"}, {@code seg1=","} and {@code seg2="-"} the
     * result is {@code {1=2, 3=4, 5=6}}. An entry without a second part maps
     * its key to {@code null}; parts beyond the second are ignored.
     *
     * @param string the text to split
     * @param seg1   delimiter between entries (regular expression)
     * @param seg2   delimiter between key and value (regular expression)
     * @return the resulting map, or {@code null} when {@code string} is {@code null} or empty
     */
    public static Map<String, String> stringToMap(String string, String seg1, String seg2) {
        if (isEmpty(string)) {
            return null;
        }
        String[] entries = string.split(seg1);
        Map<String, String> map = new HashMap<>(entries.length);
        for (String entry : entries) {
            if (isEmpty(entry)) {
                continue;
            }
            String[] pair = entry.split(seg2);
            if (pair.length == 1) {
                map.put(pair[0], null);
            } else if (pair.length >= 2) {
                map.put(pair[0], pair[1]);
            }
        }
        return map;
    }

    /**
     * Converts a string such as {@code "1-2,3-4,5-6"} into the map {@code {1=2, 3=4, 5=6}}.
     *
     * @param string the text to split
     * @return the resulting map, or {@code null} when {@code string} is {@code null} or empty
     */
    public static Map<String, String> stringToMap(String string) {
        return stringToMap(string, ",", "-");
    }

    /**
     * Removes one leading comma, if present.
     *
     * @param str the text to clean
     * @return {@code str} without its leading comma, {@code str} unchanged when it
     *         does not start with a comma, or {@code null} when {@code str} is {@code null}
     */
    public static String repaceFirstComma(String str) {
        if (null == str) {
            return null;
        }
        return str.startsWith(",") ? str.substring(1) : str;
    }

    /**
     * Converts a string array to an {@code int} array, skipping {@code null}
     * and empty elements.
     *
     * @param array the strings to convert; {@code null} yields an empty array
     * @return a new array holding the parsed values in order
     * @throws NumberFormatException if a non-empty element is not a valid integer
     */
    public static int[] strArrayToIntArray(String[] array) {
        if (null == array) {
            return new int[0];
        }
        // Fill a buffer sized for the worst case, then trim once at the end.
        int[] buffer = new int[array.length];
        int count = 0;
        for (String s : array) {
            if (isNotEmpty(s)) {
                buffer[count++] = Integer.parseInt(s);
            }
        }
        return Arrays.copyOf(buffer, count);
    }

    /**
     * Removes the spaces that directly precede or follow every occurrence of
     * {@code specialStr}. {@code specialStr} is matched literally, so
     * characters such as {@code (}, {@code *} or {@code $} need no escaping.
     *
     * @param string     the text to clean
     * @param specialStr the marker whose surrounding spaces are removed
     * @return the cleaned text, or {@code null} when {@code string} is {@code null}
     */
    public static String trim(String string, String specialStr) {
        if (null == string) {
            return null;
        }
        // Each pass strips one space per occurrence; loop until none remain.
        String spaceBefore = " " + specialStr;
        while (string.contains(spaceBefore)) {
            string = string.replace(spaceBefore, specialStr);
        }
        String spaceAfter = specialStr + " ";
        while (string.contains(spaceAfter)) {
            string = string.replace(spaceAfter, specialStr);
        }
        return string;
    }

    /**
     * Entry point kept so the class remains launchable; it performs no work.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // Intentionally empty.
    }
}
(3)核心的解析
/**
 * Lightweight, regex-based helpers for pulling the title and simple tags out
 * of an HTML string. This is not a general HTML parser: tags nested inside a
 * tag of the same name, content spanning several lines and attribute values
 * containing {@code >} are not supported.
 *
 * @author leng
 */
public class WebpageUtil {

    private static final String TITLE_OPEN = "<title>";
    private static final String TITLE_CLOSE = "</title>";

    /**
     * One {@code name=value} attribute. Group 1 is the name; the value is in
     * group 2 (double-quoted), group 3 (single-quoted) or group 4 (unquoted).
     */
    private static final Pattern ATTRIBUTE = Pattern.compile(
            "([^\\s=<>\"'/]+)\\s*=\\s*(?:\"([^\"]*)\"|'([^']*)'|([^\\s\"'>]+))");

    /**
     * Extracts the text between the first {@code <title>} and the
     * {@code </title>} that follows it. The lookup is case-sensitive.
     *
     * @param html the page source; may be {@code null}
     * @return the title text, or {@code null} when {@code html} is {@code null}
     *         or does not contain a complete title element
     */
    public static String getTitleFromHtmlString(String html) {
        if (null == html) {
            return null;
        }
        int startIndex = html.indexOf(TITLE_OPEN);
        if (startIndex == -1) {
            return null;
        }
        int contentStart = startIndex + TITLE_OPEN.length();
        // Search for the closing tag only after the opening one so the
        // substring bounds can never be inverted.
        int endIndex = html.indexOf(TITLE_CLOSE, contentStart);
        if (endIndex == -1) {
            return null;
        }
        return html.substring(contentStart, endIndex);
    }

    /**
     * Finds every occurrence of the given tag and returns the matched markup.
     * If the page contains a closing tag for {@code nodeType}, elements are
     * matched as opening tag, content and closing tag; otherwise only the
     * stand-alone tag is matched. Tags nested inside a tag of the same name
     * are not supported.
     *
     * @param html     the page source
     * @param nodeType the tag name, e.g. {@code a} or {@code img}; matched case-insensitively
     * @return the trimmed markup of each match in document order; empty when
     *         nothing matches or an argument is {@code null} or empty
     */
    public static List<String> getNode(String html, String nodeType) {
        List<String> list = new ArrayList<>();
        if (null == html || null == nodeType || nodeType.isEmpty()) {
            return list;
        }
        Matcher m = tagPattern(nodeType, hasClosingTag(html, nodeType)).matcher(html);
        while (m.find()) {
            list.add(m.group(0).trim());
        }
        return list;
    }

    /**
     * Finds every occurrence of the given tag and returns it as a parsed
     * {@link WebNode} carrying the tag name, its {@code name=value}
     * attributes and, for paired tags, the content between the tags.
     * Attributes without a value are ignored.
     *
     * @param html      the page source
     * @param labelType the tag name, e.g. {@code a} or {@code img}; matched case-insensitively
     * @return one node per match in document order; empty when nothing
     *         matches or an argument is {@code null} or empty
     */
    public static List<WebNode> getNode2(String html, String labelType) {
        List<WebNode> list = new ArrayList<>();
        if (null == html || null == labelType || labelType.isEmpty()) {
            return list;
        }
        boolean isSingleNode = !hasClosingTag(html, labelType);
        Matcher m = tagPattern(labelType, !isSingleNode).matcher(html);
        while (m.find()) {
            WebNode node = new WebNode();
            node.setLabelType(labelType);
            node.setSingleNode(isSingleNode);
            if (!isSingleNode) {
                // Group 2 only exists in the paired-tag pattern.
                node.setContent(m.group(2));
            }
            node.setParams(parseAttributes(m.group(1)));
            list.add(node);
        }
        return list;
    }

    /** Tells whether {@code html} contains a closing tag for {@code tag}, ignoring case. */
    private static boolean hasClosingTag(String html, String tag) {
        return Pattern.compile("</" + Pattern.quote(tag) + "\\s*>", Pattern.CASE_INSENSITIVE)
                .matcher(html)
                .find();
    }

    /**
     * Builds the case-insensitive pattern for {@code tag}. Group 1 is the raw
     * attribute text; for paired tags group 2 is the element content.
     */
    private static Pattern tagPattern(String tag, boolean paired) {
        String name = Pattern.quote(tag);
        // \b keeps "<a" from also matching longer names such as "<abbr".
        String regex = paired
                ? "<" + name + "\\b([^>]*)>(.*?)</" + name + "\\s*>"
                : "<" + name + "\\b([^>]*?)/?>";
        return Pattern.compile(regex, Pattern.CASE_INSENSITIVE);
    }

    /** Parses the raw attribute text of a tag into a name-sorted map; quoted values are unquoted and trimmed. */
    private static TreeMap<String, String> parseAttributes(String attributeText) {
        TreeMap<String, String> params = new TreeMap<>();
        Matcher m = ATTRIBUTE.matcher(attributeText);
        while (m.find()) {
            String value;
            if (m.group(2) != null) {
                value = m.group(2).trim();
            } else if (m.group(3) != null) {
                value = m.group(3).trim();
            } else {
                value = m.group(4);
            }
            params.put(m.group(1), value);
        }
        return params;
    }
}
(4)测试
/**
 * Demo entry point: extracts the {@code <a>} tags from a small HTML sample
 * and prints first the raw matches and then the parsed nodes.
 *
 * @param args unused
 */
public static void main(String[] args) {
    // Sample markup: a tag container holding five links (mixed quote styles,
    // one extra attribute) followed by two images.
    String str = "<div class=\"related-tags\">"
            + "<span>相关主题推荐:</span>"
            + "a<a href='http://www.csdn.net/tag/标签' target=\"_blank\">标签</a>"
            + "<a href=\"http://www.csdn.net/tag/java\" target=\"_blank\">java</a>"
            + "<a href=\"http://www.csdn.net/tag/团购\" target=\"_blank\">团购</a>"
            + "<a href=\"http://www.csdn.net/tag/体育\" target=\"_blank\">体育</a>"
            + "<a href=\"http://www.csdn.net/tag/搜狐\" icon='a' target=\"_blank\">搜狐</a>"
            + "<img src=\"/i/eg_tulip2.jpg\" alt=\"上海鲜花港 - 郁金香\">"
            + "<img src=\"/i/eg_tulip.jpg\" alt=\"上海鲜花港 - 郁金香\" /> </div>";

    List<String> list = WebpageUtil.getNode(str, "a");
    System.out.println("-------a标签原内容------");
    for (String raw : list) {
        System.out.println(raw);
    }

    List<WebNode> list2 = WebpageUtil.getNode2(str, "a");
    System.out.println("-------解析后的数据------");
    for (WebNode node : list2) {
        System.out.println("内容:" + node.getContent());
        System.out.println("标签类型:" + node.getLabelType());
        System.out.println("标签的参数:" + node.getParams());
    }
}
执行后输出,可以看到网页div里的a标签信息被解析出来。