1:限制长度(取95位)
// Keep at most the first 95 characters (code points) of result.
// The previous regex "^(.{95}).*$" depended on '.', which does not match line
// terminators, so text containing a line break was silently left untruncated.
if (result.codePointCount(0, result.length()) > 95) {
    // offsetByCodePoints keeps surrogate pairs intact, matching how regex '.' counts.
    result = result.substring(0, result.offsetByCodePoints(0, 95));
}
2:过滤html标签
public static String delHTMLTag(String htmlStr) { String regEx_script = "]*?>[\\s\\S]*?<\\/script>"; // 定义script的正则表达式 String regEx_style = "]*?>[\\s\\S]*?<\\/style>"; // 定义style的正则表达式 String regEx_html = "<[^>]+>"; // 定义HTML标签的正则表达式 String regEx_space = "\\s*|\t|\r|\n";// 定义空格回车换行符 String regEx_w = "]*?>[\\s\\S]*?<\\/w[^>]*?>";//定义所有w标签 Pattern p_w = Pattern.compile(regEx_w, Pattern.CASE_INSENSITIVE); Matcher m_w = p_w.matcher(htmlStr); htmlStr = m_w.replaceAll(""); // 过滤script标签 Pattern p_script = Pattern.compile(regEx_script, Pattern.CASE_INSENSITIVE); Matcher m_script = p_script.matcher(htmlStr); htmlStr = m_script.replaceAll(""); // 过滤script标签 Pattern p_style = Pattern.compile(regEx_style, Pattern.CASE_INSENSITIVE); Matcher m_style = p_style.matcher(htmlStr); htmlStr = m_style.replaceAll(""); // 过滤style标签 Pattern p_html = Pattern.compile(regEx_html, Pattern.CASE_INSENSITIVE); Matcher m_html = p_html.matcher(htmlStr); htmlStr = m_html.replaceAll(""); // 过滤html标签 Pattern p_space = Pattern.compile(regEx_space, Pattern.CASE_INSENSITIVE); Matcher m_space = p_space.matcher(htmlStr); htmlStr = m_space.replaceAll(""); // 过滤空格回车标签 htmlStr = htmlStr.replaceAll(" ", ""); //过滤 return htmlStr.trim(); // 返回文本字符串 }
3: [\u4e00-\u9fa5_a-zA-Z0-9] 匹配中文、字母、数字和下划线
4:保留日期格式的数据
^: 位于 [] 开头时表示取反,即匹配不在方括号内的任意字符
// Keep only digits, the date/time markers 年 月 日 时, '_' and '-'; remove every other character.
// Inside a character class '|' is a literal rather than alternation, so the former
// "||" made the pattern keep pipe characters as well; it has been removed.
result = Pattern.compile("[^0-9年月日时_-]").matcher(result).replaceAll("");
5:
舍去<后边(右边)的内容(包括<)
result = result.split("<")[0];
舍去<前边(左边)的内容(包括<)
result = result.split("<")[1];
6: 舍去日后边(右边)的内容(保留日)
result = result.replaceAll("日(.*)","日");
7:selector的使用
/**
 * Extracts text from the element(s) selected by the CSS selector {@code #admin}.
 *
 * @param html the HTML source to parse
 * @return the text of the last matching element, or an empty string when nothing matches
 */
public static String rules1(String html) {
    Elements matches = Jsoup.parse(html).select("#admin");
    if (matches.isEmpty()) {
        return "";
    }
    // When several elements match, the last one wins.
    return matches.get(matches.size() - 1).text();
}
8: xpath的使用
public static String rules1(String html){ String result =""; String xpath="//*[@id="admin"]"; JXDocument jxDocument =new JXDocument(html); List rs = jxDocument.sel(xpath); for (Object o:rs){ if (o instanceof Element){ int index = ((Element) o).siblingIndex(); } result = o.toString(); } return result; }