基于正则表达式的Java爬虫项目
需求分析:抓取新闻网前100条新闻标题以及对应的网页新闻的链接
编者这里以齐鲁工业大学校园新闻网为示例,利用Java网络编程、多线程、正则表达式来实现对于新闻内容的抓取。(注:由于校园网限制,不连接齐鲁工业大学校园网时可能暂时无法抓取全部内容,或抓取的内容存在缺失,这属于正常情况)
源代码示例:
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Regex-based news crawler for the QLU campus news site.
 *
 * <p>Fetches the paginated news list (gdyw/list{n}.htm), extracts each item's
 * link and title with a regex, then fetches each article page and extracts its
 * body paragraphs. Work is spread over a small fixed thread pool, each worker
 * handling a disjoint range of list pages.
 *
 * <p>NOTE(review): regexes are brittle against HTML changes; a production
 * crawler should use an HTML parser (e.g. jsoup). Kept regex-based to match
 * the original teaching intent.
 */
public class NetUrl {

    /** Site root; article hrefs on the list pages are relative to this. */
    private static final String BASE_URL = "https://www.qlu.edu.cn/";

    // One news item on a list page: <li class="news n12 clearfix">...</li>
    // group(1) = the item's inner HTML.
    private static final Pattern LI_PATTERN =
            Pattern.compile("<li class=\"news n\\d+ clearfix\">(.*?)</li>");

    // The anchor inside a news item: group(1) = href, group(2) = title.
    private static final Pattern A_TAG_PATTERN =
            Pattern.compile("<a href='(.*?)'.*?title='(.*?)'");

    // A body paragraph on an article page, e.g.
    // <span style="text-indent:0.99cm;font-size:18px">...</span>
    private static final Pattern SPAN_PATTERN =
            Pattern.compile("<span style=\"text-indent:0.99cm;font-size:18px\">(.*?)</span>");

    /** Crawls the full range of list pages (1..100), as the original did. */
    public static void testUrl() {
        testUrl(1, 100);
    }

    /**
     * Crawls list pages {@code firstPage..lastPage} (inclusive), printing each
     * news item's href/title and the body paragraphs of its article page.
     * A failure on one page is logged and does not abort the remaining pages
     * (campus-network restrictions make partial failures normal).
     */
    public static void testUrl(int firstPage, int lastPage) {
        for (int page = firstPage; page <= lastPage; page++) {
            String listUrl = BASE_URL + "gdyw/list" + page + ".htm";
            try {
                String listHtml = fetch(new URL(listUrl));
                for (String[] link : extractNewsLinks(listHtml)) {
                    String href = link[0];
                    String title = link[1];
                    System.out.println("href = " + href);
                    System.out.println("title = " + title);
                    // Original bug: reused the LIST connection here, so every
                    // "article" fetch re-downloaded the list page. Fetch the
                    // article's own URL instead.
                    String articleHtml = fetch(new URL(BASE_URL + href));
                    for (String paragraph : extractParagraphs(articleHtml)) {
                        System.out.println("paragraph = " + paragraph);
                    }
                }
                System.out.println(Thread.currentThread().getName()
                        + " finished page " + page);
                Thread.sleep(1000); // polite delay between list-page requests
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt(); // restore interrupt status, stop crawling
                return;
            } catch (Exception e) {
                System.out.println("page " + page + " failed: " + e.getMessage());
            }
        }
    }

    /**
     * Extracts (href, title) pairs from a list-page HTML string.
     * Package-private so it can be unit-tested without network access.
     */
    static List<String[]> extractNewsLinks(String listHtml) {
        List<String[]> links = new ArrayList<>();
        Matcher li = LI_PATTERN.matcher(listHtml);
        while (li.find()) {
            Matcher a = A_TAG_PATTERN.matcher(li.group(1));
            if (a.find()) { // skip items without a parsable anchor (was an unguarded group() call)
                links.add(new String[] {a.group(1), a.group(2)});
            }
        }
        return links;
    }

    /** Extracts body paragraphs from an article-page HTML string. */
    static List<String> extractParagraphs(String articleHtml) {
        List<String> paragraphs = new ArrayList<>();
        Matcher span = SPAN_PATTERN.matcher(articleHtml);
        while (span.find()) {
            paragraphs.add(span.group(1));
        }
        return paragraphs;
    }

    /** Downloads {@code url} and returns the response body as one string. */
    private static String fetch(URL url) throws IOException {
        URLConnection connection = url.openConnection();
        StringBuilder sb = new StringBuilder();
        // try-with-resources closes the stream chain even when reading fails
        // (the original closed ins/inr/br manually, in the wrong order, and
        // leaked them on any exception).
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(connection.getInputStream(), StandardCharsets.UTF_8))) {
            for (String line = reader.readLine(); line != null; line = reader.readLine()) {
                sb.append(line);
            }
        }
        return sb.toString();
    }

    public static void main(String[] args) throws Exception {
        final int threads = 5;
        final int totalPages = 100;
        // 1. Fixed pool of five worker threads.
        ExecutorService es = Executors.newFixedThreadPool(threads);
        // 2. Give each worker a disjoint range of list pages. (The original
        //    submitted six copies of the SAME infinite full crawl under a
        //    class-wide lock, which duplicated all work, serialized the
        //    threads, and prevented shutdown from ever completing.)
        int pagesPerThread = (totalPages + threads - 1) / threads;
        for (int i = 0; i < threads; i++) {
            final int first = i * pagesPerThread + 1;
            final int last = Math.min(first + pagesPerThread - 1, totalPages);
            es.submit(() -> testUrl(first, last));
        }
        // 3. Stop accepting tasks and wait for the crawl to finish.
        es.shutdown();
        es.awaitTermination(1, TimeUnit.HOURS);
    }
}
上述为爬虫程序源代码的示例。