基于Java的数据采集(三)
《基于Java的数据采集(一)》:http://www.cnblogs.com/lichenwei/p/3904715.html
《基于Java的数据采集(二)》:http://www.cnblogs.com/lichenwei/p/3905370.html
《基于Java的数据采集(终结篇)》:http://www.cnblogs.com/lichenwei/p/3910492.html

基于之前两篇关于Java数据采集入库的文章,这里做了一下功能整合,实现数据在本地的存储与读取,先上个效果图:

直接上代码吧。本程序只是作为"如何用JAVA抓取页面简单采集入库"的入门示例,在实际做采集工具的时候,还需要考虑许多问题,比如采集某个页面时发生卡顿或延迟该怎么办等等,希望这篇文章能够抛砖引玉。

先看下项目结构,一共有五个类:
MysqL.java --数据库操作类
RegEX.java --正则匹配类
GetAllData.java --采集类
Action.java --功能实现类
FootBallMain.java --主程序类

其他的,直接结合前面两篇文章,外加看代码注释吧。

MysqL.java 1 package com.lcw.curl; 2 3 4 import java.sql.Connection; 5 java.sql.DriverManager; 6 java.sql.ResultSet; 7 java.sql.sqlException; 8 java.sql.Statement; 9 10 11 /** 12 * 数据库操作类,一更新,一查询 13 * @author Balla_兔子 14 * 15 */ 16 public class MysqL { 17 18 //定义MysqL驱动,数据库地址,数据库用户名 密码,执行语句和数据库连接 19 public String driver = "com.MysqL.jdbc.Driver"; 20 public String url = "jdbc:MysqL://127.0.0.1:3306/football"21 public String user = "root"22 public String password = ""23 public Statement stmt = null24 public Connection conn = 25 26 创建一个插入数据的方法 27 void datatoMysqL(String insertsql) { 28 29 try { 30 31 Class.forName(driver).newInstance(); 32 } catch (Exception e) { 33 System.out.println("Unable to find the local driver"); 34 e.printStackTrace(); 35 } 36 创建连接 37 conn = DriverManager.getConnection(url,user,password); 38 创建一个 Statement 对象来将 sql 语句发送到数据库 39 stmt = conn.createStatement(); 40 } (sqlException e) { 41 e.printStackTrace(); 42 } 43 44 执行sql 插入语句 45 stmt.executeUpdate(insertsql); 46 } 47 48 49 50 stmt.close(); 51 conn.close(); 52 } 53 54 55 } 56 57 58 创建一个查找数据的方法 59 public ResultSet searchMysqL(String selectsql) { 60 61 ResultSet rs=62 63 64 65 66 } 67 System.out.println("Unable to find the local driver"68 69 70 71 conn =72 73 stmt =74 } 75 76 77 78 79 rs=stmt.executeQuery(selectsql); 80 } 81 82 83 84 return rs; 85 86 87 }MysqL.java RegEX.java 3 java.util.regex.Matcher; java.util.regex.Pattern; 5 RegEX { 7 8 9 * 10 * @param regex * 正则表达式 content * 所要匹配的内容 @return 15 16 String 
getData(String regex,String content) { 17 Pattern pattern = Pattern.compile(regex,Pattern.CASE_INSENSITIVE); 设定正则表达式,不区分大小写 18 Matcher matcher = pattern.matcher(content); 19 if (matcher.find()) { 20 return matcher.group();返回正则匹配结果 21 } else22 return ""23 24 25 26 }RegEX.java GetAllData.java java.io.BufferedReader; java.io.InputStreamReader; java.net.URL; 6 GetAllData { 8 9 /**采集类 11 12 getAllData() { 13 14 15 String address = "http://www.footballresults.org/league.PHP?league=EngDiv1"16 URL url = new URL(address); 17 InputStreamReader inputStreamReader = InputStreamReader(url 18 .openStream(),"utf-8"); 打开地址,以UTF-8编码的形式返回字节并转为字符 19 BufferedReader bufferedReader = BufferedReader( 20 inputStreamReader); 从字符输入流中读取文本,缓冲各个字符,从而提供字符、数组和行的高效读取。 21 22 RegEX data = RegEX(); 23 MysqL MysqL = MysqL(); 24 String content = ""; 用来接受每次读取的行字符 25 int flag = 0; 标志,队伍信息刚好在日期信息后面,则正则相同,用于分离数据 26 String dateRegex = "d{1,2}.d{1,2}.d{4}"; 日期匹配正则表达式 27 String teamRegex = ">[^<>]*</a>"; 队伍匹配正则表达式 28 String scoreRegex = ">(d{1,2}-d{1,2})</TD>"; 比分正则表达式 29 String tempDate = ""; 存储临时比赛时间 30 String teama = ""; 存储临时主队 31 String teamb = ""; 存储临时客队 32 String score = ""; 存储临时比分 33 int i = 0; 记录信息条数 34 String sql = ""; 数据库语句 35 while ((content = bufferedReader.readLine()) != null) { 每次读取一行数据 37 获取比赛日期信息 38 String dateInfo = data.getData(dateRegex,content); 39 if (!dateInfo.equals("")) { 40 System.out.println("日期:" + dateInfo); 41 tempDate = dateInfo; 42 flag++43 } 44 获取队伍信息,需先读到日期信息让标志符自增 45 String teamInfo = data.getData(teamRegex,1)">46 if (!teamInfo.equals("") && flag == 1) { 47 teama = teamInfo.substring(1,teamInfo.indexOf("</a>")); 48 System.out.println("主队:" + teama); 49 flag++50 } else if (!teamInfo.equals("") && flag == 251 teamb = teamInfo.substring(1,1)">52 System.out.println("客队:" + teamb); 53 flag = 055 获取比分信息 56 String scoreInfo = data.getData(scoreRegex,1)">57 if (!scoreInfo.equals(""58 score = scoreInfo.substring(1,scoreInfo.indexOf("</TD>"59 System.out.println("比分:" + score); 60 
System.out.println(); 61 i++62 sql = "insert into football(`date`,`teama`,`teamb`,`score`) values('" 63 + tempDate 64 + "','" 65 + teama 66 + "',1)">67 + teamb 68 + "',1)">69 + score + "')"70 MysqL.datatoMysqL(sql); 71 System.out.println("存储数据成功:" + i + "条"72 73 74 bufferedReader.close(); 76 System.out.println("一共收集到了" + i + "条信息"); 77 } 78 79 80 82 83 }GetAllData.java Action.java 1 2 3 4 5 java.util.ArrayList; 6 java.util.List; 7 java.util.Vector; 8 9 Action { 10 11 12 * 操作一:初始化数据库数据 13 14 initData() { 15 String sql = "delete from football" 16 MysqL doMysqL = 17 18 doMysqL.datatoMysqL(sql); 19 System.out.println("数据初始化完毕!" 20 } 21 System.out.println("数据初始化失败!" 22 23 24 25 26 27 * 获取所有队伍信息 28 29 30 31 public Vector<String> getAllTeam() { 32 ResultSet rs = 33 Vector<String> vector = new Vector<String>(); 34 String sql = "select teama,teamb from football" 35 MysqL doMysqL = 36 rs = doMysqL.searchMysqL(sql); 37 38 39 while (rs.next()) { 40 41 if (!vector.contains(rs.getString("teama"))) { 42 vector.add(rs.getString("teama" 43 } 44 if (!vector.contains(rs.getString("teamb" 45 vector.add(rs.getString("teamb" 46 47 } 48 e.printStackTrace(); 49 50 51 } 52 53 54 55 vector; 56 57 58 59 60 * 获取具体某队的比赛信息 61 62 team 63 64 65 public List<String> findTeam(String team) { 66 List<String> list = new ArrayList<String> 67 String sql = "select * from football where teama ='" + 68 + "' or teamb ='" + team + "'" 69 MysqL MysqL = 70 ResultSet rs = 71 rs = MysqL.searchMysqL(sql); 72 73 74 list.add(rs.getString("date" 75 list.add(rs.getString("teama" 76 list.add(rs.getString("teamb" 77 list.add(rs.getString("score" 78 79 } 80 81 82 list; 83 84 85 86 findGame(String date) { 87 List<String> list = 88 ResultSet rs = 89 String sql = "select * from football where date ='" + date + "'" 90 MysqL MysqL = 91 rs = 92 93 94 list.add(rs.getString("date" 95 list.add(rs.getString("teama" 96 list.add(rs.getString("teamb" 97 list.add(rs.getString("score" 98 99 } 100 TODO Auto-generated catch block 101 102 
103 104 105 106 }Action.java FootBallMain.java java.util.Scanner; FootBallMain { 主程序类 static main(String[] args) { 13 GetAllData allData = GetAllData(); 14 Action action = Action(); 15 16 while (true17 System.out.println("①初始化数据库-请按 (1)"18 System.out.println("②自动化采集数据-请按(2)"19 System.out.println("③查询参赛队伍-请按(3)"20 System.out.println("④查询具体球队比赛结果-请按(4)"21 System.out.println("⑤查询具体某天的比赛详情-请按(5)"22 Scanner scanner = Scanner(System.in); 23 String input = scanner.next(); 24 if (input.equals("1"25 System.out.println(); 26 action.initData(); 27 System.out 28 .println("-----------------------------------------------------"29 } if (input.equals("2"30 System.out.println("正在采集数据...请稍后" allData.getAllData(); 32 33 .println("-----------------------------------------------------"34 } if (input.equals("3"35 Vector<String> allTeam = action.getAllTeam(); 36 System.out.println("正在获取数据...请稍后"if (allTeam.size() != 038 System.out.println("参赛队伍如下:"39 for (int i = 0; i < allTeam.size(); i++40 System.out.println(allTeam.get(i)); 44 .println("-----------------------------------------------------"45 } if (input.equals("4"46 System.out.println("请输入您要查询的队伍名:"47 String team =48 List<String> list = action.findTeam(team); 49 System.out.println("比赛日期ttt主队tt客队ttt比赛结果"50 if (list.size() != 051 int i = 0; i < list.size(); i++52 System.out.print(list.get(i) + "tt"54 } 55 System.out.println("暂时没有您所提供队伍的比赛信息,敬请关注..."56 57 58 59 .println("-----------------------------------------------------"60 } if (input.equals("5"61 System.out.println("请输入您要查询日期(格式如下:xx.xx.xxxx):"62 String date =63 List<String> info = action.findGame(date); 64 System.out.println("比赛日期ttt主队tt客队ttt比赛结果"65 if (info.size() != 066 int i = 0; i < info.size(); i++67 if (i % 4 == 0 && i != 0 System.out.println(); } 70 System.out.print(info.get(i) + "tt"71 72 } 73 System.out.println("暂时没有您所提供的比赛信息,1)">77 .println("------------------------------------------------------------------------"78 } 79 System.out.println("请输入正确的对应编号.."80 81 
.println("------------------------------------------------------------------------"83 84 85 86 }FootBallMain.java
(编辑:北几岛)
【声明】本站内容均来自网络,其相关言论仅代表作者个人观点,不代表本站立场。若无意侵犯到您的权利,请及时联系站长删除相关内容!