在路上

 找回密码
 立即注册
在路上 站点首页 学习 查看内容

嘎嘎,爬取小小编辑的动弹

2016-7-29 15:47| 发布者: zhangjf| 查看: 646| 评论: 0

摘要: 唉,压缩包居然不可以上传,一个个传上来 package MeiNvTuPian;public class DataProcessor { private static DataProcessor inst = new DataProcessor(); public PrintFile ...
唉,压缩包居然不可以上传,一个个传上来
  1. package MeiNvTuPian;
  2. public class DataProcessor {
  3. private static DataProcessor inst = new DataProcessor();
  4. public PrintFile printFile;
  5. public static DataProcessor inst(){return inst;}
  6. /**
  7. *
  8. * @param data
  9. * @param blogId
  10. */
  11. public void getBolgMsg(String data) {
  12. String str = data.substring(data.indexOf("title="原创博客"")+"title="原创博客"".length());
  13. str = str.substring(0, str.lastIndexOf("pages sm-hide")+"pages sm-hide".length());
  14. String[] blogs = str.split("title="原创博客"");
  15. for(String s : blogs){
  16. BlogModel bm = new BlogModel();
  17. BlogHtml bh = new BlogHtml();
  18. try {
  19. s = s.substring(s.indexOf("href="")+"href="".length());
  20. bm.blogUrl = s.substring(0, s.indexOf("">"));
  21. s = s.substring(s.indexOf("">")+"">".length());
  22. bm.title = s.substring(0, s.indexOf("</a>")).replaceAll(" ", "");
  23. s = s.substring(s.indexOf("time">")+"time">".length());
  24. bm.time = s.substring(0,s.indexOf("发布")).replaceAll(" ", "");
  25. this.getMoreData(bm);
  26. } catch (Exception e) {
  27. e.printStackTrace();
  28. }
  29. bh.p1 = "<p><a href=""+bm.blogUrl+"">("+bm.time+")"+bm.title+"</a></p>";
  30. if(bm.music == null || bm.music.equals("")){
  31. bh.p2 = "<p>本次动弹听歌,戳<a href="">(不用点了,这期没拿到)</a></p>";
  32. }else{
  33. bh.p2 = "<p>本次动弹听歌,戳<a href=""+bm.music+"">(这里)</a></p>";
  34. }
  35. bh.p3 = "<p>小树医生心理生理医务室&darr;&darr;&darr;</p>";
  36. if(bm.mmPhoto == null || bm.mmPhoto.equals("")){
  37. bh.p4 = "<p>本期弃疗</p>";
  38. }else{
  39. bh.p4 = "<p><img alt="福利" src=""+bm.mmPhoto+"" /></p>";
  40. }
  41. printFile.onData(bh);
  42. }
  43. }
  44. private void getMoreData(BlogModel bm) {
  45. String data = DataFetcher.inst().getData(bm.blogUrl);
  46. if(data.equals("")){
  47. System.out.println("动弹不存在!");
  48. return;
  49. }
  50. if(data.indexOf("手机党少年们想听歌,请使劲儿戳")>0){
  51. data = data.substring(data.indexOf("请使劲儿戳(<a href="")+"请使劲儿戳(<a href="".length());
  52. bm.music = data.substring(0,data.indexOf("""));
  53. }
  54. if(data.indexOf("小树医生心理生理医务室")>0){
  55. data = data.substring(data.indexOf("小树医生心理生理医务室")+"小树医生心理生理医务室".length());
  56. data = data.substring(data.indexOf("src="")+"src="".length());
  57. bm.mmPhoto = data.substring(0,data.indexOf("""));
  58. }
  59. }
  60. }
复制代码
  1. package MeiNvTuPian;
  2. import java.io.BufferedReader;
  3. import java.io.InputStreamReader;
  4. import java.net.URL;
  5. import java.net.URLConnection;
  /** Downloads the text content of a URL while presenting browser-like request headers. */
  public class DataFetcher {
    /** Charset used when the response does not declare a usable one. */
    private static final String DEFAULT_CHARSET = "UTF-8";

    private static final DataFetcher inst = new DataFetcher();

    /** Returns the shared instance. */
    public static DataFetcher inst() {
      return inst;
    }

    /**
     * Sends a browser-like request and returns the response body as one string.
     *
     * <p>The body is decoded with the charset announced in the Content-Type
     * header, falling back to UTF-8, instead of the platform default. Lines are
     * concatenated without their line terminators.
     *
     * @param urlStr address to fetch
     * @return the body read so far; the empty string if the request failed
     *     before anything was read
     */
    public String getData(String urlStr) {
      StringBuilder result = new StringBuilder();
      BufferedReader in = null;
      try {
        URL url = new URL(urlStr);
        System.out.println(urlStr);
        URLConnection connection = url.openConnection();
        connection.setRequestProperty("accept", "*/*");
        connection.setRequestProperty("connection", "Keep-Alive");
        connection.setRequestProperty("user-agent",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.66 Safari/535.11");
        connection.connect();
        in = new BufferedReader(new InputStreamReader(
            connection.getInputStream(), charsetOf(connection)));
        String line;
        while ((line = in.readLine()) != null) {
          result.append(line);
        }
      } catch (java.io.IOException e1) {
        e1.printStackTrace();
      } catch (Exception e) {
        System.out.println("get" + e);
        e.printStackTrace();
      } finally {
        try {
          if (in != null) {
            in.close();
          }
        } catch (Exception e2) {
          e2.printStackTrace();
        }
      }
      return result.toString();
    }

    /** Picks the charset named in the Content-Type header, or UTF-8 if absent or unsupported. */
    private static String charsetOf(URLConnection connection) {
      String contentType = connection.getContentType();
      if (contentType == null) {
        return DEFAULT_CHARSET;
      }
      String key = "charset=";
      for (String part : contentType.split(";")) {
        String param = part.trim();
        if (param.regionMatches(true, 0, key, 0, key.length())) {
          String name = param.substring(key.length()).trim().replace("\"", "");
          try {
            if (java.nio.charset.Charset.isSupported(name)) {
              return name;
            }
          } catch (IllegalArgumentException ignored) {
            // Syntactically illegal charset name in the header: use the default.
          }
        }
      }
      return DEFAULT_CHARSET;
    }
  }
复制代码
  1. package MeiNvTuPian;
  2. public class Main {
  3. public static void main(String[] args) throws InterruptedException {
  4. int pageNum = 16;
  5. PrintFile p = new PrintFile();
  6. DataProcessor.inst().printFile = p;
  7. //p.init("blogMsg");
  8. p.init1("blogHtml");
  9. getBolgMsg(pageNum);
  10. Thread.sleep(5000);
  11. p.finish();
  12. }
  13. //处理数据
  14. private static void getBolgMsg(int pageNum) throws InterruptedException {
  15. for (int i = 1; i <= pageNum; i++)
  16. {
  17. System.out.println("page" + i);
  18. String data = DataFetcher.inst().getData("http://my.oschina.net/xxiaobian/blog?sort=time&p="+i);
  19. if(data.equals("")){
  20. break;
  21. }
  22. DataProcessor.inst().getBolgMsg(data);
  23. Thread.sleep(3000);//每获取一页数据线程睡眠3秒
  24. }
  25. }
  26. }
复制代码
  1. package MeiNvTuPian;
  2. import java.io.File;
  3. import java.io.FileWriter;
  4. import java.io.IOException;
  5. public class PrintFile {
  6. private FileWriter writer = null;
  7. //写入数据
  8. /* public void onData(BlogModel bm){
  9. try {
  10. writer.write(bm.dump()+ "n");
  11. writer.flush();
  12. } catch (IOException e) {
  13. e.printStackTrace();
  14. }
  15. }*/
  16. public void onData(BlogHtml bh){
  17. try {
  18. writer.write(bh.dump()+ "n");
  19. writer.flush();
  20. } catch (IOException e) {
  21. e.printStackTrace();
  22. }
  23. }
  24. //数据行头部写入
  25. public void init(String filename)
  26. {
  27. try {
  28. File f = new File("F:/"+filename);
  29. if (!f.exists())
  30. {
  31. f.mkdir();
  32. }
  33. writer = new FileWriter("F:/"+filename+"/"+filename+".txt");
  34. String title = "动弹url 标题 时间 歌 图片urln";
  35. writer.write(title);
  36. writer.flush();
  37. } catch (IOException e) {
  38. e.printStackTrace();
  39. }
  40. }
  41. //数据行头部写入
  42. public void init1(String filename)
  43. {
  44. try {
  45. File f = new File("F:/"+filename);
  46. if (!f.exists())
  47. {
  48. f.mkdir();
  49. }
  50. writer = new FileWriter("F:/"+filename+"/"+filename+".txt");
  51. String title = "p1 p2 p3 p4n";
  52. writer.write(title);
  53. writer.flush();
  54. } catch (IOException e) {
  55. e.printStackTrace();
  56. }
  57. }
  58. //结束
  59. public void finish()
  60. {
  61. System.out.println("done.");
  62. try {
  63. writer.close();
  64. } catch (IOException e) {
  65. e.printStackTrace();
  66. }
  67. }
  68. }
复制代码
  1. package MeiNvTuPian;
  /** One output row: four HTML paragraph snippets. */
  public class BlogHtml {
    String p1;
    String p2;
    String p3;
    String p4;

    /**
     * Joins the four snippets into one line, separated by single spaces.
     *
     * @return {@code p1 p2 p3 p4}; an unset field appears as the text "null"
     */
    public String dump() {
      StringBuilder line = new StringBuilder();
      line.append(p1).append(" ");
      line.append(p2).append(" ");
      line.append(p3).append(" ");
      line.append(p4);
      return line.toString();
    }
  }
复制代码
  1. package MeiNvTuPian;
  /**
   * Data object holding the values scraped for one blog entry.
   *
   * @author Administrator
   */
  public class BlogModel {
    String blogUrl;
    String title;
    String time;
    String music;
    String mmPhoto;

    /**
     * Joins all five fields into one line, separated by single spaces.
     *
     * @return {@code blogUrl title time music mmPhoto}; an unset field appears
     *     as the text "null"
     */
    public String dump() {
      StringBuilder line = new StringBuilder();
      line.append(blogUrl).append(" ");
      line.append(title).append(" ");
      line.append(time).append(" ");
      line.append(music).append(" ");
      line.append(mmPhoto);
      return line.toString();
    }
  }
复制代码

最新评论

小黑屋|在路上 ( 蜀ICP备15035742号-1 )

;

GMT+8, 2025-5-6 15:42

Copyright 2015-2025 djqfx

返回顶部