在路上

 找回密码
 立即注册
在路上 站点首页 学习 查看内容

利用htmlunit下载网页上的文件

2016-12-20 13:16| 发布者: zhangjf| 查看: 700| 评论: 0

摘要: import java.io.FileOutputStream;import java.io.InputStream;import java.util.regex.Matcher;import java.util.regex.Pattern; import org.apache.commons.io.IOUtils; import com.gargoylesoftware.htmlunit.Pag ...
  1. import java.io.FileOutputStream;
  2. import java.io.InputStream;
  3. import java.util.regex.Matcher;
  4. import java.util.regex.Pattern;
  5. import org.apache.commons.io.IOUtils;
  6. import com.gargoylesoftware.htmlunit.Page;
  7. import com.gargoylesoftware.htmlunit.WebClient;
  8. public class DownloadFile {
  9. public static void main(String[] args) throws Exception {
  10. String baseUrl = "<a href="http://hanyu.iciba.com/hanzi/1.shtml";" target="_blank">http://hanyu.iciba.com/hanzi/1.shtml";</a>
  11. String bihuaRegex = "class="guanggao"[^<]*<[^<]*<param\s*name="movie"\s*value="([^"]*)";
  12. String aSoundRegex = "class="js12">ā.*?name="FlashVars"\s*value="f=([^"]*)";
  13. String eSoundRegex = "class="js12">ē.*?name="FlashVars"\s*value="f=([^"]*)";
  14. WebClient client = new WebClient();
  15. client.getOptions().setCssEnabled(false);
  16. client.getOptions().setJavaScriptEnabled(false);
  17. client.getOptions().setThrowExceptionOnFailingStatusCode(false);
  18. client.getOptions().setThrowExceptionOnScriptError(false);
  19. Page page = client.getPage(baseUrl);
  20. String source = page.getWebResponse().getContentAsString();
  21. Matcher mBihuan = Regex(source, bihuaRegex);
  22. Matcher mA = Regex(source, aSoundRegex);
  23. Matcher mE = Regex(source, eSoundRegex);
  24. while(mBihuan.find()) {
  25. String url = "<a href="http://hanyu.iciba.com/" + mBihuan.group" target="_blank">http://hanyu.iciba.com/" + mBihuan.group</a>(1);
  26. page = client.getPage(url);
  27. saveFile(page, "d:/testDownload/bihua.swf");
  28. }
  29. while(mA.find()) {
  30. String url = mA.group(1);
  31. page = client.getPage(url);
  32. saveFile(page, "d:/testDownload/a.mp3");
  33. }
  34. while(mE.find()) {
  35. String url = mE.group(1);
  36. page = client.getPage(url);
  37. saveFile(page, "d:/testDownload/e.mp3");
  38. }
  39. }
  40. public static Matcher Regex(String source, String regex) {
  41. Pattern p = Pattern.compile(regex, Pattern.DOTALL);
  42. return p.matcher(source);
  43. }
  44. public static void saveFile(Page page, String file) throws Exception {
  45. InputStream is = page.getWebResponse().getContentAsStream();
  46. FileOutputStream output = new FileOutputStream(file);
  47. IOUtils.copy(is, output);
  48. output.close();
  49. }
  50. }
复制代码

最新评论

小黑屋|在路上 ( 蜀ICP备15035742号-1 

;

GMT+8, 2025-7-8 18:07

Copyright 2015-2025 djqfx

返回顶部