亚洲国产精品无码久久大片,亚洲AV无码乱码麻豆精品国产,亚洲品质自拍网站,少妇伦子伦精品无码STYLES,国产精久久久久久久

全自動文章采集、AI生成、自動發布，網站自媒體全搞定！立即注冊

querylist采集微信公眾號文章（處理跳轉並向微信注入 js 的方法：以上就是對代理服務器攔截到的數據進行處理）

優采云 發布時間: 2022-03-13 10:09

　　querylist采集微信公眾號文章（

處理跳轉並向微信注入 js 的方法：以上就是對代理服務器攔截到的數據進行處理）

　　/**
 * Records the read and like counters that were returned for one article.
 *
 * <p>The article is located through the {@code __biz} and {@code sn} query parameters of
 * {@code url}. Once found, its row is removed from the crawl queue and, when both counters
 * could be read from {@code str}, they are written to the article table.
 *
 * <p>If the URL lacks either parameter, or no article matches, the queue rows currently
 * flagged {@code load = 1} are deleted and the method returns without touching the article.
 *
 * @param str JSON text expected to contain {@code appmsgstat.read_num} and
 *            {@code appmsgstat.like_num}
 * @param url request URL carrying the {@code __biz} and {@code sn} query parameters
 */
public void getMsgExt(String str, String url) {
    Map<String, String> queryStrs = HttpUrlParser.parseUrl(url);
    String rawBiz = queryStrs == null ? null : queryStrs.get("__biz");
    String rawSn = queryStrs == null ? null : queryStrs.get("sn");
    if (rawBiz == null || rawSn == null) {
        // Without both parameters the lookup below would search for the literal text "null".
        System.out.println("biz:" + rawBiz);
        System.out.println("sn:" + rawSn);
        tmpListMapper.deleteByLoad(1);
        return;
    }

    // NOTE(review): presumably the URL parser drops the trailing "==" of __biz - confirm.
    String biz = rawBiz + "==";
    // Wrapped in % because, per the SQL sketched by the original author, sn is matched
    // against content_url with LIKE.
    String sn = "%" + rawSn + "%";

    // Original author's sketch of the lookup:
    //   select * from `article table` where `biz` = :biz and `content_url` like :sn limit 0,1
    Post post = postMapper.selectByBizAndSn(biz, sn);
    if (post == null) {
        System.out.println("biz:" + biz);
        System.out.println("sn:" + sn);
        tmpListMapper.deleteByLoad(1);
        return;
    }

    Integer readNum = null;
    Integer likeNum = null;
    try {
        readNum = JsonPath.read(str, "['appmsgstat']['read_num']"); // read count
        likeNum = JsonPath.read(str, "['appmsgstat']['like_num']"); // like count
    } catch (Exception e) {
        // Best effort: the article is still dequeued below, but its counters stay untouched
        // instead of being overwritten with made-up placeholder values.
        System.out.println("Failed to read appmsgstat for sn " + sn + ": " + e);
    }

    // The article has been visited, so it leaves the crawl queue either way:
    //   delete from `queue table` where `content_url` like :sn
    tmpListMapper.deleteBySn(sn);

    if (readNum == null || likeNum == null) {
        return;
    }

    // Persist the fresh counters on the article row.
    post.setReadnum(readNum);
    post.setLikenum(likeNum);
    postMapper.updateByPrimaryKey(post);
}

　　向微信頁面注入 js 以處理跳轉的方法：

　　/**
 * Builds the JavaScript snippet that sends the WeChat page to the next URL to crawl.
 *
 * <p>Queue rows flagged {@code load = 1} (being read) are deleted first, then one queue row
 * is picked. When the queue is empty, the account returned by {@code weiXinMapper.selectOne()}
 * supplies a history-page URL instead and its collect time is set to the current time.
 *
 * @return a {@code setTimeout} script that redirects after 3 to 5 seconds, or an empty
 *         string when neither a queued article nor an account is available
 */
public String getWxHis() {
    // Drop the rows that were being read, then pick any remaining queue row.
    tmpListMapper.deleteByLoad(1);
    TmpList queue = tmpListMapper.selectRandomOne();
    System.out.println("queue is null?"+queue);

    String url;
    if (queue == null) {
        // Queue is empty: fall back to an account's history page. According to the original
        // author, selectOne() yields the account with the oldest collect time.
        WeiXin weiXin = weiXinMapper.selectOne();
        if (weiXin == null) {
            // Nothing to crawl at all; emit no script rather than failing on a null account.
            System.out.println("getHis: no WeiXin account available");
            return "";
        }
        String biz = weiXin.getBiz();
        // History-page URL of the account (second page form).
        url = "https://mp.weixin.qq.com/mp/profile_ext?action=home&__biz=" + biz
                + "#wechat_redirect";
        // Stamp the account with the current time so it is not picked again immediately.
        weiXin.setCollect(System.currentTimeMillis());
        int result = weiXinMapper.updateByPrimaryKey(weiXin);
        System.out.println("getHis weiXin updateResult:"+result);
    } else {
        url = queue.getContentUrl();
        // Flag the row as being read (load = 1).
        tmpListMapper.updateByContentUrl(url);
    }

    // The URL ends up inside a single-quoted JavaScript string, so backslashes and quotes
    // must be escaped. String.valueOf keeps the previous rendering of a null URL.
    String jsUrl = String.valueOf(url).replace("\\", "\\\\").replace("'", "\\'");

    // Random delay of 3, 4 or 5 seconds before the redirect fires.
    int randomTime = new Random().nextInt(3) + 3;

    // The script is returned to the caller; per the original author, anyproxy injects it
    // into the WeChat page.
    return "setTimeout(function(){window.location.href='" + jsUrl + "';}," + randomTime * 1000 + ");";
}

　　以上是處理代理服務器截獲的數據的程序。這裡有一個需要注意的問題：程序會依次訪問數據庫中每個收錄的公眾號，甚至會再次訪問已存儲的文章，以不斷更新文章的閱讀數和點贊數。如果需要抓取大量公眾號，建議修改添加任務隊列和添加條件的代碼，否則多輪抓取公眾號時，重複數據會嚴重影響效率。

　　至此，微信公眾號的文章鏈接全部被爬取完畢，且該鏈接為永久有效鏈接，可在瀏覽器中打開。接下來就是編寫爬蟲程序，根據數據庫中保存的鏈接爬取文章的內容等信息。

　　我用 webmagic 寫了一個爬蟲，輕量級，好用。

　　/**
 * WebMagic page processor that downloads queued article pages, extracts their content and
 * metadata, hands the result to {@code WechatStorage} and marks each post as crawled.
 *
 * <p>The posts to process and the mapper are held in static fields that
 * {@link #startSpider} replaces on every call.
 * NOTE(review): overlapping {@code startSpider} calls would therefore share state - confirm
 * that callers never run two crawls at once.
 */
public class SpiderModel implements PageProcessor {

    private static PostMapper postMapper;

    /** Posts still waiting for their page, keyed by the URL that was queued for them. */
    private static java.util.Map<String, Post> postsByUrl = new java.util.HashMap<String, Post>();

    // Crawl settings: three retries and a 100 ms pause between requests.
    private Site site = Site.me().setRetryTimes(3).setSleepTime(100);

    @Override
    public Site getSite() {
        return this.site;
    }

    /**
     * Extracts one article page and stores it together with the data of its post.
     *
     * <p>The post is looked up by the request URL rather than by arrival order, so a page
     * that failed to download or was de-duplicated cannot shift later pages onto the wrong
     * post.
     *
     * @param page the downloaded article page
     */
    @Override
    public void process(Page page) {
        String pageUrl = page.getRequest().getUrl();
        Post post = postsByUrl.remove(pageUrl);
        if (post == null) {
            System.out.println("No pending post for url: " + pageUrl);
            return;
        }

        String content = page.getHtml().xpath("//div[@id='js_content']").get();
        // A page without the content container is treated as a removed (censored) article;
        // its post row is left untouched.
        if (content == null) {
            System.out.println("文章已和諧！");
            return;
        }

        // Snapshot: literal replacements (the previous regex form let '.' match any character).
        String contentSnap = content.replace("data-src", "src").replace("preview.html", "player.html");
        // Plain-text content.
        String contentTxt = HtmlToWord.stripHtml(content);

        Selectable metaContent = page.getHtml().xpath("//div[@id='meta_content']");
        String pubTime = null;
        String wxname = null;
        String author = null;
        if (metaContent != null) {
            pubTime = stripOrNull(metaContent.xpath("//em[@id='post-date']").get());   // publish time
            wxname = stripOrNull(metaContent.xpath("//a[@id='post-user']").get());     // account name
            author = stripOrNull(metaContent
                    .xpath("//em[@class='rich_media_meta rich_media_meta_text' and @id!='post-date']")
                    .get());                                                            // author
        }

        String rawTitle = post.getTitle();
        String title = rawTitle == null ? null : rawTitle.replace(" ", "");
        // Counters may never have been recorded; treat a missing value as zero instead of
        // failing on unboxing.
        Integer likeNum = post.getLikenum();
        Integer readNum = post.getReadnum();

        WechatInfoBean wechatBean = new WechatInfoBean();
        wechatBean.setTitle(title);
        wechatBean.setContent(contentTxt);
        wechatBean.setSourceCode(contentSnap);
        wechatBean.setLikeCount(likeNum == null ? 0 : likeNum);
        wechatBean.setViewCount(readNum == null ? 0 : readNum);
        wechatBean.setAbstractText(post.getDigest());
        wechatBean.setUrl(post.getContentUrl());
        wechatBean.setPublishTime(pubTime);
        wechatBean.setSiteName(wxname);
        wechatBean.setAuthor(author);
        wechatBean.setMediaType("微信公眾號");
        WechatStorage.saveWechatInfo(wechatBean);

        // Mark the post as crawled.
        post.setIsSpider(1);
        postMapper.updateByPrimaryKey(post);
    }

    /** Strips markup from an extracted fragment, passing {@code null} through unchanged. */
    private static String stripOrNull(String html) {
        return html == null ? null : HtmlToWord.stripHtml(html);
    }

    /**
     * Crawls the given URLs on a single thread and prints the elapsed time.
     *
     * <p>{@code urls[i]} is paired with {@code inposts.get(i)}; surplus entries on either
     * side are ignored. The caller's list is not modified.
     *
     * @param inposts      posts to enrich, in the same order as {@code urls}
     * @param myPostMapper mapper used to mark posts as crawled
     * @param urls         article URLs to download
     */
    public static void startSpider(List<Post> inposts, PostMapper myPostMapper, String... urls) {
        long startTime = System.currentTimeMillis();

        java.util.Map<String, Post> pending = new java.util.HashMap<String, Post>();
        int pairs = inposts == null ? 0 : Math.min(inposts.size(), urls.length);
        for (int i = 0; i < pairs; i++) {
            pending.put(urls[i], inposts.get(i));
        }
        postMapper = myPostMapper;
        postsByUrl = pending;

        Spider mySpider = Spider.create(new SpiderModel()).addUrl(urls);
        mySpider.setDownloader(new HttpClientDownloader());

        try {
            SpiderMonitor.instance().register(mySpider);
        } catch (JMException e) {
            // Monitoring is optional; a failed registration must not cancel the crawl.
            e.printStackTrace();
        }
        mySpider.thread(1).run();

        long endTime = System.currentTimeMillis();
        System.out.println("爬取時(shí)間" + ((endTime - startTime) / 1000) + "秒--");
    }
}

　　其他一些不相關的數據存儲代碼將不會發布。這裡我將代理服務器抓取的數據存儲在 mysql 中，將我的爬蟲爬取的數據存儲在 mongodb 中。

　　以下是我爬取的公眾號信息：

　　

　　

　　打開應用程序並閱讀筆記

0

2022-03-13

querylist采集微信公眾號文章

0 個評論

要回復文章請先登錄或注冊

視
頻
教
程

官方客服QQ群

在
線
客
服

亚洲国产精品无码久久大片,亚洲AV无码乱码麻豆精品国产,亚洲品质自拍网站,少妇伦子伦精品无码STYLES,国产精久久久久久久