- 論壇徽章:
- 0
|
代碼是要實(shí)現(xiàn)批量的抓取網(wǎng)頁,由于數(shù)量太多,考慮到多進(jìn)程,可是運(yùn)行就提示內(nèi)存不能為read,該怎么寫這種多進(jìn)程的腳本呢?
還有采用多線程不知能否實(shí)現(xiàn),本人嘗試多線程時,線程執(zhí)行時并沒有提交cookies。- use LWP::UserAgent;
- use HTTP::Cookies;
- use LWP::ConnCache;
- use strict;
- my $ua=LWP::UserAgent->new;
- my $cache=LWP::ConnCache->new();
- $ua->conn_cache($cache);
- my $user="yongchun_wsu";
- my $password="trial01";
- my $url="https://portal.biobase-international.com/cgi-bin/portal/login.cgi";
- my $cookie=HTTP::Cookies->new(file=>'lwp_cookie.dat',autosave=>1,ignore_discard=>1);
- $ua->cookie_jar($cookie);
- my $res=$ua->post("$url",[login=>"$user",
- password=>"$password"]) or die "can't login $!";
- print "login successed !\n";
- for(my $count=1;$count<10;$count++){
- my $c=fork();
- if($c){
- print "parent running \n";
- }
- else
- {
- &getpage();
- exit 0;
- }
- }
- sub getpage {
- my $url1="https://portal.biobase-international.com/cgi-bin/build_t/idb/1.0/pageview.cgi?view=MatrixReport&matrix_acc=M0";
- for(my $i=1;$i<1357;$i++){
- my $id=sprintf("%04d",$i);
- if(! -e "M0".$id.".html"){
- my $bro=$ua->get($url1.$id);
- if($bro->is_success){
- open(FH,">","M0".$id.".html");
- print FH $bro->content;
- print "page M0$id".".html finished\n";
- close(FH);
-
- }
- else
- {
- print "can't get page $id";
- redo;
- }
- }
- }
- }
復(fù)制代碼 |
|