写了段批量抓取某个列表页的东东

时间:2021-05-25

有些人当抓取程序是个宝,到目前还TND有人在卖钱,强烈BS一下这些家伙 真是的!可能偶下边这段东西比较烂哈
下边这个没有写入库功能,已经到这一步了,入库功能是很简单的事了,需要的请自己去完成吧,其它功能各位自行完善吧!把代码拷贝过去直接运行即可看到效果

DimUrl,List_PageCode,Array_ArticleID,i,ArticleID
DimContent_PageCode,Content_TempCode
DimContent_CategoryID,Content_CategoryName,BorderID,ClassID,BorderName,ClassName
DimArticleTitle,ArticleAuthor,ArticleFrom,ArticleContent

Url="http://www.webasp.net/article/class/1.htm"
List_PageCode=getHTTPPage(Url)
List_PageCode=RegExpText(List_PageCode,"打印</th></tr>","</table><tableborder=0cellpadding=5",0)
List_PageCode=RegExpText(List_PageCode,"<tdalign=left><ahref='../","'><imgborder=0src='../images/authortype0.gif'",1)'取得当前列表页的文章链接,以,分隔
Array_ArticleID=Split(List_PageCode,",")'创建数组,存储文章ID

Fori=0ToUbound(Array_ArticleID)-1
ArticleID=Array_ArticleID(i)'文章ID
Content_PageCode=getHTTPPage("http://www.webasp.net/article/"&ArticleID)'取得文章页的内容

'=========取文章分类及相关ID参数开始=======================
Content_TempCode=RegExpText(Content_PageCode,"<ahref=""/article/"">技术教程</a>&gt;&gt;","&gt;&gt;内容</td>",0)
Content_CategoryID=RegExpText(Content_PageCode,"<ahref='../class","/'>",1)
BorderID=Split(Content_CategoryID,",")(0)'大类ID
ClassID=Split(Content_CategoryID,",")(1)'子类ID
'==========检查大类是否存在开始===============
'如果不存在则入库

'==========检查大类是否存在结束===============
'Response.Write(BorderID&","&ClassID&"<br/>")
Content_CategoryName=RegExpText(Content_PageCode,"/'>","</a>",1)
BorderName=Split(Content_CategoryName,",")(0)'大类名称
ClassName=Split(Content_CategoryName,",")(1)'子类名称
'==========检查子类是否存在开始===============
'如果不存在则入库

'==========检查子类是否存在结束===============
'=========取文章分类及相关ID参数结束=======================

'=========取文章标题及内容开始=============================
ArticleTitle=RegExpText(Content_PageCode,"<tr><tdalign=centerbgcolor=#DEE2F5><strong>","</strong></td></tr>",0)
ArticleAuthor=RegExpText(Content_PageCode,"<tr><td><spanclass=blue>作者:</span>","</td></tr>",0)
ArticleFrom=RegExpText(Content_PageCode,"<tr><td><spanclass=blue>来源:</span>","</td></tr>",0)
ArticleContent=RegExpText(Content_PageCode,"<tr><tdclass=contentstyle=""WORD-WRAP:break-word""id=zoom>","</td></tr>"&VBCrlf&"</table>"&VBCrlf&"</td></tr></table>",0)
'=========取文章标题及内容结束=============================
Response.Write(ArticleTitle&"<br/><br/>")
Response.Flush()
Next


附几个函数:
FunctiongetHTTPPage(url)
IF(IsObjInstalled("Microsoft.XMLHTTP")=False)THEN
Response.Write"<br><br>服务器不支持Microsoft.XMLHTTP组件"
Err.Clear
Response.End
ENDIF
OnErrorResumeNext
Dimhttp
SEThttp=Server.CreateObject("Msxml2.XMLHTTP")
Http.open"GET",url,False
Http.send()
IF(Http.readystate<>4)THEN
ExitFunction
ENDIF
getHTTPPage=BytesToBSTR(Http.responseBody,"GB2312")
SEThttp=NOTHING
IF(Err.number<>0)THEN
Response.Write"<br><br>获取文件内容出错"
'Response.End
Err.Clear
ENDIF
EndFunction


FunctionBytesToBstr(CodeBody,CodeSet)
DimobjStream
SETobjStream=Server.CreateObject("adodb.stream")
objStream.Type=1
objStream.Mode=3
objStream.Open
objStream.WriteCodeBody
objStream.Position=0
objStream.Type=2
objStream.Charset=CodeSet
BytesToBstr=objStream.ReadText
objStream.Close
SETobjStream=NOTHING
EndFunction

'================================================
'作用:检查组件是否已经安装
'返回值:True----已经安装
'False----没有安装
'================================================
FunctionIsObjInstalled(objName)
OnErrorResumeNext
IsObjInstalled=False
Err=0
DimtestObj
SETtestObj=Server.CreateObject(objName)
IF(0=Err)THENIsObjInstalled=True
SETtestObj=NOTHING
Err=0
EndFunction

FunctionRegExpText(strng,strStart,strEnd,n)
DimregEx,Match,Matches,RetStr
SETregEx=NewRegExp
regEx.Pattern=strStart&"([\s\S]*?)"&strEnd
regEx.IgnoreCase=True
regEx.Global=True
SETMatches=regEx.Execute(strng)
ForEachMatchinMatches
IF(n=1)THEN
RetStr=RetStr&regEx.Replace(Match.Value,"$1")&","
ELSE
RetStr=RetStr&regEx.Replace(Match.Value,"$1")
ENDIF
Next
RegExpText=RetStr
SETregEx=NOTHING
EndFunction

声明:本页内容来源网络,仅供用户参考;我单位不保证亦不表示资料全面及准确无误,也不保证亦不表示这些资料为最新信息,如因任何原因,本网内容或者用户因倚赖本网内容造成任何损失或损害,我单位将不会负任何法律责任。如涉及版权问题,请提交至online#300.cn邮箱联系删除。

相关文章