今天闲聊群里有朋友问我,怎么判断今天是不是法定节假日——是正常上班、放假,还是假期补班。想了想,应该有现成的 API,直接调用就行;不过也可以从国务院官网发布的放假通知里抓取。今天就跟大家分享一下如何用 PHP 实现爬虫抓取页面。说到爬虫,大家首先想到的是 Python……
今天闲聊群里有朋友问我,怎么判断今天是不是法定节假日——是正常上班、放假,还是假期补班。想了想,应该有现成的 API,直接调用就行;不过也可以从国务院官网发布的放假通知里抓取。今天就跟大家分享一下如何用 PHP 实现爬虫抓取页面。
说到爬虫,大家首先想到的是 Python。其实用什么语言都一样:无非是抓取网页数据,再用正则分析网页结构,把想要的信息取出来;更深一层,也无非是递归爬取所有链接、操作数据库等。用 PHP 做爬虫也不算稀奇。今天这个需求很简单:抓取一篇放假通知,从中整理出法定节假日期间哪几天放假、哪几天补班,并整理成规则的数组或 JSON,主要用到 cURL 和正则。
下面还是先上代码:
<?php

/**
 * Scrapes a State Council holiday notice and turns it into a flat list of
 * days off and make-up working days.
 *
 * Each entry of the result is an array with three keys:
 *   - "holidayName": name of the holiday; make-up working days get the
 *                    holiday name followed by "补班"
 *   - "type":        "休假" for a day off, "正常上班" for a make-up working day
 *   - "date":        "Y-m-d" for days off, "yy-n-j" for make-up working days
 *
 * Instantiating the class immediately downloads the notice and dumps the
 * parsed result.
 */
class calendar
{
    /**
     * Notice page to scrape. The path is case-sensitive, so it is written in
     * lower case.
     */
    const NOTICE_URL = 'http://www.gov.cn/zhengce/content/2021-10/25/content_5644835.htm';

    /**
     * Matches "[YYYY年]M月D日"; group 1 is the optional year (empty string
     * when absent), groups 2 and 3 are month and day.
     */
    const DATE_PATTERN = '/(?:(\d{4})年)?(\d{1,2})月(\d{1,2})日/';

    public function __construct()
    {
        $this->getHolidays();
    }

    /**
     * Downloads the notice, parses it and dumps the result.
     *
     * @return array[] The parsed entries; an empty array when the download
     *                 failed or no holiday paragraph was found.
     */
    public function getHolidays()
    {
        $html = $this->curlGet(self::NOTICE_URL);

        if ($html === false) {
            // Do not feed `false` into the parser; report and dump nothing.
            trigger_error('calendar: failed to download ' . self::NOTICE_URL, E_USER_WARNING);
            $data = [];
        } else {
            $data = $this->parseHolidays($html, (int) date('Y'));
        }

        $this->dump($data);

        return $data;
    }

    /**
     * Extracts holiday entries from the notice HTML.
     *
     * A paragraph counts as a holiday paragraph when its text contains
     * "共N天". Dates written without a year use the latest year mentioned in
     * any holiday paragraph, and only fall back to $fallbackYear when the
     * notice never states a year.
     *
     * @param string $html         Raw HTML of the notice page.
     * @param int    $fallbackYear Year to assume when the notice names none.
     *
     * @return array[] Entries in document order.
     */
    private function parseHolidays($html, $fallbackYear)
    {
        if (!preg_match_all('/<p\b[^>]*>.*?<\/p>/s', $html, $found)) {
            return [];
        }

        $paragraphs = [];
        $defaultYear = null;
        foreach ($found[0] as $rawParagraph) {
            $text = trim(strip_tags($rawParagraph));
            if (preg_match('/共\d+天/', $text) !== 1) {
                continue;
            }
            $paragraphs[] = $text;
            if (preg_match_all('/(\d{4})年/', $text, $years)) {
                $defaultYear = max((int) $defaultYear, max(array_map('intval', $years[1])));
            }
        }
        if ($defaultYear === null) {
            $defaultYear = (int) $fallbackYear;
        }

        $data = [];
        foreach ($paragraphs as $text) {
            foreach ($this->parseParagraph($text, $defaultYear) as $row) {
                $data[] = $row;
            }
        }

        return $data;
    }

    /**
     * Parses one holiday paragraph of the form
     * "<ordinal>、<name>:<start date>[至<end>]...,共N天。[<make-up days>上班。]".
     *
     * @param string $text        Paragraph text with tags already removed.
     * @param int    $defaultYear Year for dates that do not spell one out.
     *
     * @return array[] Days off followed by make-up working days; empty when
     *                 the paragraph does not have the expected shape.
     */
    private function parseParagraph($text, $defaultYear)
    {
        $sentences = explode('。', $text);

        // Accept both the full-width and the half-width colon after the name.
        $headParts = preg_split('/:|:/', $sentences[0], 2);
        if (count($headParts) < 2) {
            return [];
        }
        $labelParts = explode('、', $headParts[0]);
        $holidayName = trim(end($labelParts));

        if (preg_match('/共(\d+)天/', $text, $lengthMatch) !== 1
            || preg_match(self::DATE_PATTERN, $headParts[1], $startMatch) !== 1
        ) {
            return [];
        }

        $startYear = $startMatch[1] !== '' ? (int) $startMatch[1] : $defaultYear;
        $startMonth = (int) $startMatch[2];
        $startDay = (int) $startMatch[3];
        if (!checkdate($startMonth, $startDay, $startYear)) {
            return [];
        }
        $start = new DateTimeImmutable(sprintf('%04d-%02d-%02d', $startYear, $startMonth, $startDay));

        $rows = [];
        $length = (int) $lengthMatch[1];
        for ($offset = 0; $offset < $length; $offset++) {
            // Calendar arithmetic instead of adding multiples of 86400 seconds.
            $rows[] = [
                'holidayName' => $holidayName,
                'type' => '休假',
                'date' => $start->modify('+' . $offset . ' day')->format('Y-m-d'),
            ];
        }

        foreach (array_slice($sentences, 1) as $sentence) {
            if (strpos($sentence, '上班') === false
                || !preg_match_all(self::DATE_PATTERN, $sentence, $workdays, PREG_SET_ORDER)
            ) {
                continue;
            }
            foreach ($workdays as $workday) {
                $year = $workday[1] !== '' ? (int) $workday[1] : $defaultYear;
                // NOTE(review): make-up days keep the historical two-digit,
                // non-padded "yy-n-j" format so existing consumers and the
                // published sample output stay valid; it differs from the
                // "Y-m-d" used for days off and should be unified once
                // consumers can be updated.
                $rows[] = [
                    'holidayName' => $holidayName . '补班',
                    'type' => '正常上班',
                    'date' => sprintf('%02d-%d-%d', $year % 100, (int) $workday[2], (int) $workday[3]),
                ];
            }
        }

        return $rows;
    }

    /**
     * Performs an HTTP GET request.
     *
     * @param string      $url      Address to fetch.
     * @param string|null $username Optional user name for HTTP authentication.
     * @param string|null $password Optional password for HTTP authentication.
     *
     * @return string|false Response body, or false on a transport error or an
     *                      HTTP status of 400 and above.
     */
    private function curlGet($url, $username = null, $password = null)
    {
        $ch = curl_init();
        if ($ch === false) {
            return false;
        }

        // TLS peer/host verification is left at cURL's secure defaults, and no
        // Content-Type header is sent because a GET request carries no body.
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_HTTPGET, true);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
        curl_setopt($ch, CURLOPT_MAXREDIRS, 5);
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
        curl_setopt($ch, CURLOPT_TIMEOUT, 30); // give up after 30 seconds

        if ($username !== null && $password !== null) {
            curl_setopt($ch, CURLOPT_HTTPAUTH, CURLAUTH_ANY);
            curl_setopt($ch, CURLOPT_USERPWD, $username . ':' . $password);
        }

        $result = curl_exec($ch);
        $failed = $result === false || curl_errno($ch) !== 0;
        // The status code is only available after the transfer has run.
        $statusCode = (int) curl_getinfo($ch, CURLINFO_HTTP_CODE);

        // The handle is released when $ch goes out of scope; no explicit
        // curl_close() call is needed.
        if ($failed || $statusCode >= 400) {
            return false;
        }

        return $result;
    }

    /**
     * Prints a value with var_dump() inside a <pre> element so the structure
     * stays readable in a browser.
     *
     * @param mixed $data Value to print.
     *
     * @return void
     */
    private function dump($data)
    {
        echo "<pre>";
        var_dump($data);
        echo "</pre>";
    }
}

$c = new calendar();
思路也很简单:用 cURL 请求抓取网页源码,再想办法把需要的数据抠出来——可以自己写正则匹配,可以用 DOM 类,也可以用第三方库。我这个需求比较简单,就直接用正则和字符串操作把内容转换成数组。不得不说,PHP 的精髓就是字符串和数组,尤其是数组,一个 array 就实现了各种类型;字符串函数也很方便,很多都是直接调用 C 库函数。
运行了一下,爬虫爬到了放假和补班的日期数据,如下:
array(38) { [0]=> array(3) { ["holidayName"]=> string(6) "元旦" ["type"]=> string(6) "休假" ["date"]=> string(10) "2022-01-01" } [1]=> array(3) { ["holidayName"]=> string(6) "元旦" ["type"]=> string(6) "休假" ["date"]=> string(10) "2022-01-02" } [2]=> array(3) { ["holidayName"]=> string(6) "元旦" ["type"]=> string(6) "休假" ["date"]=> string(10) "2022-01-03" } [3]=> array(3) { ["holidayName"]=> string(6) "春节" ["type"]=> string(6) "休假" ["date"]=> string(10) "2022-01-31" } [4]=> array(3) { ["holidayName"]=> string(6) "春节" ["type"]=> string(6) "休假" ["date"]=> string(10) "2022-02-01" } [5]=> array(3) { ["holidayName"]=> string(6) "春节" ["type"]=> string(6) "休假" ["date"]=> string(10) "2022-02-02" } [6]=> array(3) { ["holidayName"]=> string(6) "春节" ["type"]=> string(6) "休假" ["date"]=> string(10) "2022-02-03" } [7]=> array(3) { ["holidayName"]=> string(6) "春节" ["type"]=> string(6) "休假" ["date"]=> string(10) "2022-02-04" } [8]=> array(3) { ["holidayName"]=> string(6) "春节" ["type"]=> string(6) "休假" ["date"]=> string(10) "2022-02-05" } [9]=> array(3) { ["holidayName"]=> string(6) "春节" ["type"]=> string(6) "休假" ["date"]=> string(10) "2022-02-06" } [10]=> array(3) { ["holidayName"]=> string(12) "春节补班" ["type"]=> string(12) "正常上班" ["date"]=> string(7) "22-1-29" } [11]=> array(3) { ["holidayName"]=> string(12) "春节补班" ["type"]=> string(12) "正常上班" ["date"]=> string(7) "22-1-30" } [12]=> array(3) { ["holidayName"]=> string(9) "清明节" ["type"]=> string(6) "休假" ["date"]=> string(10) "2022-04-03" } [13]=> array(3) { ["holidayName"]=> string(9) "清明节" ["type"]=> string(6) "休假" ["date"]=> string(10) "2022-04-04" } [14]=> array(3) { ["holidayName"]=> string(9) "清明节" ["type"]=> string(6) "休假" ["date"]=> string(10) "2022-04-05" } [15]=> array(3) { ["holidayName"]=> string(15) "清明节补班" ["type"]=> string(12) "正常上班" ["date"]=> string(6) "22-4-2" } [16]=> array(3) { ["holidayName"]=> string(9) "劳动节" ["type"]=> string(6) "休假" ["date"]=> string(10) "2022-04-30" } [17]=> array(3) { ["holidayName"]=> string(9) 
"劳动节" ["type"]=> string(6) "休假" ["date"]=> string(10) "2022-05-01" } [18]=> array(3) { ["holidayName"]=> string(9) "劳动节" ["type"]=> string(6) "休假" ["date"]=> string(10) "2022-05-02" } [19]=> array(3) { ["holidayName"]=> string(9) "劳动节" ["type"]=> string(6) "休假" ["date"]=> string(10) "2022-05-03" } [20]=> array(3) { ["holidayName"]=> string(9) "劳动节" ["type"]=> string(6) "休假" ["date"]=> string(10) "2022-05-04" } [21]=> array(3) { ["holidayName"]=> string(15) "劳动节补班" ["type"]=> string(12) "正常上班" ["date"]=> string(7) "22-4-24" } [22]=> array(3) { ["holidayName"]=> string(15) "劳动节补班" ["type"]=> string(12) "正常上班" ["date"]=> string(6) "22-5-7" } [23]=> array(3) { ["holidayName"]=> string(9) "端午节" ["type"]=> string(6) "休假" ["date"]=> string(10) "2022-06-03" } [24]=> array(3) { ["holidayName"]=> string(9) "端午节" ["type"]=> string(6) "休假" ["date"]=> string(10) "2022-06-04" } [25]=> array(3) { ["holidayName"]=> string(9) "端午节" ["type"]=> string(6) "休假" ["date"]=> string(10) "2022-06-05" } [26]=> array(3) { ["holidayName"]=> string(9) "中秋节" ["type"]=> string(6) "休假" ["date"]=> string(10) "2022-09-10" } [27]=> array(3) { ["holidayName"]=> string(9) "中秋节" ["type"]=> string(6) "休假" ["date"]=> string(10) "2022-09-11" } [28]=> array(3) { ["holidayName"]=> string(9) "中秋节" ["type"]=> string(6) "休假" ["date"]=> string(10) "2022-09-12" } [29]=> array(3) { ["holidayName"]=> string(9) "国庆节" ["type"]=> string(6) "休假" ["date"]=> string(10) "2022-10-01" } [30]=> array(3) { ["holidayName"]=> string(9) "国庆节" ["type"]=> string(6) "休假" ["date"]=> string(10) "2022-10-02" } [31]=> array(3) { ["holidayName"]=> string(9) "国庆节" ["type"]=> string(6) "休假" ["date"]=> string(10) "2022-10-03" } [32]=> array(3) { ["holidayName"]=> string(9) "国庆节" ["type"]=> string(6) "休假" ["date"]=> string(10) "2022-10-04" } [33]=> array(3) { ["holidayName"]=> string(9) "国庆节" ["type"]=> string(6) "休假" ["date"]=> string(10) "2022-10-05" } [34]=> array(3) { ["holidayName"]=> string(9) "国庆节" ["type"]=> string(6) "休假" 
["date"]=> string(10) "2022-10-06" } [35]=> array(3) { ["holidayName"]=> string(9) "国庆节" ["type"]=> string(6) "休假" ["date"]=> string(10) "2022-10-07" } [36]=> array(3) { ["holidayName"]=> string(15) "国庆节补班" ["type"]=> string(12) "正常上班" ["date"]=> string(7) "22-10-8" } [37]=> array(3) { ["holidayName"]=> string(15) "国庆节补班" ["type"]=> string(12) "正常上班" ["date"]=> string(7) "22-10-9" }}
来源地址:https://blog.csdn.net/sdxjwkq01/article/details/127621947
--结束END--
本文标题: php实现爬虫抓取法定节假日放假和补班安排数据
本文链接: https://www.lsjlt.com/news/386327.html(转载时请注明来源链接)
有问题或投稿请发送至: 邮箱/279061341@qq.com QQ/279061341
下载Word文档到电脑,方便收藏和打印~
2024-02-29
2024-02-29
2024-02-29
2024-02-29
2024-02-29
2024-02-29
2024-02-29
2024-02-29
2024-02-29
2024-02-29
回答
回答
回答
回答
回答
回答
回答
回答
回答
回答
0