0375-3382118

可以设置电话、微信、qq,并显示不同的图标

php 爬数据库,php爬虫爬取数据并存储至数据库

admin9个月前56

  php爬虫爬取数据并存储至数据库

  准备:php环境,phpspider请自行下载

  下载地址:https://github.com/owner888/phpspider

  文档:https://doc.phpspider.org/configs-members.html

  在phpspider的demo文件夹里面新建一个php文件my_spider.php,把下面的代码复制进去,在命令行下执行php my_spider.php就能看到打印信息。数据库表根据自己的需要建一两个字段测试一下就好,这里就不给出数据库sql文件了

  直接上代码:

  // Crawl the Hupu NBA news list with phpspider, fetch every linked article,
  // extract its fields via XPath and dump the collected rows.
  //
  // NOTE(review): the autoloader path assumes this file lives in phpspider's
  // demo/ directory, one level below autoloader.php - confirm for your layout.
  require_once __DIR__ . '/../autoloader.php';

  use phpspider\core\requests;
  use phpspider\core\selector;
  use phpspider\core\db;
  use phpspider\core\queue;

  /* Do NOT delete this comment */

  // MySQL connection settings.
  $db_config = [
      'host' => '127.0.0.1',
      'port' => 3306,
      'user' => 'root',
      'pass' => 'root',
      'name' => 'news',
  ];

  // Register the settings under the "default" connection name.
  $db = db::set_connect('default', $db_config);

  // Open the database link (left disabled, as in the original script).
  //db::init_mysql();

  // Redis connection settings.
  $redis_config = [
      'host'    => '127.0.0.1',
      'port'    => 6379,
      'pass'    => '',
      'db'      => 5,
      'prefix'  => 'phpspider',
      'timeout' => 30,
  ];

  queue::set_connect('redis', $redis_config);
  queue::select(5);

  // Fetch the Hupu NBA news list page.
  $html = requests::get('https://voice.hupu.com/nba');

  // Collect the article links from the list.
  $url = selector::select($html, "//div[@class='voice-main']//li//h4//a/@href");

  // The selector may hand back a single value instead of an array (the image
  // handling below guards against the same case), so normalise before looping.
  if (!is_array($url)) {
      $url = empty($url) ? [] : [$url];
  }

  //$url = array_reverse($url);

  // Candidate author ids; one is picked at random for every article.
  $admin_arr = [23, 24, 25, 26];

  // Fallback tag ids, used when no keyword rule matches.
  $tag_arr = [5, 13];

  // Keyword rules in priority order: tag id => keywords searched in the content.
  $tag_rules = [
      103 => ['采访', '记者', '报道'],
      3   => ['伤'],
      1   => ['签下', '签约', '引援', '转会'],
  ];

  // Collected rows, keyed like the link list.
  $spider_data = [];

  foreach ($url as $key => $v) {
      // Fetch the article page behind the link.
      $detail_html = requests::get($v);

      // Source address.
      $spider_data[$key]['url'] = $v;

      // Title.
      $spider_data[$key]['title'] = selector::select($detail_html, "/html/body/div[4]/div[1]/div[1]/h1");

      // Cover image.
      $spider_data[$key]['cover'] = selector::select($detail_html, "/html/body/div[4]/div[1]/div[2]/div/div[1]/img/@src");

      // All images, always stored as a JSON-encoded list.
      $imgs = selector::select($detail_html, "/html/body/div[4]/div[1]//img/@src");
      $spider_data[$key]['imgs'] = json_encode(is_array($imgs) ? $imgs : [$imgs]);

      // Article body.
      $spider_data[$key]['content'] = selector::select($detail_html, "/html/body/div[4]/div[1]/div[2]/div");

      // Publish time: selected once and reused for both timestamps. A
      // non-string selector result yields false, matching strtotime() failure.
      $pubtime = selector::select($detail_html, "//*[@id='pubtime_baidu']");
      $timestamp = is_string($pubtime) ? strtotime(trim($pubtime)) : false;
      $spider_data[$key]['create_time'] = $timestamp;
      $spider_data[$key]['update_time'] = $timestamp;

      // Source / origin of the article.
      $spider_data[$key]['source'] = selector::select($detail_html, "//*[@id='source_baidu']/a");

      // Channel id.
      $spider_data[$key]['channel_id'] = 2;

      // Status.
      $spider_data[$key]['status'] = 1;

      // Type.
      $spider_data[$key]['type'] = 1;

      // Random author id.
      $spider_data[$key]['admin_id'] = $admin_arr[array_rand($admin_arr)];

      // Tag id: the first rule with a keyword present in the content wins.
      // Comparing strpos() against false explicitly matters, because a keyword
      // at the very start of the content is found at offset 0.
      $content = $spider_data[$key]['content'];
      $text = is_string($content) ? $content : '';
      $tag_id = null;
      foreach ($tag_rules as $rule_tag => $keywords) {
          foreach ($keywords as $keyword) {
              if (strpos($text, $keyword) !== false) {
                  $tag_id = $rule_tag;
                  break 2;
              }
          }
      }
      if ($tag_id === null) {
          // No keyword matched: fall back to a random tag.
          $tag_id = $tag_arr[array_rand($tag_arr)];
      }
      $spider_data[$key]['tag_id'] = $tag_id;
  }

  var_dump($spider_data);exit;

  本博客爬取的是虎扑资讯信息,仅供技术学习交流;如果因爬取导致目标网站瘫痪,与本博无关,本博不承担任何责任。

BC链 http://www.chinabic.com/?id=239 转载需授权!

网友评论