ter = array_value($config, 'filter');
$arr = array_value($filter, $type);
$enable = array_value($arr, 'enable');
$wordarr = array_value($arr, 'keyword');
if (0 == $enable || empty($wordarr)) return FALSE;
foreach ($wordarr as $_keyword) {
if (!$_keyword) continue;
$r = strpos(strtolower($keyword), strtolower($_keyword));
if (FALSE !== $r) {
$error = $_keyword;
return TRUE;
}
}
return FALSE;
}
/**
 * Build the scheme-and-host prefix of the current request,
 * e.g. "http://domain.com" or "https://domain.com".
 *
 * The request counts as HTTPS when $_SERVER['HTTPS'] is non-empty and not
 * "off" (case-insensitive), or when the X-Forwarded-Proto header is "https".
 * NOTE(review): X-Forwarded-Proto is client-controllable unless a trusted
 * proxy overwrites it - confirm the deployment.
 *
 * @return string Scheme plus host without a trailing slash. The host falls back
 *                to SERVER_NAME, then to an empty string, when HTTP_HOST is absent.
 */
function url_prefix()
{
    $https = empty($_SERVER['HTTPS']) ? '' : strtolower((string)$_SERVER['HTTPS']);
    $forwarded = isset($_SERVER['HTTP_X_FORWARDED_PROTO']) ? strtolower((string)$_SERVER['HTTP_X_FORWARDED_PROTO']) : '';

    // Servers report "on", "1", "ON", ...; IIS reports "off" for plain HTTP
    $is_secure = ('' !== $https && 'off' !== $https) || 'https' === $forwarded;

    if (isset($_SERVER['HTTP_HOST'])) {
        $host = $_SERVER['HTTP_HOST'];
    } elseif (isset($_SERVER['SERVER_NAME'])) {
        $host = $_SERVER['SERVER_NAME'];
    } else {
        // e.g. CLI: no request host available
        $host = '';
    }

    return ($is_secure ? 'https://' : 'http://') . $host;
}
/**
 * Generate a unique identifier.
 *
 * The value is uniqid() prefixed with 8 hex characters taken from the MD5 of
 * the current microtime concatenated with a random 4-digit number.
 *
 * @return string
 */
function uniq_id()
{
    $seed = microtime(true) . mt_rand(1000, 9999);
    $prefix = substr(md5($seed), 8, 8);

    return uniqid($prefix);
}
/**
 * Generate a 14-digit order number from the current time.
 *
 * The number is the Unix timestamp followed by four decimal digits of the
 * sub-second part. Formatting goes through sprintf('%.4F'), which is
 * locale-independent and not affected by the "precision" ini setting, so the
 * result always has exactly 14 digits.
 *
 * @return string 14 decimal digits
 */
function trade_no()
{
    $stamp = sprintf('%.4F', microtime(true));
    $digits = str_replace('.', '', $stamp);

    // Normalise the length: right-pad short values with zeros, truncate long ones
    return str_pad(substr($digits, 0, 14), 14, '0');
}
/**
 * Generate a 16-digit order number: the Unix timestamp in seconds followed by
 * the first six digits of the microsecond fraction.
 *
 * @return string
 */
function trade_no_16()
{
    // microtime() yields "0.12345600 1700000000" (fraction, then seconds)
    list($fraction, $seconds) = explode(' ', microtime());
    $micro = mb_substr($fraction, 2, 6, 'UTF-8');

    return $seconds . $micro;
}
/**
 * Number of days in the year that contains the given timestamp.
 *
 * @param int|null $time Unix timestamp; the current time is used when it evaluates to 0.
 * @return int 365, or 366 in a leap year
 */
function date_year($time = NULL)
{
    if (!intval($time)) {
        $time = time();
    }
    // 'L' is 1 in a leap year and 0 otherwise
    $leap_flag = date('L', $time);

    return $leap_flag + 365;
}
/**
 * Day of the year for the given timestamp, starting at 0.
 *
 * @param int|null $time Unix timestamp; the current time is used when it evaluates to 0.
 * @return string date('z') value, "0" to "365"
 */
function date_z($time = NULL)
{
    if (!intval($time)) {
        $time = time();
    }

    return date('z', $time);
}
/**
 * Day of the month without a leading zero ("1" to "31").
 *
 * @param int|null $time Unix timestamp; the current time is used when it evaluates to 0.
 * @return string date('j') value
 */
function date_j($time = NULL)
{
    if (!intval($time)) {
        $time = time();
    }

    return date('j', $time);
}
/**
 * Day of the month as two digits with a leading zero ("01" to "31").
 *
 * @param int|null $time Unix timestamp; the current time is used when it evaluates to 0.
 * @return string date('d') value
 */
function date_d($time = NULL)
{
    if (!intval($time)) {
        $time = time();
    }

    return date('d', $time);
}
/**
 * ISO-8601 day of the week: "1" for Monday through "7" for Sunday.
 *
 * @param int|null $time Unix timestamp; the current time is used when it evaluates to 0.
 * @return string date('N') value
 */
function date_w_n($time = NULL)
{
    if (!intval($time)) {
        $time = time();
    }

    return date('N', $time);
}
/**
 * ISO-8601 week number of the year for the given timestamp ("01" to "53").
 *
 * @param int|null $time Unix timestamp; the current time is used when it evaluates to 0.
 * @return string date('W') value
 */
function date_d_w($time = NULL)
{
    if (!intval($time)) {
        $time = time();
    }

    return date('W', $time);
}
/**
 * Month number without a leading zero ("1" to "12").
 *
 * @param int|null $time Unix timestamp; the current time is used when it evaluates to 0.
 * @return string date('n') value
 */
function date_n($time = NULL)
{
    if (!intval($time)) {
        $time = time();
    }

    return date('n', $time);
}
/**
 * Number of days in the month that contains the given timestamp.
 *
 * @param int|null $time Unix timestamp; the current time is used when it evaluates to 0.
 * @return string date('t') value, "28" to "31"
 */
function date_t($time = NULL)
{
    if (!intval($time)) {
        $time = time();
    }

    return date('t', $time);
}
/**
 * Unix timestamp of 00:00:00 today, in the default timezone.
 *
 * @return int|false strtotime() result
 */
function clock_zero()
{
    $today = date('Ymd');

    return strtotime($today);
}
/**
 * Unix timestamp of 24:00 today, i.e. 00:00:00 tomorrow in the default timezone.
 *
 * Uses the relative format "tomorrow" instead of adding 86400 seconds to
 * today's midnight, so the result is still a midnight on days that are 23 or
 * 25 hours long because of a daylight-saving switch.
 *
 * @return int|false strtotime() result
 */
function clock_twenty_four()
{
    return strtotime('tomorrow');
}
/**
 * Seconds remaining until the next 08:00 expiry.
 *
 * Before 08:00 the deadline is 08:00 today; from 08:00 onwards it moves to
 * 08:00 of the next day. The hour test is strictly "< 8": with "<= 8" the whole
 * 08:00-08:59 hour still targeted today's 08:00 and gave a zero or negative lifetime.
 *
 * NOTE(review): the deadline is always derived from the real current clock,
 * while $time is only what gets subtracted - confirm callers pass "now".
 *
 * @param int|null $time Reference Unix timestamp; the current time is used when it evaluates to 0.
 * @return int Lifetime in seconds
 */
function eight_expired($time = NULL)
{
    $time = intval($time) ? $time : time();
    $eight_hours = 28800;

    if (date('G') < 8) {
        // Still before 08:00: expire at 08:00 today
        $deadline = strtotime(date('Ymd')) + $eight_hours;
    } else {
        // 08:00 or later: expire at 08:00 of the following day
        $deadline = clock_twenty_four() + $eight_hours;
    }

    return $deadline - $time;
}
/**
 * Seconds remaining until 24:00 today, as reported by clock_twenty_four().
 *
 * @param int|null $time Reference Unix timestamp; the current time is used when it evaluates to 0.
 * @return int Lifetime in seconds
 */
function twenty_four_expired($time = NULL)
{
    if (!intval($time)) {
        $time = time();
    }

    return clock_twenty_four() - $time;
}
/**
 * Perform an HTTP(S) request, through cURL when the extension is loaded.
 *
 * Without cURL the call is delegated to https_post() (non-empty $post) or
 * http_get() (empty $post); both helpers are defined outside this block.
 *
 * @param string $url Request URL; an empty value makes the function return FALSE.
 * @param string|array $post POST payload (arrays are encoded with http_build_query()).
 *                           Empty means a plain GET. The literal 'GET' makes the function
 *                           return the effective URL after redirects instead of the body.
 * @param string|array $cookie Cookie header value (arrays are encoded with http_build_query()
 *                             on the cURL path).
 *                             NOTE(review): that yields "a=1&b=2", not the usual "a=1; b=2"
 *                             cookie syntax - confirm this is intended.
 * @param int $timeout Timeout in seconds, or in milliseconds when $ms is truthy.
 * @param int $ms Set to 1 to interpret $timeout as milliseconds.
 * @return mixed curl_exec() result, the effective URL when $post is 'GET',
 *               or whatever the non-cURL fallback helper returns.
 */
function https_request($url, $post = '', $cookie = '', $timeout = 30, $ms = 0)
{
if (empty($url)) return FALSE;
// Below PHP 5.2.3 millisecond mode is switched off and the timeout is reset to 30
if (version_compare(PHP_VERSION, '5.2.3', '<')) {
$ms = 0;
$timeout = 30;
}
is_array($post) and $post = http_build_query($post);
// cURL not installed: fall back to the plain HTTP helpers (POST is supported)
if (!extension_loaded('curl')) {
//throw new Exception('server not install CURL');
if ($post) {
return https_post($url, $post, $cookie, $timeout);
} else {
return http_get($url, $cookie, $timeout);
}
}
is_array($cookie) and $cookie = http_build_query($cookie);
$curl = curl_init();
// Return the result from curl_exec() instead of printing it
curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);
// The default of CURLOPT_SAFE_UPLOAD differs between PHP 5.5 and PHP 5.6
if (class_exists('\CURLFile')) {
curl_setopt($curl, CURLOPT_SAFE_UPLOAD, true);
} else {
defined('CURLOPT_SAFE_UPLOAD') and curl_setopt($curl, CURLOPT_SAFE_UPLOAD, false);
}
// Set the request URL
curl_setopt($curl, CURLOPT_URL, $url);
// Decide whether response headers are part of the returned data
if (ini_get('safe_mode') && ini_get('open_basedir')) {
// Only applies when $post is the 'GET' sentinel
if ('GET' == $post) {
// In safe mode the header information is emitted as part of the data stream
curl_setopt($curl, CURLOPT_HEADER, true);
// In safe mode the redirects are fetched one after another (no body is requested)
curl_setopt($curl, CURLOPT_NOBODY, true);
}
} else {
curl_setopt($curl, CURLOPT_HEADER, false);
// Allow up to 10 redirects
curl_setopt($curl, CURLOPT_MAXREDIRS, 10);
// Follow redirects automatically, ending at the last Location
curl_setopt($curl, CURLOPT_FOLLOWLOCATION, true);
}
// Default user agent used when the current request carries none
$ua1 = 'Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1';
$ua = empty($_SERVER["HTTP_USER_AGENT"]) ? $ua1 : $_SERVER["HTTP_USER_AGENT"];
curl_setopt($curl, CURLOPT_USERAGENT, $ua);
// HTTPS compatibility (the check matches "https://" anywhere in the URL)
// NOTE(review): peer and host verification are disabled here - confirm this is acceptable
if (FALSE !== stripos($url, 'https://')) {
curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, FALSE);
curl_setopt($curl, CURLOPT_SSL_VERIFYHOST, FALSE);
// SSL version control
//curl_setopt($curl, CURLOPT_SSLVERSION, CURL_SSLVERSION_TLSv1);
// NOTE(review): a boolean is passed where a CURL_SSLVERSION_* constant is expected - confirm
curl_setopt($curl, CURLOPT_SSLVERSION, true);
}
$header = array('Content-type: application/x-www-form-urlencoded;charset=UTF-8', 'X-Requested-With: XMLHttpRequest');
$cookie and $header[] = "Cookie: $cookie";
curl_setopt($curl, CURLOPT_HTTPHEADER, $header);
// NOTE(review): the 'GET' sentinel is a non-empty string, so this POST branch also runs for it - confirm
if ($post) {
// POST
curl_setopt($curl, CURLOPT_POST, true);
// Set the Referer automatically
curl_setopt($curl, CURLOPT_AUTOREFERER, true);
curl_setopt($curl, CURLOPT_POSTFIELDS, $post);
}
if ($ms) {
curl_setopt($curl, CURLOPT_NOSIGNAL, true); // required for millisecond timeouts
curl_setopt($curl, CURLOPT_TIMEOUT_MS, intval($timeout)); // timeout in milliseconds
} else {
curl_setopt($curl, CURLOPT_TIMEOUT, intval($timeout)); // timeout in seconds
}
// Prefer resolving IPv6, fall back to IPv4 after a timeout
//curl_setopt($curl, CURLOPT_IPRESOLVE, CURL_IPRESOLVE_V4);
curl_setopt($curl, CURLOPT_ENCODING, 'gzip');
// Execute the request and keep the result
$output = curl_exec($curl);
// Effective URL: return the final URL rather than the page content
// (original author's note: CURLOPT_RETURNTRANSFER must be false)
'GET' == $post and $output = curl_getinfo($curl, CURLINFO_EFFECTIVE_URL);
curl_close($curl);
return $output;
}
/**
 * Download the resource at the given URL with cURL and return its raw content.
 *
 * Connection timeout is 10 seconds, gzip encoding is accepted, and the result
 * is returned rather than printed.
 * NOTE(review): SSL peer and host verification are disabled - confirm this is acceptable.
 *
 * @param string $img URL to fetch
 * @return string|bool curl_exec() result
 */
function save_image($img)
{
    $handle = curl_init();
    curl_setopt_array($handle, array(
        CURLOPT_URL            => $img,
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_CONNECTTIMEOUT => 10,
        CURLOPT_SSL_VERIFYHOST => false,
        CURLOPT_SSL_VERIFYPEER => false,
        CURLOPT_ENCODING       => 'gzip',
    ));

    $body = curl_exec($handle);
    curl_close($handle);

    return $body;
}
/**
 * Compute the X offset that centres a TrueType string on a background.
 *
 * @param float  $size     Font size
 * @param string $str      Text to measure
 * @param string $font     Path to the TrueType font file
 * @param int    $width    Background width
 * @param int    $multiple Divisor applied to the remaining width (2 centres the text)
 * @return float|int
 */
function calculate_str_width($size, $str, $font, $width, $multiple = 2)
{
    $bounds = imagettfbbox($size, 0, $font, $str);
    // imagettfbbox(): index 4 is the upper-right X, index 6 the upper-left X
    $upper_right_x = $bounds[4];
    $upper_left_x = $bounds[6];
    $remaining = $width - $upper_right_x - $upper_left_x;

    return $remaining / $multiple;
}
/**
 * Recursively scan a directory and print every file whose extension is one
 * of php, asp, jsp, cgi, exe or dll (case-insensitive).
 *
 * Each hit is echoed as the suspicious-file label followed by the path and a
 * line break. Nothing is printed for a path that is not a directory or cannot be read.
 *
 * @param string $path Directory to scan
 * @return void
 */
function search_directory($path)
{
    if (!is_dir($path)) return;

    $entries = scandir($path);
    // scandir() returns FALSE when the directory cannot be read
    if (FALSE === $entries) return;

    foreach ($entries as $val) {
        if ('.' == $val || '..' == $val) continue;

        $sub_path = $path . '/' . $val;
        if (is_dir($sub_path)) {
            // Descend into the sub-directory
            search_directory($sub_path);
            continue;
        }

        // Regular entry: compare its lower-cased extension with the blacklist
        $ext = strtolower(file_ext($sub_path));
        if (in_array($ext, array('php', 'asp', 'jsp', 'cgi', 'exe', 'dll'), TRUE)) {
            echo '异常文件:' . $sub_path . "\n";
        }
    }
}
/**
 * Flatten a one-dimensional array into "key=value&key=value" strings.
 *
 * The 'sign' entry is removed, the remaining keys are sorted with ksort(), and
 * entries whose value is NULL, FALSE, an empty string or an array are skipped.
 * Values 0 and "0" are kept: they are real values and dropping them would
 * change the string being signed.
 *
 * @param array  $arr  One-dimensional input array
 * @param string $sign Receives the raw string to be signed
 * @param string $url  Receives the urlencode()d query string, passed through htmlspecialchars()
 * @return void
 * @throws Exception When $arr contains a non-empty nested array
 */
function array_to_string($arr, &$sign = '', &$url = '')
{
    if (count($arr) != count($arr, 1)) throw new Exception('Does not support multi-dimensional array to string');

    // The signature itself never takes part in signing
    unset($arr['sign']);
    ksort($arr);

    $sign_pairs = array();
    $url_pairs = array();
    foreach ($arr as $key => $val) {
        if (is_array($val) || NULL === $val || FALSE === $val || '' === $val) continue;
        $val = (string)$val;
        $sign_pairs[] = $key . '=' . $val;
        $url_pairs[] = $key . '=' . urlencode($val);
    }

    // implode() yields '' for no pairs on every PHP version (substr('', 0, -1) did not)
    $sign = implode('&', $sign_pairs);
    $url = htmlspecialchars(implode('&', $url_pairs));
}
/**
 * Sign data with an RSA private key and return the Base64-encoded signature.
 *
 * @param string $data      Data to sign
 * @param string $key       Private key body without the PEM header/footer lines
 * @param string $sign_type 'RSA2' selects SHA-256; any other value selects SHA-1
 * @return string Base64-encoded signature
 * @throws Exception When OpenSSL or SHA-256 support is missing, the key cannot
 *                   be loaded, or signing fails (previously an empty signature was returned)
 */
function rsa_create_sign($data, $key, $sign_type = 'RSA')
{
    if (!function_exists('openssl_sign')) throw new Exception('OpenSSL extension is not enabled');
    if (!defined('OPENSSL_ALGO_SHA256')) throw new Exception('Only versions above PHP 5.4.8 support SHA256');

    // Re-wrap the key body at 64 characters and add the PEM envelope
    $pem = "-----BEGIN RSA PRIVATE KEY-----\n" . wordwrap($key, 64, "\n", true) . "\n-----END RSA PRIVATE KEY-----";

    // Load the key explicitly so that a malformed key is reported instead of ignored
    $private_key = openssl_pkey_get_private($pem);
    if (FALSE === $private_key) throw new Exception('Private Key Error');

    $algorithm = ('RSA2' == $sign_type) ? OPENSSL_ALGO_SHA256 : OPENSSL_ALGO_SHA1;

    $sign = '';
    if (!openssl_sign($data, $sign, $private_key, $algorithm)) throw new Exception('OpenSSL sign failed');

    // Encode the binary signature for transport
    return base64_encode($sign);
}
/**
 * Verify a Base64-encoded RSA signature with a public key.
 *
 * @param string $data      Data that was signed
 * @param string $sign      Base64-encoded signature
 * @param string $key       Public key body without the PEM header/footer lines
 * @param string $sign_type 'RSA2' selects SHA-256; any other value selects SHA-1
 * @return bool TRUE only when openssl_verify() reports a valid signature (1)
 * @throws Exception When wrapping the key fails
 */
function rsa_verify_sign($data, $sign, $key, $sign_type = 'RSA')
{
    $wrapped = wordwrap($key, 64, "\n", true);
    if (FALSE === $wrapped) throw new Exception('Public Key Error');

    $pem = "-----BEGIN PUBLIC KEY-----\n" . $wrapped . "\n-----END PUBLIC KEY-----";
    $algorithm = ('RSA2' == $sign_type) ? OPENSSL_ALGO_SHA256 : OPENSSL_ALGO_SHA1;
    $signature = base64_decode($sign);

    // openssl_verify(): 1 = valid, 0 = invalid, -1 = error
    return 1 === openssl_verify($data, $signature, $pem, $algorithm);
}
/**
 * Serialise a flat array into an XML document,
 * e.g. array('appid' => 'appid', 'code' => 'success').
 *
 * Numeric values are written as plain text; every other value is wrapped in
 * a CDATA section. Each key becomes an element name and is not escaped.
 * NOTE(review): the root element name "xml" and the CDATA wrapping follow the
 * WeChat Pay message convention - confirm against the callers of this helper.
 *
 * @param array $arr Non-empty flat array of element name => value
 * @return string XML document
 * @throws Exception When $arr is not an array or is empty
 */
function array_to_xml($arr)
{
    if (!is_array($arr) || empty($arr)) throw new Exception('Array Error');

    $xml = '<xml>';
    foreach ($arr as $key => $val) {
        if (is_numeric($val)) {
            $xml .= '<' . $key . '>' . $val . '</' . $key . '>';
        } else {
            // A literal "]]>" would end the CDATA section early, so split it across two sections
            $safe = str_replace(']]>', ']]]]><![CDATA[>', (string)$val);
            $xml .= '<' . $key . '><![CDATA[' . $safe . ']]></' . $key . '>';
        }
    }
    $xml .= '</xml>';

    return $xml;
}
/**
 * Parse an XML string into an array of its top-level elements.
 *
 * CDATA sections are merged into text (LIBXML_NOCDATA). External entity
 * loading is switched off while parsing on PHP versions below 8.0; from
 * PHP 8.0 on libxml_disable_entity_loader() is deprecated and is not called.
 *
 * @param string $xml XML document
 * @return array (array) cast of the parsed SimpleXMLElement
 * @throws Exception When $xml is empty or cannot be parsed
 *                   (previously a parse failure returned array(false))
 */
function xml_to_array($xml)
{
    if (!$xml) throw new Exception('XML error');

    $toggle_loader = PHP_VERSION_ID < 80000 && function_exists('libxml_disable_entity_loader');
    $old_loader = $toggle_loader ? libxml_disable_entity_loader(true) : NULL;
    // Collect parse errors internally instead of emitting warnings
    $old_errors = libxml_use_internal_errors(true);

    $element = simplexml_load_string($xml, 'SimpleXMLElement', LIBXML_NOCDATA | LIBXML_COMPACT);

    // Restore the previous libxml state on every path
    libxml_clear_errors();
    libxml_use_internal_errors($old_errors);
    if ($toggle_loader) libxml_disable_entity_loader($old_loader);

    if (FALSE === $element) throw new Exception('XML error');

    return (array)$element;
}
/**
 * Lazily read a file line by line, yielding each line trimmed.
 *
 * The loop stops as soon as fgets() reports end of file, so a file ending in
 * a newline no longer produces one extra empty line. The handle is closed
 * even when the consumer stops iterating early.
 *
 * @param string $file Path of the file to read
 * @return Generator Yields one trimmed line (string) per iteration; yields nothing
 *                   when the file cannot be opened
 */
function well_import($file)
{
    $handle = fopen($file, 'r');
    if (!$handle) return;

    try {
        while (FALSE !== ($line = fgets($handle))) {
            yield trim($line);
        }
    } finally {
        fclose($handle);
    }
}
/**
 * Count the lines well_import() yields for a file, with two cache layers:
 * a per-request static array and the external cache (cache_get()/cache_set()).
 *
 * A non-zero count is stored in the external cache for 300 seconds.
 * NOTE(review): the cache key does not include $file, so different files
 * that share a key also share a count - confirm callers pass distinct keys.
 *
 * @param string $file Path of the file to count
 * @param string $key  Cache key
 * @return int Number of lines
 */
function well_import_total($file, $key = 'well_import_total')
{
    static $cache = array();
    if (isset($cache[$key])) return $cache[$key];

    $total = cache_get($key);
    if (NULL === $total) {
        $total = 0;
        // Walk the generator manually; only the number of iterations matters
        for ($lines = well_import($file); $lines->valid(); $lines->next()) {
            ++$total;
        }
        if ($total) cache_set($key, $total, 300);
    }

    $cache[$key] = $total;
    return $total;
}
// Accumulator shared by every well_search_dir() call; FALSE until first use
$g_dir_file = FALSE;
/**
 * Recursively collect the paths of all files below a directory.
 *
 * Results are appended to the global $g_dir_file, so repeated calls within
 * one request keep adding to the same list.
 *
 * @param string $path Directory to walk
 * @return array|bool The global $g_dir_file list
 */
function well_search_dir($path)
{
    global $g_dir_file;
    if (FALSE === $g_dir_file) {
        $g_dir_file = array();
    }

    if (!is_dir($path)) {
        return $g_dir_file;
    }

    foreach (scandir($path) as $entry) {
        if ('.' == $entry || '..' == $entry) continue;

        $child = $path . '/' . $entry;
        if (is_dir($child)) {
            // Sub-directory: recurse, results land in the shared list
            well_search_dir($child);
        } else {
            $g_dir_file[] = $child;
        }
    }

    return $g_dir_file;
}
?>大数据项目:职务分析(一)——数据获取-阿南达文事网
大数据项目:职务分析(一)——数据获取
编程日记630
更新时间:2025-05-10 13:16:44
大数据项目:职务分析(一)——数据获取
项目介绍:该项目规模较小,适合学习时使用。主要目的是获取猎聘网上各个岗位的数据并进行简单的分析,从多个方面分析岗位之间的关系以及薪资的差异。
采用的技术有:
python爬虫:
hadoop:hdfs存储数据
hive on spark : 进行数据分析
sqoop: 将分析的结果传输到关系型数据库当中
superset:进行数据的可视化
首先是将数据从猎聘官网当中获取:
爬取"技术"一栏当中四十九个岗位对应的数据。
先获得各个岗位的url并进行跳转,在每个网页当中获取有用的信息,比如:岗位、地址、薪资、公司规模、要求掌握的技术、学历要求和经验要求。对这一页的数据爬取完之后,通过 find_element_by_xpath() 锁定下一页的链接,跳转到下一页再进行数据的爬取,如此往复,从而得到所有想要的数据。
话不多说,代码实现为:
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.webdriver import Options
from lxml import etree
import os


def share_brower():
    """Create the Chrome WebDriver used by the whole script."""
    chrome_options = Options()
    # Uncomment to run the browser in the background; a visible window makes
    # it easier to watch what the crawler is doing.
    # chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    # Raw string: the backslashes are literal path separators
    path = r'C:\Program Files (x86)\Google\Chrome\Application\chrome.exe'
    chrome_options.binary_location = path
    brower = webdriver.Chrome(chrome_options=chrome_options)
    return brower


def save(source, number, name1):
    """Extract the job rows from one result page and write them to disk.

    Rows go to ./date/<name1>/<number>.csv as comma-separated fields; all
    label texts of the page are appended to ./keyword.txt.
    """
    tree = etree.HTML(source)
    # Job title
    position = tree.xpath('//ul/li//div[@class="job-title-box"]/div[1]/text()')
    # Location
    addr = tree.xpath('//ul/li//div[@class="job-title-box"]/div[2]/span[2]/text()')
    # Salary
    salary = tree.xpath('//ul/li//div[@class="job-detail-header-box"]/span/text()')
    # Company
    company = tree.xpath('//ul/li//div[@class="job-company-info-box"]/span/text()')
    # Company size
    scale = tree.xpath('//ul/li//div[@class="job-company-info-box"]/div[@class="company-tags-box ellipsis-1"]/span[last()]/text()')
    # Required work experience
    experience = tree.xpath('//ul/li//div[@class="job-labels-box"]/span[1]/text()')
    # Required education
    xueli = tree.xpath('//ul/li//div[@class="job-labels-box"]/span[2]/text()')
    # All labels; the trailing part can be stripped with a regex or handled later in Hadoop
    keyword = tree.xpath('//ul/li//div[@class="job-labels-box"]/span/text()')
    # The lists may differ in length, so only complete rows are written
    mi = min(len(position), len(addr), len(salary), len(company), len(scale), len(xueli), len(experience))
    with open('./date/' + name1.strip() + "/" + str(number) + '.csv', 'w', encoding='utf-8') as fs:
        for l in range(mi):
            new = position[l] + ',' + addr[l] + ',' + salary[l] + ',' + company[l] + ',' + scale[l]+','+experience[l]+','+xueli[l]+'\t\n'
            fs.write(new)
    with open('./keyword.txt', 'a', encoding='utf-8') as fs:
        ne = ''
        for i in keyword:
            ne = ne + i + ' '
        fs.write(ne)


# NOTE(review): the site URLs are empty in this copy of the script - fill in
# base_url and the start page before running.
base_url = ''
brower = share_brower()
brower.get('/')
brower.implicitly_wait(3)
page = brower.page_source
tree = etree.HTML(page)
# Job category names in the first sidebar entry
name = tree.xpath('//ul[@class="sidebar float-left"]/li[1]//dd/a/text()')
# Matching links; each link leads to ten result pages to crawl
url = tree.xpath('//ul[@class="sidebar float-left"]/li[1]//dd/a/@href')

# Storage layout: 49 technical job categories, one folder per category named
# after it, ten files per folder named 1.csv to 10.csv (one per page), fields
# separated by commas, saved under the current path. After a successful crawl
# everything is uploaded together to the local folder of the big-data cluster.
for i in range(len(name)):
    if not os.path.exists('./date/'+name[i]):
        os.mkdir('./date/'+name[i])  # create the category folder
        brower.get(base_url+url[i])
        brower.implicitly_wait(3)
        source = brower.page_source
        number = 1
        save(source, number, name[i])
        print(name[i])
        try:
            # The last pagination item is the "next page" link; follow it nine times
            for j in range(9):
                element = brower.find_element_by_xpath('//div[@class="list-pagination-box"]//li[last()]/a')
                element.click()
                # Advance first so page 2 goes to 2.csv instead of overwriting 1.csv
                number += 1
                save(brower.page_source, number, name[i])
        except WebDriverException:
            # Selenium raises WebDriverException subclasses (e.g. a missing next-page link)
            print("*"*30+"有错误,但是可以执行的哦!!")
            continue
    else:
        print("文件已经存在")
        # NOTE(review): this removes the directory that already exists - confirm this is intended
        os.rmdir('./date/'+name[i])
        continue

# Quit the browser at the very end
brower.quit()
最后结果为:
每一层和里面的数据保存形式都如上所述,后续通过简单的mapreduce实现数据的处理,并上传至hdfs当中,下期继续。
本文发布于:2024-11-10,感谢您对本站的认可!
版权声明:本站内容均来自互联网,仅供演示用,请勿用于商业和其他非法用途。如果侵犯了您的权益请与我们联系,我们将在24小时内删除。
本文标签:大数据项目职务分析(一)数据获取
发布评论