%PDF- %PDF-
Direktori : /var/www/pn/utils/classes/ |
Current File : /var/www/pn/utils/classes/GizParser.php |
<?php

namespace WebPappers\GizParser;

use WebPappers\Parser\Parser;
use WebPappers\GizProject\GizProject;
use DiDom\Document;
use Exception;

/**
 * Collects project pages from www.giz.de.
 *
 * For every keyword supplied by the base class, the site search is paged
 * through, result links that look like project pages are fetched, and each
 * one is handed to a GizProject for parsing and saving.
 *
 * getKeywords() and getPageDom() are not defined in this class; they are
 * expected to come from the Parser base class.
 */
class GizParser extends Parser
{
    /** @var mixed Page limit as stored by setMaxPages(); not read anywhere else in this class. */
    private $maxPages;

    /**
     * @return mixed The value last passed to setMaxPages(), or null if never set.
     */
    public function getMaxPages()
    {
        return $this->maxPages;
    }

    /**
     * @param mixed $maxPages Page limit to store.
     */
    public function setMaxPages($maxPages)
    {
        $this->maxPages = $maxPages;
    }

    /**
     * Not implemented for GIZ; always returns null.
     *
     * The previous body consisted solely of commented-out code building a
     * search.worldbank.org API URL.
     * NOTE(review): presumably kept to satisfy the Parser contract - confirm.
     *
     * @param mixed $pageId Unused.
     * @return null
     */
    public function buildPageUrl($pageId)
    {
        // Intentionally empty.
    }

    /**
     * Not implemented for GIZ; always returns null.
     *
     * The previous body consisted solely of commented-out code decoding a
     * World Bank JSON listing.
     *
     * @param mixed $url Unused.
     * @return null
     */
    public function getProjects($url)
    {
        // Intentionally empty.
    }

    /**
     * Not implemented for GIZ; always returns null.
     *
     * The previous body consisted solely of commented-out code building a
     * search.worldbank.org project URL.
     *
     * @param mixed $projectData Unused.
     * @return null
     */
    public function getProjectUrl($projectData)
    {
        // Intentionally empty.
    }

    /**
     * Fetches a URL with cURL, following redirects and persisting cookies.
     *
     * The URL is also sent as its own referer. Cookies are read from and
     * written to "<DOCUMENT_ROOT>/utils/cookie.txt".
     *
     * @param string $url Address to request.
     * @return string|false Response body, or false when the handle cannot be
     *                      created or the transfer fails.
     */
    public function readPage($url)
    {
        $ch = curl_init();
        if ($ch === false) {
            return false;
        }

        // Same file is used as cookie source and cookie jar.
        $cookieFile = $_SERVER['DOCUMENT_ROOT'].'/utils/cookie.txt';

        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_REFERER, $url);
        curl_setopt($ch, CURLOPT_POST, 0);
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch, CURLOPT_COOKIEFILE, $cookieFile);
        curl_setopt($ch, CURLOPT_COOKIEJAR, $cookieFile);
        curl_setopt($ch, CURLOPT_USERAGENT, "Mozilla/4.0 (Windows; U; Windows NT 5.0; En; rv:1.8.0.2) Gecko/20070306 Firefox/1.0.0.4");

        $result = curl_exec($ch);
        curl_close($ch);

        return $result;
    }

    /**
     * Returns the GIZ "projektdaten" search URLs, one per group of
     * country/region codes.
     *
     * Every URL ends with an empty "nextPage=" parameter.
     * NOTE(review): presumably the caller appends a page number - confirm.
     *
     * @return string[] Six URLs, indexed 0..5.
     */
    public function getAllProjectLinksByRegions()
    {
        $prefix = 'https://www.giz.de/projektdaten/searchprojects.action'
            .'?searchParams.searchToken=&searchParams.projectNumber=&searchParams.countryRegion=';
        $suffix = '&searchParams.crsKey=&searchParams.client=&nextPage=';

        // Comma-separated code lists exactly as they appear in the query string.
        $codeGroups = array(
            'WW,AQ,DKT,ELN,IZR,KFS,KON,MFE,STL,URW,WWM,ZPS',
            'ADP,AG,AGW,AI,AMA,AME,AML,AMM,AMN,AMS,AMX,AN,AR,AW,BB,BM,BO,BR,BRJ,BS,BZ,CA,CAC,CAD,CDB,CL,CO,CR,CTT,CU,CW,DM,DO,EC,FIO,FK,FOI,GD,GF,GP,GT,GU,GY,HN,HT,JM,KAR,KN,KRP,KY,LC,MQ,MX,NI,OAS,OEC,OKI,OTA,PA,PE,PM,PR,PY,SIC,SR,SV,TT,US,UY,VC,VE,VG,VI',
            'ACS,AE,AF,ASE,ASN,ASS,BD,BH,BN,BT,CN,ESC,FOS,GUS,HK,ID,IDB,IL,IMA,IN,IQ,IR,JO,JP,KG,KH,KP,KR,KW,KZ,LA,LB,LK,MKD,MM,MN,MO,MOS,MRC,MV,MY,NP,OM,PF,PH,PK,PS,QA,SA,SAA,SEA,SG,SY,TH,TJ,TL,TM,TW,UZ,VN,YE,ZAS',
            'ABN,AFH,AFR,AFS,AO,AUK,BDE,BF,BI,BJ,BOA,BW,CBT,CD,CEA,CEM,CF,CG,CGL,CI,CIC,CIL,CM,CV,DJ,DZ,EAC,ECA,ECO,EG,ER,ET,FAC,FRS,GA,GH,GM,GN,GQ,GW,IAD,KE,KM,LR,LS,LY,MA,MAH,MG,ML,MR,MRU,MU,MW,MZ,NA,NE,NG,NON,OCA,OMV,RE,RW,SAD,SAH,SC,SD,SH,SL,SLS,SN,SO,SS,ST,SZ,TD,TG,TN,TZ,UG,ZA,ZM,ZW,EH,TF,NONE',
            'AD,AL,AM,AT,AZ,BA,BE,BG,BY,CH,CS,CY,CZ,DDR,DE,DK,EE,ES,EUM,EUR,FI,FO,FR,GB,GE,GG,GI,GL,GR,HR,HU,IE,IM,IS,IT,JE,KAU,KOS,LI,LT,LU,LV,MC,MD,ME,MK,MOE,MS,MT,NL,NO,PL,PT,RO,RS,RU,SE,SI,SK,SM,SOE,STA,TC,TR,UA,VA,XK',
            'AS,AU,CC,CK,CX,FJ,FM,KI,MH,MP,NC,NF,NR,NU,NZ,OZE,PG,PW,PZI,SB,SPC,TK,TO,TV,UM,VU,WF,WS',
        );

        $links = array();
        foreach ($codeGroups as $codes) {
            $links[] = $prefix.$codes.$suffix;
        }

        return $links;
    }

    /**
     * Lists the entries of PARSER_PATH/giz_project_json, without "." and "..".
     *
     * Keys are the positions reported by scandir(), so the result may have
     * gaps at the removed entries.
     *
     * @return string[] Entry names; empty when the directory cannot be read.
     */
    public function getAllJsonFiles()
    {
        $entries = scandir(PARSER_PATH.'/giz_project_json');
        if ($entries === false) {
            // Unreadable or missing directory: nothing to report.
            return array();
        }

        // array_diff keeps the original keys, like unsetting the two entries would.
        return array_diff($entries, array('.', '..'));
    }

    /**
     * Tells whether a search result page contains at least one result item.
     *
     * @param Document|mixed $htmlDom Parsed result page; a falsy value counts
     *                                as "no projects".
     * @return bool True when "ol.searchresultcontent .item" matches something.
     */
    public function hasProjects($htmlDom)
    {
        if (!$htmlDom) {
            return false;
        }

        return count($htmlDom->find('ol.searchresultcontent .item')) > 0;
    }

    /**
     * Percent-encodes a keyword for use as a query string value.
     *
     * Spaces become "%20"; reserved characters such as "&", "#" and "+" are
     * encoded too, so they can no longer corrupt the URL. The keyword is
     * expected to be raw text, not already encoded.
     *
     * @param string $keyword Raw search keyword.
     * @return string Encoded keyword.
     */
    public function keywordToQueryParameter($keyword)
    {
        return rawurlencode((string) $keyword);
    }

    /**
     * Builds the www.giz.de site search URL for one page of results.
     *
     * @param string     $keyword Raw search keyword.
     * @param int|string $hits    Number of results requested per page.
     * @param int|string $start   Offset of the first result; omitted from the
     *                            URL when falsy (e.g. 0).
     * @return string Search URL.
     */
    public function buildSearchUrl($keyword, $hits, $start)
    {
        $startParam = $start ? '&start='.$start : '';
        $query = $this->keywordToQueryParameter($keyword);

        return 'https://www.giz.de/search/?query='.$query.'&doc-types=html'.$startParam.'&hits='.$hits.'&countHits=true&site=www_giz_de_EN&hl=en_EN&loadTab=1';
    }

    /**
     * Extracts the href of every given link element.
     *
     * @param array $urlObjects Elements exposing an "href" property.
     * @return array Hrefs in the same order as the input.
     */
    public function prepareProjectUrlFromObject($urlObjects)
    {
        $projectUrl = array();
        foreach ($urlObjects as $link) {
            $projectUrl[] = $link->href;
        }

        return $projectUrl;
    }

    /**
     * Pages through the site search for a keyword and gathers all result links.
     *
     * Pages of 50 hits are requested until a page cannot be loaded or has no
     * result items.
     *
     * @param string $keyword Raw search keyword.
     * @return array Hrefs of all "li.item a" links found, in page order.
     */
    public function getAllProjectUrlsByKeyword($keyword)
    {
        $urls = array();
        $hits = 50;

        for ($page = 0; ; $page++) {
            $searchResultUrl = $this->buildSearchUrl($keyword, $hits, $hits * $page);
            $resultDom = $this->getPageDom($searchResultUrl);

            // A page that failed to load is treated like an empty one.
            if (!$resultDom || !$this->hasProjects($resultDom)) {
                break;
            }

            $pageUrls = $this->prepareProjectUrlFromObject($resultDom->find('li.item a'));
            $urls = array_merge($urls, $pageUrls);
        }

        return $urls;
    }

    /**
     * Tells whether a URL contains the word "worldwide" (case-insensitive).
     *
     * @param string $url URL to inspect.
     * @return bool True when the marker word is present.
     */
    public function isProjectUrl($url)
    {
        return stripos((string) $url, 'worldwide') !== false;
    }

    /**
     * Runs the crawl: for each keyword, fetches every matching project page,
     * parses it through GizProject and saves it.
     *
     * Pages whose DOM cannot be loaded are skipped.
     */
    public function parse()
    {
        $keywords = $this->getKeywords();

        foreach ($keywords as $keyword) {
            $projectUrls = $this->getAllProjectUrlsByKeyword($keyword);

            foreach ($projectUrls as $url) {
                if (!$this->isProjectUrl($url)) {
                    continue;
                }

                $projectDom = $this->getPageDom($url);
                if (!$projectDom) {
                    // Nothing to parse for this URL.
                    continue;
                }

                $project = new GizProject($projectDom, $url);
                $project->parseDataFromRemote(array('keywords' => $keyword));
                $project->save();
            }
        }
    }
}