%PDF- %PDF-
Mini Shell

Mini Shell

Direktori : /var/www/pn/utils/classes/
Upload File :
Create Path :
Current File : /var/www/pn/utils/classes/GizParser.php

<?php
namespace WebPappers\GizParser;

use WebPappers\Parser\Parser;
use WebPappers\GizProject\GizProject;
use DiDom\Document;
use Exception;

/**
 * Parser for project pages published on www.giz.de.
 *
 * The crawl implemented by parse() runs a site search for every keyword,
 * pages through the result list, keeps the links that look like project
 * pages and hands each one to GizProject for extraction and persistence.
 *
 * Page fetching (getPageDom) and the keyword list (getKeywords) are not
 * defined in this class; they are presumably inherited from Parser.
 */
class GizParser extends Parser
{
    /** @var mixed Page limit stored by setMaxPages(); not read anywhere in this class. */
    private $maxPages;

    /**
     * @return mixed The value previously stored with setMaxPages(), or null.
     */
    public function getMaxPages()
    {
        return $this->maxPages;
    }

    /**
     * @param mixed $maxPages Page limit to remember.
     */
    public function setMaxPages($maxPages)
    {
        $this->maxPages = $maxPages;
    }

    /**
     * Not implemented for GIZ.
     *
     * The previous body was commented-out code targeting the World Bank
     * projects API; the method has always returned null.
     *
     * @param mixed $pageId Unused.
     * @return null
     */
    public function buildPageUrl($pageId)
    {
        return null;
    }

    /**
     * Not implemented for GIZ.
     *
     * The previous body was commented-out code targeting the World Bank
     * projects API; the method has always returned null.
     *
     * @param mixed $url Unused.
     * @return null
     */
    public function getProjects($url)
    {
        return null;
    }

    /**
     * Not implemented for GIZ.
     *
     * The previous body was commented-out code targeting the World Bank
     * projects API; the method has always returned null.
     *
     * @param mixed $projectData Unused.
     * @return null
     */
    public function getProjectUrl($projectData)
    {
        return null;
    }

    /**
     * Fetches a URL with cURL and returns the raw response body.
     *
     * Redirects are followed, the URL itself is sent as the referer, and
     * cookies are read from and written to utils/cookie.txt under the
     * document root.
     *
     * @param string $url Address to download.
     * @return string|bool Response body, or false when the transfer fails.
     */
    public function readPage($url)
    {
        // The same file serves as cookie source and cookie jar.
        $cookieFile = $_SERVER['DOCUMENT_ROOT'].'/utils/cookie.txt';

        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_REFERER, $url);
        curl_setopt($ch, CURLOPT_POST, 0);
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch, CURLOPT_COOKIEFILE, $cookieFile);
        curl_setopt($ch, CURLOPT_COOKIEJAR, $cookieFile);
        curl_setopt($ch, CURLOPT_USERAGENT, "Mozilla/4.0 (Windows; U; Windows NT 5.0; En; rv:1.8.0.2) Gecko/20070306 Firefox/1.0.0.4");
        $result = curl_exec($ch);
        curl_close($ch);

        return $result;
    }

    /**
     * Returns the GIZ project-data search URLs, one per group of
     * country/region codes.
     *
     * Every URL ends with an empty "nextPage=" parameter, so the caller is
     * expected to append a page value.
     *
     * @return string[]
     */
    public function getAllProjectLinksByRegions()
    {
        $links = array(
            'https://www.giz.de/projektdaten/searchprojects.action?searchParams.searchToken=&searchParams.projectNumber=&searchParams.countryRegion=WW,AQ,DKT,ELN,IZR,KFS,KON,MFE,STL,URW,WWM,ZPS&searchParams.crsKey=&searchParams.client=&nextPage=',
            'https://www.giz.de/projektdaten/searchprojects.action?searchParams.searchToken=&searchParams.projectNumber=&searchParams.countryRegion=ADP,AG,AGW,AI,AMA,AME,AML,AMM,AMN,AMS,AMX,AN,AR,AW,BB,BM,BO,BR,BRJ,BS,BZ,CA,CAC,CAD,CDB,CL,CO,CR,CTT,CU,CW,DM,DO,EC,FIO,FK,FOI,GD,GF,GP,GT,GU,GY,HN,HT,JM,KAR,KN,KRP,KY,LC,MQ,MX,NI,OAS,OEC,OKI,OTA,PA,PE,PM,PR,PY,SIC,SR,SV,TT,US,UY,VC,VE,VG,VI&searchParams.crsKey=&searchParams.client=&nextPage=',
            'https://www.giz.de/projektdaten/searchprojects.action?searchParams.searchToken=&searchParams.projectNumber=&searchParams.countryRegion=ACS,AE,AF,ASE,ASN,ASS,BD,BH,BN,BT,CN,ESC,FOS,GUS,HK,ID,IDB,IL,IMA,IN,IQ,IR,JO,JP,KG,KH,KP,KR,KW,KZ,LA,LB,LK,MKD,MM,MN,MO,MOS,MRC,MV,MY,NP,OM,PF,PH,PK,PS,QA,SA,SAA,SEA,SG,SY,TH,TJ,TL,TM,TW,UZ,VN,YE,ZAS&searchParams.crsKey=&searchParams.client=&nextPage=',
            'https://www.giz.de/projektdaten/searchprojects.action?searchParams.searchToken=&searchParams.projectNumber=&searchParams.countryRegion=ABN,AFH,AFR,AFS,AO,AUK,BDE,BF,BI,BJ,BOA,BW,CBT,CD,CEA,CEM,CF,CG,CGL,CI,CIC,CIL,CM,CV,DJ,DZ,EAC,ECA,ECO,EG,ER,ET,FAC,FRS,GA,GH,GM,GN,GQ,GW,IAD,KE,KM,LR,LS,LY,MA,MAH,MG,ML,MR,MRU,MU,MW,MZ,NA,NE,NG,NON,OCA,OMV,RE,RW,SAD,SAH,SC,SD,SH,SL,SLS,SN,SO,SS,ST,SZ,TD,TG,TN,TZ,UG,ZA,ZM,ZW,EH,TF,NONE&searchParams.crsKey=&searchParams.client=&nextPage=',
            'https://www.giz.de/projektdaten/searchprojects.action?searchParams.searchToken=&searchParams.projectNumber=&searchParams.countryRegion=AD,AL,AM,AT,AZ,BA,BE,BG,BY,CH,CS,CY,CZ,DDR,DE,DK,EE,ES,EUM,EUR,FI,FO,FR,GB,GE,GG,GI,GL,GR,HR,HU,IE,IM,IS,IT,JE,KAU,KOS,LI,LT,LU,LV,MC,MD,ME,MK,MOE,MS,MT,NL,NO,PL,PT,RO,RS,RU,SE,SI,SK,SM,SOE,STA,TC,TR,UA,VA,XK&searchParams.crsKey=&searchParams.client=&nextPage=',
            'https://www.giz.de/projektdaten/searchprojects.action?searchParams.searchToken=&searchParams.projectNumber=&searchParams.countryRegion=AS,AU,CC,CK,CX,FJ,FM,KI,MH,MP,NC,NF,NR,NU,NZ,OZE,PG,PW,PZI,SB,SPC,TK,TO,TV,UM,VU,WF,WS&searchParams.crsKey=&searchParams.client=&nextPage='
        );

        return $links;
    }

    /**
     * Lists the entries of PARSER_PATH/giz_project_json, without the "."
     * and ".." pseudo-entries.
     *
     * Array keys are the positions scandir() assigned, so they are not
     * guaranteed to start at 0.
     *
     * @return string[] Entry names; empty when the directory cannot be read.
     */
    public function getAllJsonFiles()
    {
        $files = scandir(PARSER_PATH.'/giz_project_json');
        if ($files === false) {
            // scandir() has already raised a warning; give callers something iterable.
            return array();
        }

        // array_diff keeps the original keys, matching the previous unset() loop.
        return array_diff($files, array('.', '..'));
    }

    /**
     * Tells whether a search result page lists at least one item.
     *
     * @param mixed $htmlDom Parsed page exposing find() (presumably a DiDom Document).
     * @return bool True when "ol.searchresultcontent .item" matches something.
     */
    public function hasProjects($htmlDom)
    {
        $results = $htmlDom->find('ol.searchresultcontent .item');

        return count($results) > 0;
    }

    /**
     * Percent-encodes a keyword for use as a query-string value.
     *
     * Spaces become "%20" as before; other reserved characters ("&", "#",
     * "+", ...) and non-ASCII bytes are now escaped too, so they can no
     * longer break or truncate the search URL.
     *
     * @param string $keyword Raw keyword.
     * @return string Encoded keyword.
     */
    public function keywordToQueryParameter($keyword)
    {
        return rawurlencode($keyword);
    }

    /**
     * Builds the giz.de site-search URL for one page of results.
     *
     * @param string $keyword Raw search term; encoded here.
     * @param int    $hits    Number of results per page.
     * @param int    $start   Offset of the first result; omitted from the URL when falsy.
     * @return string
     */
    public function buildSearchUrl($keyword, $hits, $start)
    {
        $startParam = '';
        if ($start) {
            $startParam = '&start='.$start;
        }
        $keyword = $this->keywordToQueryParameter($keyword);
        $url = 'https://www.giz.de/search/?query='.$keyword.'&doc-types=html'.$startParam.'&hits='.$hits.'&countHits=true&site=www_giz_de_EN&hl=en_EN&loadTab=1';

        return $url;
    }

    /**
     * Collects the href of every link element in the given list.
     *
     * @param array $urlObjects Elements exposing an href property.
     * @return array The href values, in input order.
     */
    public function prepareProjectUrlFromObject($urlObjects)
    {
        $projectUrl = array();

        foreach ($urlObjects as $obj) {
            $projectUrl[] = $obj->href;
        }

        return $projectUrl;
    }

    /**
     * Pages through the site search for a keyword and gathers every result link.
     *
     * Requests 50 hits per page and stops at the first page that has no
     * result items or that could not be loaded.
     *
     * @param string $keyword Raw search term.
     * @return array Result URLs across all pages.
     */
    public function getAllProjectUrlsByKeyword($keyword)
    {
        $urls = array();
        $hits = 50;

        for ($page = 0; ; $page++) {
            $searchResultUrl = $this->buildSearchUrl($keyword, $hits, $hits * $page);
            $resultDom = $this->getPageDom($searchResultUrl);

            // A falsy DOM means the fetch failed; stop instead of calling find() on it.
            if (!$resultDom || !$this->hasProjects($resultDom)) {
                break;
            }

            $urlObjects = $resultDom->find('li.item a');
            $urls = array_merge($urls, $this->prepareProjectUrlFromObject($urlObjects));
        }

        return $urls;
    }

    /**
     * Tells whether a URL points at a project page.
     *
     * The check is a case-insensitive search for "worldwide" anywhere in the URL.
     *
     * @param string $url
     * @return bool
     */
    public function isProjectUrl($url)
    {
        return stripos($url, 'worldwide') !== false;
    }

    /**
     * Runs the crawl: for each keyword, fetch every matching project page,
     * extract its data and save it.
     *
     * The keyword that produced a hit is passed to the project as
     * additional data under the "keywords" key.
     */
    public function parse()
    {
        $keywords = $this->getKeywords();

        foreach ($keywords as $keyword) {
            $projectUrls = $this->getAllProjectUrlsByKeyword($keyword);

            foreach ($projectUrls as $url) {
                if (!$this->isProjectUrl($url)) {
                    continue;
                }

                $projectDom = $this->getPageDom($url);
                if (!$projectDom) {
                    // Page could not be loaded; skip it rather than parse a missing DOM.
                    continue;
                }

                $project = new GizProject($projectDom, $url);
                $additionalData = array();
                $additionalData['keywords'] = $keyword;
                $project->parseDataFromRemote($additionalData);
                $project->save();
            }
        }
    }
}

Zerion Mini Shell 1.0