使用 preg_match_all 从 HTML 页面抓取数据（web scraping）

问题描述 投票:0回答:1

我正在尝试使用 PHP 从这个站点 http://inkastudios.ml/test/?placa=D2D361 获取车辆信息，但我无法把 HTML 表格中的各项数据单独提取出来。

$url = "URL".$placa;
        $this->cc->referer($url);
        $Page = $this->cc->post($url,$data);
        //echo $Page;
        if($Page)
        {

            // Search Nro Placa
            //<td class="celdaclaro"><b>D2D361 </b></td>
            $patron='<td class=\"celdaclaro\">(.*)<\/b><\/td>';
            $output = preg_match_all($patron, $Page, $matches, PREG_SET_ORDER);
            if(isset($matches[0]))
            {
                $rtn += array("NroPlaca"=>trim($matches[0][1]));
            }

文件项目: cookies.txt:这个文件是由libcurl生成的!编辑风险自负。

index.php:

// Entry point: looks up the plate passed in the "placa" request parameter and
// prints the scraper's result as pretty-printed JSON.
require ("curl.php");
require ("sunarp.php");

// $placa used to be left unassigned (its only assignment was commented out),
// so the lookup below was always called with an undefined variable.
// Only accept a plain string; "placa[]=..." style input is treated as missing.
$placa = (isset($_REQUEST['placa']) && is_string($_REQUEST['placa']))
    ? trim($_REQUEST['placa'])
    : "";

$search = new Sunarp();

echo json_encode( $search->BuscaDatosSunarp($placa), JSON_PRETTY_PRINT );?>

sunarp.php:

<?php
/**
 * Looks up a vehicle by plate number and extracts fields from the HTML page
 * returned by the remote site.
 */
class Sunarp
{
    // Directory of this file; the cookie jar is stored next to it.
    var $path = "";
    // HTTP helper (cURL class from curl.php), created in the constructor.
    var $cc;

    function __construct()
    {
        $this->path = dirname(__FILE__);
        $this->cc = new cURL(true,'URL',$this->path.'/cookies.txt');
    }

    /**
     * Requests the page for a plate number and scrapes the plate cell from it.
     *
     * @param string $NroPlaca Plate number to look up.
     * @return array|false Array with the key "NroPlaca" when the cell was
     *                     found; false when the plate is empty, the request
     *                     failed, or nothing could be extracted.
     */
    function BuscaDatosSunarp($NroPlaca="")
    {
        $NroPlaca = trim((string) $NroPlaca);
        if( $NroPlaca === "" )
        {
            return false;
        }

        // Build the URL from the argument instead of re-reading $_REQUEST, so
        // the method works for any caller. The plate is percent-encoded
        // because it is appended verbatim to the URL prefix.
        $url = "URL".rawurlencode($NroPlaca);
        $this->cc->referer($url);
        $Page = $this->cc->post($url, array());
        if(!$Page)
        {
            return false;
        }

        $rtn = array();

        // Plate number cell. The pattern needs explicit delimiters: without
        // them PCRE takes "<" ... ">" as the delimiters and rejects the rest
        // of the pattern as unknown modifiers, so nothing ever matched.
        // Lazy quantifier + "s" flag: stop at the first closing </td>, even if
        // the cell content spans several lines.
        $patron = '~<td class="celdaclaro" colspan="3">(.*?)</td>~s';
        if(preg_match($patron, $Page, $matches) === 1)
        {
            // The cell may wrap its value in markup such as <b>...</b>.
            $rtn["NroPlaca"] = trim(strip_tags($matches[1]));
        }

        // The previous "count($rtn) > 2" check could never pass, since at
        // most one field is collected; return whatever was extracted.
        if(count($rtn) > 0)
        {
            return $rtn;
        }
        return false;
    }
}

curl.php:CURL库

<?php
/**
 * Small HTTP client helper.
 *
 * post()/get() use the cURL extension together with a cookie jar file.
 * post2()/get2() use file_get_contents() with a stream context and keep the
 * cookies in memory as a ready-made "Cookie:" header line.
 */
class cURL
{
    // Extra request header lines sent with every request.
    var $headers;
    var $user_agent;
    // Value passed to CURLOPT_ENCODING (accepted content encodings).
    var $compression;
    // Path of the cookie jar file used by post()/get(); null when disabled.
    var $cookie_file;
    // Whether the cookie jar file was requested in the constructor.
    var $cookies;
    var $proxy;
    var $referer;
    // Full curl_getinfo() array of the last completed post()/get() transfer.
    var $info;
    // HTTP status code of the last completed post()/get() transfer.
    var $error;
    // Effective URL of the last completed post()/get() transfer.
    var $url = false;

    var $request_cookies = '';
    // "Cookie: ...\r\n" header line appended to get2()/post2() requests.
    var $response_cookies = '';
    // Body returned by the last get2()/post2() call.
    var $content = '';

    /**
     * @param bool   $cookies     Enable the cookie jar file.
     * @param string $referer     Initial Referer value.
     * @param string $cookie      Path of the cookie jar file.
     * @param string $compression Accepted encodings for post()/get().
     * @param string $proxy       Proxy for post()/get(); empty for none.
     */
    function __construct($cookies=TRUE,$referer='https://www.google.com',$cookie='cookies.txt',$compression='gzip,deflate',$proxy='')
    {
        // No explicit Accept-Encoding header here: post()/get() let
        // CURLOPT_ENCODING advertise exactly what cURL can decode, and
        // get2()/post2() cannot decompress a response at all, so they must
        // not ask for a compressed one.
        $this->headers = array(
            "Accept-Language: es-419,es;q=0.8",
            "User-Agent: Mozilla/5.0 (X11; Fedora; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.80 Safari/537.36",
            "Content-Type: application/x-www-form-urlencoded",
            "DNT: 1",
            "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "X-Requested-With: XMLHttpRequest",
            "Connection: keep-alive",
        );

        $this->user_agent = 'Mozilla/5.0 (X11; Fedora; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.80 Safari/537.36';
        $this->compression=$compression;
        $this->proxy=$proxy;
        $this->cookies=$cookies;
        $this->referer=$referer;
        if($this->cookies == TRUE)
        {
            $this->cookie($cookie);
        }
    }

    /**
     * Legacy PHP 4 style initializer. A method named after its class is no
     * longer a constructor in PHP 8, so the real work lives in __construct();
     * this wrapper only remains for code that calls it explicitly.
     */
    function cURL($cookies=TRUE,$referer='https://www.google.com',$cookie='cookies.txt',$compression='gzip,deflate',$proxy='')
    {
        $this->__construct($cookies,$referer,$cookie,$compression,$proxy);
    }

    /**
     * Returns the curl_getinfo() data of the last completed post()/get()
     * transfer, or null if none has completed yet.
     */
    function getInfo()
    {
        return $this->info;
    }

    /**
     * Selects the cookie jar file, creating it empty when it does not exist.
     */
    function cookie($cookie_file)
    {
        if (!file_exists($cookie_file))
        {
            file_put_contents($cookie_file,"");
        }
        $this->cookie_file=$cookie_file;
    }

    /**
     * Sends a form-encoded POST request through cURL.
     *
     * The response headers are included in the returned string
     * (CURLOPT_HEADER is on for POST).
     *
     * @param string $url
     * @param array  $post    Form fields; encoded with http_build_query().
     * @param array  $options CURLOPT_* overrides; they win over the defaults.
     * @return string|false Response, or false on a transfer error, an empty
     *                      response, or an HTTP status of 400 or above.
     */
    function post( $url, array $post = array(), array $options = array() )
    {
        $defaults = array(
            CURLOPT_HEADER => true,
            CURLINFO_HEADER_OUT => true,
            CURLOPT_POST => true,
            CURLOPT_POSTFIELDS => http_build_query($post)
        ) + $this->curl_defaults($url);

        return $this->request($options + $defaults);
    }

    /**
     * Sends a GET request through cURL.
     *
     * @param string $url
     * @param array  $options CURLOPT_* overrides; they win over the defaults.
     * @return string|false Response body, or false on a transfer error, an
     *                      empty response, or an HTTP status of 400 or above.
     */
    function get( $url, array $options = array() )
    {
        $defaults = array(
            CURLOPT_HEADER => false
        ) + $this->curl_defaults($url);

        return $this->request($options + $defaults);
    }

    /**
     * Options shared by post() and get().
     */
    private function curl_defaults( $url )
    {
        $defaults = array(
            CURLOPT_FOLLOWLOCATION => true,
            // NOTE(review): certificate verification is disabled, which allows
            // man-in-the-middle attacks on HTTPS targets. Kept because callers
            // may rely on it; pass CURLOPT_SSL_VERIFYPEER => true to enable.
            CURLOPT_SSL_VERIFYPEER => false,
            CURLOPT_REFERER => $this->referer,
            CURLOPT_USERAGENT => $this->user_agent,
            CURLOPT_COOKIEFILE => $this->cookie_file,
            CURLOPT_COOKIEJAR => $this->cookie_file,
            CURLOPT_URL => $url,
            CURLOPT_FRESH_CONNECT => true,
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_FORBID_REUSE => true,
            CURLOPT_TIMEOUT => 250,
            CURLOPT_ENCODING => $this->compression,
            CURLOPT_HTTPHEADER => $this->headers
        );
        // The proxy passed to the constructor used to be stored but ignored.
        if($this->proxy != '')
        {
            $defaults[CURLOPT_PROXY] = $this->proxy;
        }
        return $defaults;
    }

    /**
     * Runs one cURL transfer and records its status code, effective URL and
     * transfer info on the object.
     *
     * @param array $settings Final CURLOPT_* array.
     * @return string|false
     */
    private function request( array $settings )
    {
        $ch = curl_init();
        curl_setopt_array($ch, $settings);
        $result = curl_exec($ch);
        if(!$result)
        {
            curl_close($ch);
            return false;
        }
        $this->info     = curl_getinfo($ch);
        $this->error    = curl_getinfo($ch,CURLINFO_HTTP_CODE);
        $this->url      = curl_getinfo($ch,CURLINFO_EFFECTIVE_URL);
        curl_close($ch);

        return ($this->error < 400) ? $result : false;
    }

    /**
     * Sets the Referer used by subsequent post()/get() calls.
     */
    function referer($url = "https://google.com.pe/")
    {
        $this->referer=$url;
    }

    /**
     * Prints an HTML error box and terminates the script.
     */
    function error($error)
    {
        // Escape the message so it cannot inject markup into the page.
        $error = htmlspecialchars((string) $error, ENT_QUOTES, 'UTF-8');
        echo "<center><div style='width:500px;border: 3px solid #FFEEFF; padding: 3px; background-color: #FFDDFF;font-family: verdana; font-size: 10px'><b>cURL Error</b><br>$error</div></center>";
        die;
    }

    // ////////////////////////////////////////////////

    /**
     * Sets the cookies sent by get2()/post2().
     *
     * @param string $cookies Cookie header value, e.g. "a=1; b=2".
     */
    function set_cookies_string($cookies)
    {
        $this->response_cookies = 'Cookie: ' . $cookies . "\r\n";
    }

    /**
     * Rebuilds the Cookie header line from the Set-Cookie response headers.
     *
     * Every Set-Cookie header contributes its name=value pair; attributes
     * such as path or expires are dropped because they do not belong in a
     * request. A later cookie with the same name replaces an earlier one.
     * The stored line is left untouched when the response set no cookies.
     */
    private function get_cookies( $http_response_header = array() )
    {
        if( !is_array($http_response_header) )
        {
            return;
        }

        $jar = array();
        foreach($http_response_header as $line)
        {
            // Header names are case-insensitive; capture up to the first ";".
            if(preg_match('/^Set-Cookie:\s*([^;\r\n]+)/i', $line, $matches))
            {
                $pair  = trim($matches[1]);
                $parts = explode('=', $pair, 2);
                $jar[$parts[0]] = $pair;
            }
        }

        if(count($jar) > 0)
        {
            $this->response_cookies = 'Cookie: ' . join('; ', $jar) . "\r\n";
        }
    }

    /**
     * Runs one file_get_contents() request with the given "http" context
     * options, stores the body in $content and refreshes the cookie header.
     *
     * @return string|false
     */
    private function stream_request( $url, array $http_options )
    {
        $context = stream_context_create(array('http' => $http_options));
        $this->content = file_get_contents($url, false, $context);
        // $http_response_header is only defined when a response was received.
        $this->get_cookies(isset($http_response_header) ? $http_response_header : array());

        return $this->content;
    }

    /**
     * Sends a GET request through file_get_contents().
     *
     * @param string $url
     * @param array  $options "http" stream context overrides.
     * @return string|false Response body, or false on failure.
     */
    function get2( $url, array $options = array() )
    {
        $options += array(
            'method'    => 'GET',
            'header'    => join("\r\n", $this->headers) . "\r\n" . $this->response_cookies,
            'timeout'   => 600
        );

        return $this->stream_request($url, $options);
    }

    /**
     * Sends a form-encoded POST request through file_get_contents().
     *
     * @param string       $url
     * @param array|string $post_data Form fields (URL-encoded here, as post()
     *                                does), or an already encoded body string.
     * @param array        $options   "http" stream context overrides.
     * @return string|false Response body, or false on failure.
     */
    function post2( $url, $post_data, array $options = array() )
    {
        // Keys and values must be URL-encoded; plain "key=value" joining
        // breaks on characters such as "&", "=" or spaces.
        $body = (is_array($post_data) || is_object($post_data))
            ? http_build_query($post_data)
            : (string) $post_data;

        $options += array(
            'method' => 'POST',
            'header' => join("\r\n", $this->headers) . "\r\n" . $this->response_cookies,
            'content' => $body,
            'timeout' => 600
        );

        return $this->stream_request($url, $options);
    }
}?>
php curl preg-match-all
1个回答
1
投票

正则表达式不是解析 HTML 的合适工具。应改用 DOMDocument 构建树结构，并用 DOMXPath 对其进行查询：

// Parse the page into a DOM tree and read the plate cell with XPath instead
// of matching the markup with a regular expression.
$dom = new DOMDocument;
// Collect libxml's parse errors internally instead of emitting PHP warnings
// for the malformed markup found on real-world pages.
$state = libxml_use_internal_errors(true);
$dom->loadHTMLFile('http://inkastudios.ml/test/?placa=D2D361');
// Buffered errors stay in memory until cleared; drop them before restoring
// the previous error-handling mode.
libxml_clear_errors();
libxml_use_internal_errors($state);

$xp = new DOMXPath($dom);

// string() yields the text of the first matching <b>, or '' when none matches.
$NroPlaca = trim($xp->evaluate('string(//td[@class="celdaclaro"]/b)'));
© www.soinside.com 2019 - 2024. All rights reserved.