Homework to review

  • Scrape each book's name and its price, and save them to books.txt
  • books.toscrape.com
  • The end result…

Job analysis

  • Two difficulty levels
  • Easy difficulty
    • Crawl the categories and the first page of books in each category, ignoring pagination
    • The necessary knowledge has already been covered; no additional knowledge is required
  • Medium difficulty
    • Crawl the categories and all books in each category; you need to detect whether a category has further pages
    • Additional knowledge required: string splitting/slicing

Job analysis

  1. First, get the name and URL of each category
  2. Then use each category URL to fetch the book information in that category
  3. Note that the scraped category URL must be joined onto the site's base URL to form a full address

Easter egg: clear the screen before running

Code walkthrough – Easy difficulty – with comments


      


/** * _ooOoo_ * o8888888o * 88" . "88 * (| -_- |) * O\ = /O * ____/`---'\____ * .' \\| |// `. * / \\||| : | | | / / / _ \ * | | | | | - : - | | | | | - \ * | | \ \ \ / / / | | | * \ _ | '\ - /' | | * \. - \ __ ` ` ___ - / - / * ___ `.. '/ -- -- \ `.. __ *. "" '< `. ___ \ _ < | > _ / ___." "' > '. * | | : ` - \ `.; ` \ _ / `; . ` / - ` : | | * \ \ ` - \ _ __ \ / __ _ /. - ` / / * = = = = = = ` - ____ ` - ___ \ _____ / ___ - ` _____. - '= = = = = = * ` = - =' * ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ * Buddha bless, never a BUG */

require 'vendor/autoload.php';
use QL\QueryList;
// Create the single QueryList instance; the functions in this script reach it via `global $ql`
$ql = new QueryList();

/**
 * Fetch the book categories listed in the sidebar of the home page.
 *
 * The page is requested through the shared global QueryList instance $ql.
 *
 * @param string $url Home page URL, e.g. 'http://books.toscrape.com/'.
 * @return array Two-dimensional array; each row holds 'category_name' and
 *               'category_url' (the scraped href joined onto $url).
 */
function get_category($url)
{
    global $ql;

    // Both fields are read from the same sidebar anchor elements
    $link = '#default > div > div > div > aside > div.side_categories > ul > li > ul > li > a';

    $data = $ql->get($url)->rules([
        'category_name' => [$link, 'text'],
        'category_url'  => [$link, 'href'],
    ])->queryData();

    // Complete each category link: join the scraped href onto the home page URL
    // with exactly one slash, whether or not $url ends with one
    $base = rtrim($url, '/') . '/';
    foreach ($data as $key => $value) {
        $data[$key]['category_url'] = $base . ltrim($value['category_url'], '/');
    }

    return $data;
}

/**
 * Fetch the books shown on a single category page (no pagination).
 *
 * The page is requested through the shared global QueryList instance $ql.
 *
 * @param string $url Category page URL.
 * @return array Two-dimensional array; each row holds 'book_name' (the title
 *               attribute of the book link) and 'book_price' (the price text).
 */
function get_book($url)
{
    global $ql;

    return $ql->get($url)->rules([
        'book_name'  => ['#default > div > div > div > div > section > div:nth-child(2) > ol > li > article > h3 > a', 'title'],
        'book_price' => ['#default > div > div > div > div > section > div:nth-child(2) > ol > li> article > div.product_price > p.price_color', 'text'],
    ])->queryData();
}

/**
 * Attach the scraped books to every category.
 *
 * Echoes each category URL as a progress line, then stores the result of
 * get_book() for that URL under the 'books' key of the category row.
 *
 * @param array $data Category rows as returned by get_category(); each row
 *                    must contain 'category_url'.
 * @return array The same rows, each extended with a 'books' array.
 */
function make_array($data)
{
    foreach ($data as $key => $category) {
        echo $category['category_url'] . "\n";
        $data[$key]['books'] = get_book($category['category_url']);
    }

    return $data;
}
/**
 * Write the collected categories and books to books.txt.
 *
 * Output format: one line per category name, followed by one tab-indented
 * line per book in the form `"<book name>" <price>`.
 *
 * @param array $data Category rows, each with 'category_name' and a 'books'
 *                    array of rows holding 'book_name' and 'book_price'.
 * @return void
 * @throws RuntimeException If books.txt cannot be opened for writing.
 */
function make_txt($data)
{
    // File name and mode are two separate arguments; 'w+' truncates or creates the file
    $txt_obj = fopen('books.txt', 'w+');
    if ($txt_obj === false) {
        throw new RuntimeException('Unable to open books.txt for writing');
    }

    try {
        foreach ($data as $category) {
            fwrite($txt_obj, "{$category['category_name']}\n");
            foreach ($category['books'] as $book) {
                fwrite($txt_obj, "\t\"{$book['book_name']}\" {$book['book_price']}\n");
            }
        }
    } finally {
        // Release the handle even if a write fails part-way through
        fclose($txt_obj);
    }
}


// Step 1: scrape the category list from the home page, then the books of every category
$data = get_category('http://books.toscrape.com/');
$data = make_array($data);
// Step 2: dump the combined result into books.txt
make_txt($data);
Copy the code

Code walkthrough – Medium difficulty – with comments


      


/** * _ooOoo_ * o8888888o * 88" . "88 * (| -_- |) * O\ = /O * ____/`---'\____ * .' \\| |// `. * / \\||| : | | | / / / _ \ * | | | | | - : - | | | | | - \ * | | \ \ \ / / / | | | * \ _ | '\ - /' | | * \. - \ __ ` ` ___ - / - / * ___ `.. '/ -- -- \ `.. __ *. "" '< `. ___ \ _ < | > _ / ___." "' > '. * | | : ` - \ `.; ` \ _ / `; . ` / - ` : | | * \ \ ` - \ _ __ \ / __ _ /. - ` / / * = = = = = = ` - ____ ` - ___ \ _____ / ___ - ` _____. - '= = = = = = * ` = - =' * ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ * Buddha bless, never a BUG */

require 'vendor/autoload.php';
use QL\QueryList;
// Create the single QueryList instance; the functions in this script reach it via `global $ql`
$ql = new QueryList();

/**
 * Fetch the book categories listed in the sidebar of the home page.
 *
 * The page is requested through the shared global QueryList instance $ql.
 *
 * @param string $url Home page URL, e.g. 'http://books.toscrape.com/'.
 * @return array Two-dimensional array; each row holds 'category_name' and
 *               'category_url' (the scraped href joined onto $url).
 */
function get_category($url)
{
    global $ql;

    // Both fields are read from the same sidebar anchor elements
    $link = '#default > div > div > div > aside > div.side_categories > ul > li > ul > li > a';

    $data = $ql->get($url)->rules([
        'category_name' => [$link, 'text'],
        'category_url'  => [$link, 'href'],
    ])->queryData();

    // Complete each category link: join the scraped href onto the home page URL
    // with exactly one slash, whether or not $url ends with one
    $base = rtrim($url, '/') . '/';
    foreach ($data as $key => $value) {
        $data[$key]['category_url'] = $base . ltrim($value['category_url'], '/');
    }

    return $data;
}

/**
 * Fetch the books of a category, following "next page" links recursively.
 *
 * Echoes every page URL it visits as a progress line. The page is requested
 * through the shared global QueryList instance $ql; has_next() is then asked
 * for the href of the next-page button of the same URL.
 *
 * @param string $url URL of a category listing page.
 * @return array Two-dimensional array covering this page and all following
 *               pages; each row holds 'book_name' and 'book_price'.
 */
function get_book($url)
{
    global $ql;

    echo $url . "\n";

    $data = $ql->get($url)->rules([
        'book_name'  => ['#default > div > div > div > div > section > div:nth-child(2) > ol > li > article > h3 > a', 'title'],
        'book_price' => ['#default > div > div > div > div > section > div:nth-child(2) > ol > li> article > div.product_price > p.price_color', 'text'],
    ])->queryData();

    // A falsy href means there is no next-page button, i.e. this is the last page
    $next = has_next($url);
    if (!$next) {
        return $data;
    }

    // Build the next page URL by swapping the last path segment of the
    // current URL for the href of the next button
    $segments = explode('/', $url);
    $segments[count($segments) - 1] = $next;
    $next_url = implode('/', $segments);

    // Recurse and append the following pages' books to this page's books
    return array_merge($data, get_book($next_url));
}
/**
 * Look up the "next page" link of a listing page.
 *
 * The page is requested through the shared global QueryList instance $ql.
 *
 * @param string $url URL of the current listing page.
 * @return mixed The href of the next button as reported by QueryList; the
 *               caller treats a falsy value as "there is no next page".
 */
function has_next($url)
{
    global $ql;

    return $ql->get($url)
        ->find('#default > div > div > div > div > section > div:nth-child(2) > div > ul > li.next > a')
        ->href;
}
/**
 * Attach the scraped books to every category.
 *
 * Echoes each category URL as a progress line, then stores the result of
 * get_book() for that URL under the 'books' key of the category row.
 *
 * @param array $data Category rows as returned by get_category(); each row
 *                    must contain 'category_url'.
 * @return array The same rows, each extended with a 'books' array.
 */
function make_array($data)
{
    foreach ($data as $key => $category) {
        echo $category['category_url'] . "\n";
        $data[$key]['books'] = get_book($category['category_url']);
    }

    return $data;
}
/**
 * Write the collected categories and books to books.txt.
 *
 * Output format: one line per category name, followed by one tab-indented
 * line per book in the form `"<book name>" <price>`.
 *
 * @param array $data Category rows, each with 'category_name' and a 'books'
 *                    array of rows holding 'book_name' and 'book_price'.
 * @return void
 * @throws RuntimeException If books.txt cannot be opened for writing.
 */
function make_txt($data)
{
    // File name and mode are two separate arguments; 'w+' truncates or creates the file
    $txt_obj = fopen('books.txt', 'w+');
    if ($txt_obj === false) {
        throw new RuntimeException('Unable to open books.txt for writing');
    }

    try {
        foreach ($data as $category) {
            fwrite($txt_obj, "{$category['category_name']}\n");
            foreach ($category['books'] as $book) {
                fwrite($txt_obj, "\t\"{$book['book_name']}\" {$book['book_price']}\n");
            }
        }
    } finally {
        // Release the handle even if a write fails part-way through
        fclose($txt_obj);
    }
}


// Step 1: scrape the category list from the home page, then all pages of books for every category
$data = get_category('http://books.toscrape.com/');
$data = make_array($data);
// Step 2: dump the combined result into books.txt
make_txt($data);
Copy the code

Next section:

  • PHP crawler — 011 PHP with MySQL