Flutter has nothing like Python's powerful crawler frameworks, but it does have a package that parses HTML5. We can use it to parse web pages and build a simple crawler.
I'll use the Biquge website to demonstrate the package. First, add the dependency to pubspec.yaml:
html: ^0.14.0+3
Analysis of the website
Take a Biquge search for “Yuan Zun” as an example.
From the picture above, we can see the pieces of information we need in order to crawl the website: the domain name, the search keyword, and the URL.
- The domain name
// Host prefix that the list-item widget prepends to a book's cover path.
static const String baseImgUrl = "http://www.xbiqige.com";
// Site root; the full listing uses this as the default Dio base URL.
static const String baseUrl = "http://www.xbiqige.com/";
Copy the code
- The url
/// Relative path of the search page; the book name to look for is appended
/// directly after `searchkey=`.
static const String searchBook = "search.html?searchtype=novelname&searchkey=";
Copy the code
- The entity class information we want to show
If we want to display web search results in our App, we can construct our own entity class based on the information on the web page.
/// One row of the site's search results, as scraped from the result page.
class BookSearchItem {
  /// Title of the book.
  final String bookName;

  /// Address of the book's page.
  final String bookUrl;

  /// Author text as shown in the result row.
  final String author;

  /// Address of the latest chapter.
  final String lastUrl;

  /// Title of the latest chapter.
  final String lastTitle;

  /// Category text of the book as shown in the result row.
  final String type;

  /// Address of the cover image.
  final String bookCover;

  /// Creates an item.
  ///
  /// Note the positional order: it differs from the field order above.
  const BookSearchItem(
    this.author,
    this.lastUrl,
    this.lastTitle,
    this.type,
    this.bookCover,
    this.bookName,
    this.bookUrl,
  );

  @override
  String toString() {
    return 'BookSearchItem{bookName: $bookName, bookUrl: $bookUrl, author: $author, lastUrl: $lastUrl, lastTitle: $lastTitle, type: $type, bookCover: $bookCover}';
  }
}
Parse web pages
Right-click the page and view its source. As the saying goes, read a book a hundred times and its meaning reveals itself: keep looking at the markup and, even if you cannot recite it, you will start to see some clues. Who knows, maybe you will end up able to recite it after all…
Network request class
Tall buildings rise from their foundations, and data requests are exactly that kind of groundwork. Let's build our network request class.
Remember to add the Dio package as a dependency.
import 'package:dio/dio.dart';
/// Singleton that owns one configured [Dio] client for the whole app.
///
/// The constructors are private; obtain the shared object through
/// [DioFactory.instance].
class DioFactory {
  /// The shared factory, created the first time it is requested.
  static DioFactory get instance => _getInstance();

  // Dart initializes static fields lazily, so the factory is still only
  // built on first access.
  static final DioFactory _instance = DioFactory._internal();

  final BaseOptions _baseOptions;
  final Dio _dio;

  /// Builds the request options from [Config] and creates the client.
  ///
  /// [header] is accepted but not applied: the `headers` option below is
  /// deliberately left commented out, as in the original listing.
  DioFactory._internal({
    String basUrl = Config.baseUrl,
    Map<String, dynamic> header = Config.headers,
  }) : this._withOptions(BaseOptions(
          baseUrl: basUrl,
          connectTimeout: Config.connectTimeout,
          responseType: ResponseType.json,
          receiveTimeout: Config.receiveTimeout,
          //headers: header
        ));

  DioFactory._withOptions(this._baseOptions) : _dio = Dio(_baseOptions);

  static DioFactory _getInstance() {
    return _instance;
  }

  /// Performs a GET on [url] and returns the response body as a string.
  ///
  /// Returns an empty string when the request fails with a [DioError] or is
  /// cancelled through [cancelToken]; the error is only logged. [options]
  /// and [data] are accepted for call-site compatibility; [data] is only
  /// logged and [options] is unused.
  Future<String> getString(url, {options, cancelToken, data}) async {
    print("get==>:$url,body:$data");
    try {
      final response = await _dio.get(url, cancelToken: cancelToken);
      final body = response.data.toString();
      print(body);
      return body;
    } on DioError catch (e) {
      if (CancelToken.isCancel(e)) {
        print('Get request cancelled! ${e.message}');
      } else {
        print('Get request error: $e');
      }
      // There is no response on this path, so return the empty fallback
      // here instead of touching `response.data` after the catch.
      return "";
    }
  }
}
/// Compile-time settings shared by the network layer and the UI.
class Config {
  /// Host prefix prepended to a book's cover path when loading the image.
  static const String baseImgUrl = "http://www.xbiqige.com";

  /// Default base URL handed to Dio.
  static const String baseUrl = "http://www.xbiqige.com/";

  /// Connection timeout handed to Dio.
  // NOTE(review): presumably milliseconds, like [receiveTimeout] — confirm
  // against the Dio version in use.
  static const int connectTimeout = 8000;

  /// Maximum gap, in milliseconds, between two data events on the response
  /// stream before Dio raises a receive-timeout error.
  static const int receiveTimeout = 3000;

  /// Headers asking the server for a JSON response.
  static const Map<String, dynamic> headers = {"Accept": "application/json"};

  /// Headers for requests that also send a JSON body.
  static const Map<String, dynamic> headersJson = {
    "Accept": "application/json",
    "Content-Type": "application/json; charset=UTF-8",
  };
}
The HTML plug-in parses the code
We should first analyze the code on the page to see if there is any logic to follow.
/// Searches the site for [bookName] and scrapes the result page into a list
/// of [BookSearchItem]s.
///
/// Returns an empty list when the page has no `.librarylist` container.
/// Optional sub-elements (links, cover image) that are missing yield empty
/// strings instead of aborting the scrape. Any other failure is logged and
/// the items parsed so far are returned, so callers never see an exception.
Future<List<BookSearchItem>> fetchSearchBook(String bookName) async {
  final books = <BookSearchItem>[];
  try {
    final response = await net.getString("${StringApi.searchBook}$bookName");
    final document = parse(response);

    // Container holding every search hit.
    final content = document.querySelector(".librarylist");
    if (content == null) {
      return books;
    }

    // Each hit is split into a left part (cover, name, link) and a right
    // part (author, category, latest chapter).
    final lefts = content.querySelectorAll(".pt-ll-l");
    final rights = content.querySelectorAll(".pt-ll-r");

    // Pair them up only as far as the shorter list goes, so indexing can
    // never run out of bounds.
    final count = lefts.length > rights.length ? rights.length : lefts.length;

    for (var i = 0; i < count; i++) {
      final infoSpans = rights[i].querySelectorAll(".info>span");
      final lastLink = rights[i].querySelector(".last>a");
      final coverImg = lefts[i].querySelector("div>a>img");
      final bookLink = lefts[i].querySelector("div>a");

      final item = BookSearchItem(
        // Second span: author.
        infoSpans[1].text.trim(),
        // Latest chapter: link target and link text.
        (lastLink?.attributes["href"] ?? "").trim(),
        (lastLink?.text ?? "").trim(),
        // Third span: category.
        infoSpans[2].text.trim(),
        // Cover image address; its alt text carries the book name.
        (coverImg?.attributes['src'] ?? "").trim(),
        (coverImg?.attributes['alt'] ?? "").trim(),
        // Address of the book page.
        (bookLink?.attributes['href'] ?? "").trim(),
      );
      books.add(item);
      print(item.toString());
    }
    return books;
  } catch (e) {
    // Best effort: log and fall through to return what was collected.
    print(e);
  }
  return books;
}
Copy the code
The page display
/// Demo screen that lists the search results scraped from the site.
class DemoBiqugePage extends StatefulWidget {
  @override
  _DemoBiqugePageState createState() {
    return _DemoBiqugePageState();
  }
}
/// State for [DemoBiqugePage]: runs one search when the page is created and
/// shows the results in a separated list.
class _DemoBiqugePageState extends State<DemoBiqugePage> {
  final Api _api = Api();
  final List<BookSearchItem> _books = [];

  @override
  void initState() {
    super.initState();
    _loadBooks();
  }

  /// Fetches the search results and swaps them into [_books].
  Future<void> _loadBooks() async {
    // NOTE(review): the article text searches for "Yuan Zun"; confirm which
    // keyword is intended here.
    final data = await _api.fetchSearchBook("Yuan chun");

    // The page may have been closed while the request was in flight, and
    // calling setState on a disposed State throws.
    if (!mounted) {
      return;
    }
    setState(() {
      _books
        ..clear()
        ..addAll(data);
    });
  }

  @override
  Widget build(BuildContext context) {
    return Scaffold(
      appBar: AppBar(
        title: Text("Biquge crawler Sample"),
      ),
      // An empty list is shown as the loading placeholder.
      body: _books.isEmpty
          ? Center(
              child: Text("Loading data..."),
            )
          : Container(
              margin: EdgeInsets.all(10),
              child: ListView.separated(
                itemBuilder: (BuildContext context, int index) {
                  return item_book_search(context, _books[index]);
                },
                separatorBuilder: (BuildContext context, int index) {
                  return Divider(
                    height: 2,
                    color: Theme.of(context).primaryColor,
                  );
                },
                itemCount: _books.length,
              ),
            ),
    );
  }
}
/// Builds the list row for one search result: cover image on the left, and
/// on the right the title, a "type / author" line and the latest chapter.
///
/// The tap handler is intentionally empty in this demo.
Widget item_book_search(BuildContext context, BookSearchItem book) {
  // The type text is expected to look like "label:value"; show the part
  // after the colon, or the whole text when there is no colon, instead of
  // throwing a RangeError.
  final typeParts = book.type.split(':');
  final typeLabel = typeParts.length > 1 ? typeParts[1] : book.type;

  return Container(
    margin: EdgeInsets.only(left: 10, right: 10, top: 10),
    child: InkWell(
      borderRadius: BorderRadius.circular(20),
      onTap: () {},
      child: Container(
        child: Container(
          height: 140,
          margin: EdgeInsets.all(10),
          child: Row(
            children: <Widget>[
              Container(
                height: 120,
                width: 80,
                child: CachedNetworkImage(
                  // Cover paths are prefixed with the image host.
                  imageUrl: Config.baseImgUrl + book.bookCover,
                ),
              ),
              Expanded(
                child: Container(
                  margin: EdgeInsets.only(left: 30),
                  child: Column(
                    mainAxisAlignment: MainAxisAlignment.center,
                    crossAxisAlignment: CrossAxisAlignment.stretch,
                    children: <Widget>[
                      Container(
                        margin: EdgeInsets.only(bottom: 20),
                        child: Text(
                          book.bookName,
                          style: TextStyle(
                            fontWeight: FontWeight.w500,
                            color: Colors.black,
                            fontSize: 18.0,
                          ),
                          overflow: TextOverflow.ellipsis,
                          maxLines: 1,
                        ),
                      ),
                      Row(
                        children: <Widget>[
                          Container(
                            margin: EdgeInsets.only(top: 0),
                            child: Text(
                              "Type:$typeLabel",
                              style: TextStyle(
                                fontWeight: FontWeight.w300,
                                color: Colors.black,
                                fontSize: 12.0,
                              ),
                              overflow: TextOverflow.ellipsis,
                              maxLines: 1,
                            ),
                          ),
                          Container(
                            margin: EdgeInsets.only(left: 20),
                            child: Text(
                              book.author,
                              style: TextStyle(
                                fontWeight: FontWeight.w300,
                                color: Colors.black,
                                fontSize: 12.0,
                              ),
                              overflow: TextOverflow.ellipsis,
                              maxLines: 2,
                            ),
                          ),
                        ],
                      ),
                      Container(
                        margin: EdgeInsets.only(top: 20),
                        child: Text(
                          book.lastTitle,
                          style: TextStyle(
                            fontWeight: FontWeight.w400,
                            color: Colors.black,
                            fontSize: 16.0,
                          ),
                          overflow: TextOverflow.ellipsis,
                          maxLines: 2,
                        ),
                      ),
                    ],
                  ),
                ),
              ),
            ],
          ),
        ),
      ),
    ),
  );
}
rendering
conclusion
The plugins used above are:
dio: ^2.1.10
cached_network_image: ^2.0.0
html: ^0.14.0+3
You can copy the code above straight into your own demo and run it; it is easier to follow if you compare it with the HTML source of the Biquge pages. The parsing tools available to Flutter are certainly not as powerful as Python's, and regular expressions remain the most important way to match strings. It is not easy to extract the data you want from a sea of markup, so cherish it as you go.