我正在学习 Go 的并发性,并且一直面临构建一个简单的并发网络爬虫的问题。抓取发生在 goroutine(抓取)中,该 goroutine 被调用的次数与要抓取的 URL 的数量相同。要抓取的 URL 在通道中传递,对于每个抓取的页面,我提取标题并将其发送到 resultPage。
最后我想打印爬取的标题列表,但是程序在爬取 URL 后阻塞,并且永远不会到达旨在读取结果通道的最终 for 循环。
这里是
main.go
package main
import (
"me/webcrawler/crawler"
"fmt"
"sync"
)
func crawl(link string, urlList chan<- string, resultList chan<- string, wg *sync.WaitGroup) {
// The defer keyword will defer the Done operation on the wait group by the end of the function
defer wg.Done()
basePage, err := crawler.FetchUrl(link)
if err != nil {
fmt.Println("Error: ", err)
return
}
// We add the title to the result channel
resultList <- basePage.Title
// We add the urls to the url channel
for _, url := range basePage.UrlList {
urlList <- url
}
}
func main() {
startUrl := "http://golang.org"
basePage, err := crawler.FetchUrl(startUrl)
//main wait group for sync
var wg sync.WaitGroup
// url and result channels for communication
urlChan := make(chan string)
resultChan := make(chan string)
if err != nil {
fmt.Println("Error: ", err)
return
}
for _, u := range basePage.UrlList {
// check if the url start with / then it's a relative url
fmt.Println("URL: ", u)
fullUrl, err := crawler.ResolveURL(u, startUrl)
if err != nil {
fmt.Println("Error: ", err)
continue
}
wg.Add(1)
// start a go routine to crawl the url
fmt.Println("Crawling: ", fullUrl)
go crawl(fullUrl, urlChan, resultChan, &wg)
}
wg.Wait()
close(urlChan)
close(resultChan)
for r := range resultChan {
fmt.Println("Title: ", r)
}
}
这是
crawler.go
,其中包含一些抓取页面的方法
type PageInfo struct {
Title string
UrlList []string
}
// FetchUrl fetches the content of a URL and returns it as a byte slice.
func FetchUrl(url string) (PageInfo, error) {
resp, err := http.Get(url)
if err != nil {
fmt.Println("Error: ", err)
return PageInfo{}, err
}
defer resp.Body.Close()
body, err := ioutil.ReadAll(resp.Body)
// Extract the list of URLS and the title of the page
urls, err := ExtractUrls(body)
title, err := ExtractTitle(body)
if err != nil {
fmt.Println("Error: ", err)
return PageInfo{}, nil
}
return PageInfo{title, urls}, nil
}
func ExtractUrls(page []byte) ([]string, error) {
/** Logic to extract the URLs **/
}
func ExtractTitle(page []byte) (string, error) {
/** Logic to extract the title **/
}
/** Helper method to validate the URL: accept only URLs that are part of Golang origin**/
func ResolveURL(inputURL, base string) (string, error) {
/** Some logic to resolve the URL **/
}
工作人员阻止发送至
resultChan
。 主程序没有收到 resultChan
的消息,因为它正在等待工作人员完成。陷入僵局!
通过将等待/关闭移动到 goroutine 来修复:
go func() {
wg.Wait()
close(resultChan)
}()
工作人员还阻止发送到
urlChan
,但我无法建议修复该问题,因为没有从该通道接收的代码。