As part of the “A Tour of Go” section on golang.org, I was trying to parallelize a (formerly single-threaded) web crawler using goroutines. I got it working, but it doesn’t seem to “flow” right; there’s a bunch of duplicated code. I’m looking for advice on how to make it a bit more Go-literate.
package main

import (
    "fmt"
    "sync"
)

type Fetcher interface {
    // Fetch returns the body of URL and
    // a slice of URLs found on that page.
    Fetch(url string) (body string, urls []string, err error)
}

// safeMap bundles the set of URLs already seen with a count of
// fetches still outstanding, both guarded by a mutex.
type safeMap struct {
    seen      map[string]bool
    mu        sync.Mutex
    remaining int
}

// Crawl uses fetcher to recursively crawl
// pages starting with url, to a maximum of depth.
func gci(url string, depth int, fetcher Fetcher, m *safeMap, fc chan string) {
    go crawl_int(url, depth, fetcher, m, fc)
}

func crawl_int(url string, depth int, fetcher Fetcher, m *safeMap, fc chan string) {
    if depth > 0 {
        body, urls, err := fetcher.Fetch(url)
        //fc <- fmt.Sprintf("Crawling %s\n", url)
        m.mu.Lock()
        defer m.mu.Unlock()
        if err != nil {
            fc <- fmt.Sprintf("%v\n", err)
        } else {
            fc <- fmt.Sprintf("found: %s %q %d\n", url, body, len(urls))
            for _, u := range urls {
                _, found := m.seen[u]
                if !found {
                    m.remaining += 1
                    m.seen[u] = true
                    // Start the child crawl when this function returns
                    // (deferred calls run after the bookkeeping below).
                    defer gci(u, depth-1, fetcher, m, fc)
                }
            }
        }
    } else {
        m.mu.Lock()
        defer m.mu.Unlock()
    }
    // This URL is done; if nothing is left outstanding, close the
    // channel so the range loop in main terminates.
    m.remaining -= 1
    //fc <- fmt.Sprintf("finished %s remaining to %d\n", url, m.remaining)
    if m.remaining == 0 {
        //fc <- fmt.Sprintf("closing")
        close(fc)
    }
}

func Crawl(url string, depth int, fetcher Fetcher, ch chan string) {
    // Fetches URLs in parallel.
    // Doesn't fetch the same URL twice.
    c := safeMap{seen: make(map[string]bool), remaining: 1}
    go crawl_int(url, depth, fetcher, &c, ch)
}

func main() {
    ch := make(chan string, 5)
    Crawl("https://golang.org/", 4, fetcher, ch)
    for u := range ch {
        fmt.Print(u)
    }
}

// fakeFetcher is a Fetcher that returns canned results.
type fakeFetcher map[string]*fakeResult

type fakeResult struct {
    body string
    urls []string
}

func (f fakeFetcher) Fetch(url string) (string, []string, error) {
    if res, ok := f[url]; ok {
        return res.body, res.urls, nil
    }
    return "", nil, fmt.Errorf("not found: %s", url)
}

// fetcher is a populated fakeFetcher.
var fetcher = fakeFetcher{
    "https://golang.org/": &fakeResult{
        "The Go Programming Language",
        []string{
            "https://golang.org/pkg/",
            "https://golang.org/cmd/",
        },
    },
    "https://golang.org/pkg/": &fakeResult{
        "Packages",
        []string{
            "https://golang.org/",
            "https://golang.org/cmd/",
            "https://golang.org/pkg/fmt/",
            "https://golang.org/pkg/os/",
        },
    },
    "https://golang.org/pkg/fmt/": &fakeResult{
        "Package fmt",
        []string{
            "https://golang.org/",
            "https://golang.org/pkg/",
        },
    },
    "https://golang.org/pkg/os/": &fakeResult{
        "Package os",
        []string{
            "https://golang.org/",
            "https://golang.org/pkg/",
        },
    },
}
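
For reference, this is the rough direction I was thinking of to get rid of the duplicated lock/unlock and the hand-rolled remaining counter: a sync.WaitGroup plus a small goroutine that closes the channel once everything is done. This is only an untested sketch, not something I'm claiming is the canonical solution; crawlState, firstVisit, crawlWG and CrawlWG are names I made up, and it reuses the Fetcher interface and the fetcher variable from the listing above.

    // crawlState guards the seen set with a mutex and tracks
    // outstanding crawls with a WaitGroup instead of a counter.
    type crawlState struct {
        mu   sync.Mutex
        seen map[string]bool
        wg   sync.WaitGroup
    }

    // firstVisit marks url as seen and reports whether it was new.
    func (s *crawlState) firstVisit(url string) bool {
        s.mu.Lock()
        defer s.mu.Unlock()
        if s.seen[url] {
            return false
        }
        s.seen[url] = true
        return true
    }

    func crawlWG(url string, depth int, fetcher Fetcher, s *crawlState, out chan<- string) {
        defer s.wg.Done()
        if depth <= 0 {
            return
        }
        body, urls, err := fetcher.Fetch(url)
        if err != nil {
            out <- fmt.Sprintf("%v\n", err)
            return
        }
        out <- fmt.Sprintf("found: %s %q %d\n", url, body, len(urls))
        for _, u := range urls {
            if s.firstVisit(u) {
                s.wg.Add(1)
                go crawlWG(u, depth-1, fetcher, s, out)
            }
        }
    }

    // CrawlWG starts the crawl and closes out once every goroutine has
    // finished, so main can still just range over the channel.
    func CrawlWG(url string, depth int, fetcher Fetcher, out chan string) {
        s := &crawlState{seen: map[string]bool{url: true}}
        s.wg.Add(1)
        go crawlWG(url, depth, fetcher, s, out)
        go func() {
            s.wg.Wait()
            close(out)
        }()
    }

main would then call CrawlWG("https://golang.org/", 4, fetcher, ch) instead of Crawl and keep the same range loop. Is that the right direction, or is there a more idiomatic pattern for this?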