package linkinfo import ( "errors" "fmt" "github.com/PuerkitoBio/goquery" "io" "io/ioutil" "net" "net/http" "net/url" "path" "strconv" "strings" ) const ( contentTypeHtml = "text/html" maxBodySizeBytes = 20971520 ) var privateIPBlocks []*net.IPNet func init() { for _, cidr := range []string{ "127.0.0.0/8", // IPv4 loopback "10.0.0.0/8", // RFC1918 "172.16.0.0/12", // RFC1918 "192.168.0.0/16", // RFC1918 "::1/128", // IPv6 loopback "fe80::/10", // IPv6 link-local "fc00::/7", // IPv6 unique local addr } { _, block, err := net.ParseCIDR(cidr) if err != nil { panic(fmt.Errorf("parse error on %q: %v", cidr, err)) } privateIPBlocks = append(privateIPBlocks, block) } } func isPrivateIP(ip net.IP) bool { for _, block := range privateIPBlocks { if block.Contains(ip) { return true } } return false } func (api *LinkInfoApi) DefaultLinkHandler(link string) (*LinkInfo, error) { redirects := make([]string, 0) u, err := url.Parse(link) if err != nil { return nil, err } if u.Scheme != "http" && u.Scheme != "https" { return nil, errors.New("unsupported scheme") } if u.Host == "localhost" || u.Host == "" { return nil, errors.New("invalid host") } else if ip := net.ParseIP(u.Host); ip != nil && isPrivateIP(ip) { return nil, errors.New("url is a local ip address") } var req *http.Request var res *http.Response for i := 0; i < 10; i++ { req, err = http.NewRequest(http.MethodHead, link, nil) if err != nil { return nil, err } req.Header.Set("Accept-Language", "en-US,en;q=0.9") if api.UserAgent != "" { req.Header.Set("User-Agent", api.UserAgent) } res, err = api.Client.Do(req) if err != nil { return nil, err } if (res.StatusCode == 301 || res.StatusCode == 302) && res.Header.Get("Location") != "" { link = res.Header.Get("Location") redirectUrl, err := url.Parse(link) if err != nil { return nil, err } if redirectUrl.Host == "localhost" { return nil, errors.New("url attempted to redirect to localhost") } else if ip := net.ParseIP(redirectUrl.Host); ip != nil && isPrivateIP(ip) { return nil, errors.New("host is a local ip address") } redirects = append(redirects, link) } else { break } } if res != nil && res.StatusCode != 200 { return nil, errors.New("invalid response, expected 200, got " + strconv.Itoa(res.StatusCode)) } contentType := res.Header.Get("Content-Type") if contentType == "" { contentType = api.detectContentType(link, "application/octet-stream") } if idx := strings.Index(contentType, ";"); idx != -1 { contentType = contentType[:idx] } var contentLength int64 if contentLengthStr := res.Header.Get("Content-Length"); contentLengthStr != "" { contentLength, err = strconv.ParseInt(contentLengthStr, 10, 64) } ret := &LinkInfo{ ContentType: contentType, ContentLength: contentLength, } switch contentType { case contentTypeHtml: if contentLength >= 0 && contentLength < maxBodySizeBytes { err = api.retrieveHtmlLinkTitle(ret, link) break } fallthrough default: ret.Title = fmt.Sprintf("%s (%s, %s)", path.Base(u.Path), contentType, ByteCountDecimal(contentLength)) } return ret, err } func (api *LinkInfoApi) detectContentType(link, defaultType string) string { req, err := http.NewRequest("GET", link, nil) if err != nil { return defaultType } req.Header.Set("Accept-Language", "en-US,en;q=0.9") if api.UserAgent != "" { req.Header.Set("User-Agent", api.UserAgent) } req.Header.Set("Range", "bytes=0-512") res, err := api.Client.Do(req) if err != nil { return defaultType } defer res.Body.Close() b, err := ioutil.ReadAll(io.LimitReader(res.Body, 512)) if err != nil { return defaultType } t := http.DetectContentType(b) if t == "" { t = defaultType } return t } var ( attrKeys = []string{"property", "name", "itemprop"} ) func (api *LinkInfoApi) retrieveHtmlLinkTitle(i *LinkInfo, link string) error { req, err := http.NewRequest(http.MethodGet, link, nil) if err != nil { return err } req.Header.Set("Accept-Language", "en-US,en;q=0.9") if api.UserAgent != "" { req.Header.Set("User-Agent", api.UserAgent) } res, err := api.Client.Do(req) if err != nil { return err } defer res.Body.Close() q, err := goquery.NewDocumentFromReader(io.LimitReader(res.Body, maxBodySizeBytes)) if err != nil { return err } meta := q.Find("meta") metaTags := make(map[string]string) meta.Each(func(_ int, s *goquery.Selection) { var key, value string var exists bool for _, k := range attrKeys { key, exists = s.Attr(k) if exists { break } } if key == "" { return } value, exists = s.Attr("content") if !exists { return } metaTags[key] = value }) var attr string var exists bool if attr, exists = metaTags["og:title"]; exists { i.Title = attr } else if tag := q.Find("head > title"); tag.Length() > 0 { i.Title = tag.Text() } i.Title = strings.TrimSpace(i.Title) if attr, exists = metaTags["og:description"]; exists { i.Description = attr } else if attr, exists = metaTags["description"]; exists { i.Description = attr } i.Description = strings.TrimSpace(i.Description) if attr, exists = metaTags["duration"]; exists { i.Duration = attr } return nil }