linkinfo/default.go

276 lines
5.2 KiB
Go

package linkinfo
import (
"errors"
"fmt"
"github.com/PuerkitoBio/goquery"
"io"
"io/ioutil"
"net"
"net/http"
"net/url"
"path"
"strconv"
"strings"
)
const (
	// contentTypeHtml is the media type for which the link handler fetches
	// the document and extracts a title instead of synthesizing one.
	contentTypeHtml = "text/html"

	// maxBodySizeBytes (20 MiB) bounds both the advertised Content-Length
	// accepted for HTML documents and the number of body bytes handed to the
	// HTML parser.
	maxBodySizeBytes = 20 * 1024 * 1024
)
// privateIPBlocks holds the networks that must never be contacted on behalf
// of a user-supplied link (loopback, private and link-local ranges). It is
// populated once by init and only read afterwards.
var privateIPBlocks []*net.IPNet

// init parses the hard-coded CIDR list into privateIPBlocks. The list is a
// compile-time constant, so a parse failure is a programming error and
// panics at start-up.
func init() {
	for _, cidr := range []string{
		"127.0.0.0/8",    // IPv4 loopback
		"10.0.0.0/8",     // RFC1918
		"172.16.0.0/12",  // RFC1918
		"192.168.0.0/16", // RFC1918
		"169.254.0.0/16", // RFC3927 IPv4 link-local (IPv4 counterpart of fe80::/10 below)
		"::1/128",        // IPv6 loopback
		"fe80::/10",      // IPv6 link-local
		"fc00::/7",       // IPv6 unique local addr
	} {
		_, block, err := net.ParseCIDR(cidr)
		if err != nil {
			panic(fmt.Errorf("parse error on %q: %v", cidr, err))
		}
		privateIPBlocks = append(privateIPBlocks, block)
	}
}
// isPrivateIP reports whether ip lies inside any of the networks listed in
// privateIPBlocks.
func isPrivateIP(ip net.IP) bool {
	for idx := range privateIPBlocks {
		if privateIPBlocks[idx].Contains(ip) {
			return true
		}
	}
	return false
}
// DefaultLinkHandler probes link with HEAD requests and returns basic
// metadata (content type, content length and a title) for the resource it
// finally resolves to.
//
// Only http and https URLs are accepted. The host of the initial URL and of
// every redirect target is rejected when it is empty, is "localhost", or is
// a literal IP address inside privateIPBlocks. Host names are not resolved
// here, so a name that resolves to a private address is not detected by
// these checks.
//
// Up to maxRedirects redirect responses carrying a Location header are
// followed manually; relative Location values are resolved against the URL
// that produced them. The final response must have status 200.
//
// For text/html resources whose advertised length is below maxBodySizeBytes
// the title is extracted from the document; for everything else a title of
// the form "<file name> (<type>, <size>)" is synthesized from the original
// URL path. When title extraction fails, the partially filled LinkInfo is
// returned together with the error.
func (api *LinkInfoApi) DefaultLinkHandler(link string) (*LinkInfo, error) {
	const maxRedirects = 10

	u, err := url.Parse(link)
	if err != nil {
		return nil, err
	}
	if u.Scheme != "http" && u.Scheme != "https" {
		return nil, errors.New("unsupported scheme")
	}
	// Hostname() strips the port and IPv6 brackets; comparing u.Host directly
	// would let "localhost:8080", "127.0.0.1:80" or "[::1]" slip through.
	host := u.Hostname()
	if host == "" || strings.EqualFold(host, "localhost") {
		return nil, errors.New("invalid host")
	}
	if ip := net.ParseIP(host); ip != nil && isPrivateIP(ip) {
		return nil, errors.New("url is a local ip address")
	}

	var res *http.Response
	for i := 0; i < maxRedirects; i++ {
		req, err := http.NewRequest(http.MethodHead, link, nil)
		if err != nil {
			return nil, err
		}
		req.Header.Set("Accept-Language", "en-US,en;q=0.9")
		if api.UserAgent != "" {
			req.Header.Set("User-Agent", api.UserAgent)
		}
		res, err = api.Client.Do(req)
		if err != nil {
			return nil, err
		}
		// Only the headers are needed; release the body right away so it is
		// not leaked on every hop.
		if res.Body != nil {
			res.Body.Close()
		}

		redirect := false
		switch res.StatusCode {
		case http.StatusMovedPermanently, http.StatusFound, http.StatusSeeOther,
			http.StatusTemporaryRedirect, http.StatusPermanentRedirect:
			redirect = true
		}
		location := res.Header.Get("Location")
		if !redirect || location == "" {
			break
		}

		// Location may be relative; resolve it against the URL just requested.
		target, err := req.URL.Parse(location)
		if err != nil {
			return nil, err
		}
		if target.Scheme != "http" && target.Scheme != "https" {
			return nil, errors.New("unsupported scheme")
		}
		targetHost := target.Hostname()
		if targetHost == "" {
			return nil, errors.New("invalid host")
		}
		if strings.EqualFold(targetHost, "localhost") {
			return nil, errors.New("url attempted to redirect to localhost")
		}
		if ip := net.ParseIP(targetHost); ip != nil && isPrivateIP(ip) {
			return nil, errors.New("host is a local ip address")
		}
		link = target.String()
	}

	// The loop always runs at least once and returns on error, so res is set.
	// If the redirect budget was exhausted, res is still a redirect response
	// and is rejected here.
	if res.StatusCode != http.StatusOK {
		return nil, errors.New("invalid response, expected 200, got " + strconv.Itoa(res.StatusCode))
	}

	contentType := res.Header.Get("Content-Type")
	if contentType == "" {
		contentType = api.detectContentType(link, "application/octet-stream")
	}
	// Drop parameters such as "; charset=utf-8" and normalize, since media
	// types are case-insensitive and may be followed by whitespace.
	if idx := strings.Index(contentType, ";"); idx != -1 {
		contentType = contentType[:idx]
	}
	contentType = strings.ToLower(strings.TrimSpace(contentType))

	var contentLength int64
	if raw := res.Header.Get("Content-Length"); raw != "" {
		contentLength, err = strconv.ParseInt(raw, 10, 64)
		if err != nil {
			return nil, fmt.Errorf("invalid Content-Length %q: %v", raw, err)
		}
	}

	ret := &LinkInfo{
		ContentType:   contentType,
		ContentLength: contentLength,
	}
	if contentType == contentTypeHtml && contentLength >= 0 && contentLength < maxBodySizeBytes {
		err = api.retrieveHtmlLinkTitle(ret, link)
		return ret, err
	}
	ret.Title = fmt.Sprintf("%s (%s, %s)", path.Base(u.Path), contentType, ByteCountDecimal(contentLength))
	return ret, nil
}
// detectContentType fetches the first 512 bytes of link and sniffs their
// media type with http.DetectContentType. defaultType is returned when the
// request cannot be built or sent, when the server does not answer with
// 200 or 206, or when reading the body fails.
//
// The returned value may carry parameters (for example "; charset=utf-8").
func (api *LinkInfoApi) detectContentType(link, defaultType string) string {
	req, err := http.NewRequest(http.MethodGet, link, nil)
	if err != nil {
		return defaultType
	}
	req.Header.Set("Accept-Language", "en-US,en;q=0.9")
	if api.UserAgent != "" {
		req.Header.Set("User-Agent", api.UserAgent)
	}
	// Byte ranges are inclusive: 0-511 is exactly the 512 bytes that
	// http.DetectContentType inspects.
	req.Header.Set("Range", "bytes=0-511")

	res, err := api.Client.Do(req)
	if err != nil {
		return defaultType
	}
	defer res.Body.Close()

	// Do not sniff the body of an error response; it describes the error,
	// not the resource.
	if res.StatusCode != http.StatusOK && res.StatusCode != http.StatusPartialContent {
		return defaultType
	}

	// The server may ignore the Range header, so cap the read locally too.
	b, err := ioutil.ReadAll(io.LimitReader(res.Body, 512))
	if err != nil {
		return defaultType
	}
	// DetectContentType always returns a valid MIME type, so no empty-result
	// fallback is needed.
	return http.DetectContentType(b)
}
// attrKeys lists, in priority order, the <meta> attributes whose value is
// used as the tag's key when collecting document metadata.
var attrKeys = []string{"property", "name", "itemprop"}
// retrieveHtmlLinkTitle downloads link with a GET request, parses at most
// maxBodySizeBytes of the response body as HTML and fills i from it:
//
//   - Title: the "og:title" meta tag, otherwise the text of <head><title>.
//   - Description: the "og:description" meta tag, otherwise "description".
//   - Duration: the "duration" meta tag, when present.
//
// Title and Description are always whitespace-trimmed, even when no source
// was found and the existing value is kept. An error is returned when the
// request cannot be built or sent, or when the document cannot be parsed.
func (api *LinkInfoApi) retrieveHtmlLinkTitle(i *LinkInfo, link string) error {
	req, err := http.NewRequest(http.MethodGet, link, nil)
	if err != nil {
		return err
	}
	if api.UserAgent != "" {
		req.Header.Set("User-Agent", api.UserAgent)
	}
	req.Header.Set("Accept-Language", "en-US,en;q=0.9")

	res, err := api.Client.Do(req)
	if err != nil {
		return err
	}
	defer res.Body.Close()

	doc, err := goquery.NewDocumentFromReader(io.LimitReader(res.Body, maxBodySizeBytes))
	if err != nil {
		return err
	}

	// Index every <meta> tag that has both a usable key attribute and a
	// content attribute; later tags with the same key overwrite earlier ones.
	tags := make(map[string]string)
	doc.Find("meta").Each(func(_ int, sel *goquery.Selection) {
		name := ""
		for _, attrName := range attrKeys {
			if v, ok := sel.Attr(attrName); ok {
				// The first attribute that is present decides the key, even
				// if its value is empty.
				name = v
				break
			}
		}
		if name == "" {
			return
		}
		if content, ok := sel.Attr("content"); ok {
			tags[name] = content
		}
	})

	if title, ok := tags["og:title"]; ok {
		i.Title = title
	} else if node := doc.Find("head > title"); node.Length() > 0 {
		i.Title = node.Text()
	}
	i.Title = strings.TrimSpace(i.Title)

	description, found := tags["og:description"]
	if !found {
		description, found = tags["description"]
	}
	if found {
		i.Description = description
	}
	i.Description = strings.TrimSpace(i.Description)

	if duration, ok := tags["duration"]; ok {
		i.Duration = duration
	}
	return nil
}