简介
破解 Google 搜索接口,通过代码直接自动化接入 Google 搜索引擎,而不是通过 WebDriver 方式,因此不会占用服务器太多带宽。
利用 proto 文件作为配置结构,请自行利用 protoc 编译成对应的 pb.go 文件;配置文件采用 YAML,请自行实现配置的读取与关联。
目录结构
代码
- conf.proto
syntax = "proto3";
package kxconfig;
option go_package = "ggsearchengine";
import "google/protobuf/struct.proto";
import "google/protobuf/duration.proto";
// Bootstrap is the root configuration message; it currently carries only the
// outbound client settings.
message Bootstrap {
// Outbound client configuration (declared with field number 5).
Client client = 5;
}
// Client groups the settings of the outbound clients used by the search engine.
message Client {
// HTTP holds the settings of the outbound HTTP clients.
message HTTP {
// General request timeout.
// NOTE(review): not read by the Go client constructor shown in this document — confirm intended use.
google.protobuf.Duration timeout = 1;
// Address of the default ("free") proxy; no proxy is installed when unset.
optional string proxy = 2;
// Connection-pool limits.
// NOTE(review): the three limits below are not read by the Go client constructor
// shown in this document, which disables keep-alives — confirm intended use.
int32 max_idle_conns = 3;
int32 max_conns_per_host = 4;
int32 max_idle_conns_per_host = 5;
// Overall timeout of the HTTP client that uses the default proxy.
google.protobuf.Duration proxy_timeout = 6;
// Address of the premium proxy; no proxy is installed when unset.
optional string premium_proxy = 7;
// Overall timeout of the HTTP client that uses the premium proxy.
google.protobuf.Duration premium_proxy_timeout = 8;
}
// HTTP client settings.
HTTP http = 1;
}
- config.yaml
client:
http:
timeout: 10s
max_idle_conns: 10
max_conns_per_host: 100
max_idle_conns_per_host: 10
proxy: http://127.0.0.1:8118
premium_proxy: http://127.0.0.1:8228
proxy_timeout: 10s
premium_proxy_timeout: 10s
- google_search_engine.go
package ggsearchengine
import (
"bytes"
"context"
"fmt"
"io/ioutil"
"net/http"
"regexp"
"strings"
"time"
ggenerator "ggsearch_engine/ggsearchengine/useragent"
"github.com/PuerkitoBio/goquery"
"github.com/go-kratos/kratos/v2/errors"
"github.com/go-kratos/kratos/v2/log"
)
var (
// baseUrl is the Google search endpoint every query request is sent to.
baseUrl = "https://www.google.com/search"
// Err is the error returned when Google answers a search request with a
// status code other than 200.
Err = errors.New(400, "GOOGLE_HTTP_BAD_CODE", "google search http bad code")
)
// googleSearchEngine is the SearchEngine implementation backed by Google.
//
// Its fields must not hold anything that varies from one search to the next:
// a single engine instance is meant to be shared globally.
type googleSearchEngine struct {
// log is the helper used for context-aware logging.
log *log.Helper
// nProxyClient provides the proxied HTTP clients used to send requests.
nProxyClient *NoPoolProxyHttpClient
}
// NewGoogleSearchEngine builds the Google-backed SearchEngine.
//
// logger is wrapped in a log.Helper for the engine's own logging and
// nProxyClient supplies the HTTP clients that requests are sent through.
func NewGoogleSearchEngine(logger log.Logger, nProxyClient *NoPoolProxyHttpClient) SearchEngine {
	engine := new(googleSearchEngine)
	engine.log = log.NewHelper(logger)
	engine.nProxyClient = nProxyClient
	return engine
}
// WithMonitor returns an Option that stores metricKey on the search options,
// which switches on monitoring for the request.
func WithMonitor(metricKey string) Option {
	key := &metricKey
	return func(target *options) {
		target.metricKey = key
	}
}
// WithProxy returns an Option that selects which proxy client a search uses:
// the free proxy when isFreeProxy is true, the premium proxy otherwise.
func WithProxy(isFreeProxy bool) Option {
	apply := func(target *options) {
		target.isFreeProxy = isFreeProxy
	}
	return apply
}
// WithTextExtract returns an Option that enables regular-expression extraction
// on each result's content: textExtractionSyntax is the pattern and
// textExtractionIndex lists the submatch indexes to return, in order.
func WithTextExtract(textExtractionSyntax string, textExtractionIndex []int) Option {
	syntax := &textExtractionSyntax
	return func(target *options) {
		target.textExtractionIndex = textExtractionIndex
		target.textExtractionSyntax = syntax
	}
}
// textExtract applies the regular expression extractSyntax to text and returns
// the requested submatches.
//
// The result always has len(extractIndexes) elements, in the same order as
// extractIndexes; element i holds the submatch numbered extractIndexes[i]
// (0 is the whole match). An element stays "" when the pattern does not match,
// when the index is negative or beyond the available submatches, or when
// extractSyntax is not a valid pattern (which is logged instead of panicking).
func (g *googleSearchEngine) textExtract(ctx context.Context, text string, extractSyntax string, extractIndexes []int) []string {
	result := make([]string, len(extractIndexes))

	// The pattern comes from the caller, so compile it without panicking.
	regx, err := regexp.Compile(extractSyntax)
	if err != nil {
		g.log.WithContext(ctx).Errorf("invalid text extraction syntax %q: %v", extractSyntax, err)
		return result
	}

	extractContents := regx.FindStringSubmatch(text)
	for i, extractIndex := range extractIndexes {
		// Skip indexes that do not address an existing submatch.
		if extractIndex < 0 || extractIndex >= len(extractContents) {
			continue
		}
		result[i] = extractContents[extractIndex]
	}
	return result
}
// googleRedirectLinkRe captures the target of a Google redirect href of the
// form ".../url?q=<target>&sa=...". It is compiled once at package level
// instead of once per search result.
var googleRedirectLinkRe = regexp.MustCompile(`(?U)^.*/url\?q=.*(?P<link>.*)&sa=.*`)

// parseGoogleResponse parses the HTML body r of a Google result page into
// search results.
//
// Every ".xpd" element that contains an "a>h3" is treated as one result:
//   - Title is the text of the first "a>h3";
//   - Content is the text of all nested "div" elements joined with ",", or,
//     when o.textExtractionSyntax is set, the submatches textExtract pulls
//     out of that joined text;
//   - Url is the part of the first anchor's href between "/url?q=" and
//     "&sa=", left empty when the href does not have that shape.
//
// It returns an error only when the HTML cannot be parsed; a page without
// results yields an empty, non-nil slice.
func (g *googleSearchEngine) parseGoogleResponse(ctx context.Context, o *options, r []byte) (results []*SearchResult, err error) {
	doc, err := goquery.NewDocumentFromReader(bytes.NewReader(r))
	if err != nil {
		return nil, err
	}

	results = make([]*SearchResult, 0)
	doc.Find(".xpd").Has("a>h3").Each(func(_ int, s *goquery.Selection) {
		result := &SearchResult{
			Title: s.Find("a>h3").First().Text(),
		}

		// Collect the text of every nested div as the raw content.
		var contents []string
		s.Find("div").Each(func(_ int, div *goquery.Selection) {
			contents = append(contents, div.Text())
		})
		content := strings.Join(contents, ",")

		if o.textExtractionSyntax != nil {
			result.Content = g.textExtract(ctx, content, *o.textExtractionSyntax, o.textExtractionIndex)
		} else {
			result.Content = []string{content}
		}

		// Extract the real link from Google's redirect href.
		originLink := s.Find("a").First().AttrOr("href", "")
		if extractLink := googleRedirectLinkRe.FindStringSubmatch(originLink); len(extractLink) >= 2 {
			result.Url = extractLink[1]
		}

		results = append(results, result)
	})
	return results, nil
}
// sendGoogleRequest performs one Google search HTTP GET and returns the raw
// response body.
//
// pageNum is the 1-based page to fetch (0 is treated as the first page) and
// pageSize the number of results per page. query is the search text and uule
// the encoded location; an empty uule means a worldwide search. o selects the
// proxy client and, through o.metricKey, whether the call is monitored.
//
// The request is bound to ctx, so cancelling ctx aborts it. Err is returned
// when Google answers with a status other than 200. Once the request is sent,
// every outcome — including transport and body-read failures — is reported to
// setMonitor.
func (g *googleSearchEngine) sendGoogleRequest(ctx context.Context, o *options, pageNum, pageSize uint32, query string, uule string) (ret []byte, err error) {
	// Google's "start" parameter is the zero-based offset of the first result,
	// not a page index, so convert the page number into an offset. Computing in
	// uint64 avoids both the underflow for pageNum == 0 and multiplication overflow.
	var start uint64
	if pageNum > 1 {
		start = uint64(pageNum-1) * uint64(pageSize)
	}

	params := map[string]string{
		"newwindow": "1",
		"gbv":       "1",
		"num":       fmt.Sprintf("%d", pageSize),
		"start":     fmt.Sprintf("%d", start),
		"ie":        "UTF-8",
		"oe":        "UTF-8",
		"hl":        "en-US",
		"q":         query,
	}
	// An empty uule means a worldwide search, so the parameter is omitted.
	if uule != "" {
		params["uule"] = uule
	}

	req, err := newHttpGetRequest(baseUrl, params)
	if err != nil {
		return nil, err
	}
	// Bind the request to ctx so cancellation and deadlines are honoured.
	if ctx != nil {
		req = req.WithContext(ctx)
	}

	headers := []string{ggenerator.USER_AGENT}
	headerMap := ggenerator.HeaderGenerator(headers)
	for headerKey, headerValue := range headerMap {
		req.Header.Add(headerKey, headerValue)
	}
	g.log.WithContext(ctx).Infof("get google search:%s, header: %v, request parameter:%v",
		baseUrl, headerMap, params)

	begin := time.Now()
	// Monitoring: runs on every return path below and sees the final err.
	defer func() {
		setMonitor(o.metricKey, begin, err)
	}()

	resp, err := getHttpClient(g.nProxyClient, o.isFreeProxy).Do(req)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	respBody, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return nil, err
	}
	if resp.StatusCode != http.StatusOK {
		g.log.WithContext(ctx).Infof("get google search:%s, response:%s",
			resp.Request.URL, string(respBody))
		return nil, Err
	}
	return respBody, nil
}
// Search runs a Google query and returns the parsed results.
//
// pageNum and pageSize select the page, query is the search text and uule the
// encoded location. opts are applied in order on top of the default options,
// which use the free proxy. Any error from sending the request or parsing the
// response is returned with nil results.
func (g *googleSearchEngine) Search(ctx context.Context, pageNum, pageSize uint32, query, uule string, opts ...Option) (results []*SearchResult, err error) {
	cfg := &options{isFreeProxy: true}
	// Apply the caller's options over the defaults.
	for i := range opts {
		opts[i](cfg)
	}

	raw, err := g.sendGoogleRequest(ctx, cfg, pageNum, pageSize, query, uule)
	if err != nil {
		return nil, err
	}

	parsed, err := g.parseGoogleResponse(ctx, cfg, raw)
	if err != nil {
		return nil, err
	}
	return parsed, nil
}
- iface.go
package ggsearchengine
import "context"
// SearchEngine is the abstraction callers use to run a web search.
type SearchEngine interface {
// Search runs query and returns the parsed results or an error.
// pageNum and pageSize select the result page, uule is the encoded search
// location and options customise the individual call.
Search(ctx context.Context, pageNum, pageSize uint32, query, uule string, options ...Option) (results []*SearchResult, err error)
}
- native_client.go
package ggsearchengine
import (
"net/http"
"net/url"
"github.com/go-kratos/kratos/v2/log"
)
// NoPoolProxyHttpClient bundles the two HTTP clients used for outbound
// requests: one for the default ("free") proxy and one for the premium proxy.
type NoPoolProxyHttpClient struct {
// Client sends requests through the default proxy.
Client *http.Client
// PremiumClient sends requests through the premium proxy.
PremiumClient *http.Client
}
// NewNoPoolProxyHttpClient builds the default and premium HTTP clients from
// the bc.Client.Http configuration.
//
// Each client gets its own clone of http.DefaultTransport with keep-alives
// disabled. A proxy is installed on a transport only when the corresponding
// address is configured, and each client's overall timeout comes from its
// proxy timeout setting. logger is accepted but not used here.
func NewNoPoolProxyHttpClient(bc *Bootstrap, logger log.Logger) *NoPoolProxyHttpClient {
	httpConf := bc.Client.Http
	base := http.DefaultTransport.(*http.Transport)

	// Transport for the default proxy.
	freeTransport := base.Clone()
	if httpConf.Proxy != nil {
		freeTransport.Proxy = setProxy(*httpConf.Proxy)
	}
	// HTTP keep-alive must be switched off.
	freeTransport.DisableKeepAlives = true

	// Transport for the premium proxy.
	premiumTransport := base.Clone()
	if httpConf.PremiumProxy != nil {
		premiumTransport.Proxy = setProxy(*httpConf.PremiumProxy)
	}
	premiumTransport.DisableKeepAlives = true

	return &NoPoolProxyHttpClient{
		Client: &http.Client{
			Transport: freeTransport,
			Timeout:   httpConf.ProxyTimeout.AsDuration(),
		},
		PremiumClient: &http.Client{
			Transport: premiumTransport,
			Timeout:   httpConf.PremiumProxyTimeout.AsDuration(),
		},
	}
}
// setProxy returns a function suitable for http.Transport.Proxy that always
// routes requests through the proxy at address p.
//
// p is parsed once, when setProxy is called, rather than on every request.
// A scheme-less address such as "127.0.0.1:8118" is not a valid URL on its
// own, so when p does not parse into a URL with both a scheme and a host it
// is retried with an "http://" prefix. If p still cannot be interpreted, the
// outcome of the original parse is what every call of the returned function
// reports.
func setProxy(p string) func(*http.Request) (*url.URL, error) {
	proxyURL, err := url.Parse(p)
	if err != nil || proxyURL.Scheme == "" || proxyURL.Host == "" {
		// Accept the prefixed form only when it is a bare authority
		// (host[:port], optionally with user info) and nothing else.
		fixed, fixErr := url.Parse("http://" + p)
		if fixErr == nil && fixed.Host != "" && fixed.Path == "" {
			proxyURL, err = fixed, nil
		}
	}
	return func(*http.Request) (*url.URL, error) {
		return proxyURL, err
	}
}
- types.go
package ggsearchengine
import (
"net/http"
"net/url"
"strings"
"time"
)
// SearchResult is one entry parsed from a search result page.
type SearchResult struct {
// Title is the headline text of the result.
Title string
// Content holds the result text: either the whole text as a single element
// or, when text extraction is enabled, one element per requested submatch.
Content []string
// Url is the link the result points to.
Url string
}
// Option customises a single search call by mutating its options.
type Option func(*options)
// options collects the per-call settings assembled from the Option values.
type options struct {
// metricKey enables monitoring when non-nil.
metricKey *string
// isFreeProxy selects the default proxy client when true and the premium
// proxy client when false.
isFreeProxy bool
// textExtractionSyntax, when non-nil, is the regular expression applied to
// each result's content.
textExtractionSyntax *string
// textExtractionIndex lists the submatch indexes to return, in order.
textExtractionIndex []int
}
// newHttpGetRequest builds a GET request for baseUrl with params appended as
// the query string.
//
// Keys and values are both percent-encoded and the parameters are emitted in
// sorted key order, so the same input always produces the same URL. baseUrl is
// expected not to carry a query string of its own. The error is whatever
// http.NewRequest reports for the resulting URL.
func newHttpGetRequest(baseUrl string, params map[string]string) (*http.Request, error) {
	query := make(url.Values, len(params))
	for k, v := range params {
		query.Set(k, v)
	}

	var urlBuilder strings.Builder
	urlBuilder.WriteString(baseUrl)
	urlBuilder.WriteString("?")
	// Encode escapes keys and values and sorts by key.
	urlBuilder.WriteString(query.Encode())
	return http.NewRequest(http.MethodGet, urlBuilder.String(), nil)
}
// setMonitor is the monitoring hook invoked after each search request.
//
// It is a placeholder: the original reporting code is company-sensitive and
// was removed. Implement it yourself with a Prometheus exporter and metrics.
// Nothing is reported when metricKey is nil.
func setMonitor(metricKey *string, begin time.Time, err error) {
	if metricKey == nil {
		return
	}
	// Report the elapsed time here.
	switch {
	case err != nil:
		// Report the error count here.
	default:
		// Report the success count here.
	}
}
// getHttpClient picks the HTTP client for a request: the default-proxy client
// when isFreeProxy is true, the premium-proxy client otherwise.
func getHttpClient(nProxyClient *NoPoolProxyHttpClient, isFreeProxy bool) *http.Client {
	if !isFreeProxy {
		return nProxyClient.PremiumClient
	}
	return nProxyClient.Client
}
- useragent/data.go
package ggenerator
const (
// USER_AGENT is the name of the HTTP User-Agent header; it is also the key
// HeaderGenerator recognises when asked to generate that header.
USER_AGENT = "User-Agent"
)
// userAgentList is the pool of browser User-Agent strings from which one is
// picked at random for each generated header set.
var userAgentList = []string{
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36 Edg/100.0.1185.29",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:81.0) Gecko/20100101 Firefox/81.0",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.109 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.82 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36 OPR/85.0.4341.39",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:98.0) Gecko/20100101 Firefox/98.0",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36",
"Mozilla/5.0 (Linux; U; Android 9; itel W6004 Build/PPR1.180610.011; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/79.0.3945.116 Mobile Safari/537.36 OPR/55.0.2254.56695",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.74 Safari/537.36 Edg/99.0.1150.55",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 11_0_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36",
"Mozilla/5.0 (X11; CrOS x86_64 14268.67.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.111 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.74 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.80 Safari/537.36 Edg/98.0.1108.43",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.80 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36",
"Mozilla/5.0 (Linux; Android 5.0; SM-G900P Build/LRX21T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Mobile Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36 Edg/99.0.1150.39",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.0 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36",
"Mozilla/5.0 (Linux; U; Android 6.0.1; SM-N910C Build/MMB29K; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/96.0.4664.45 Mobile Safari/537.36 OPR/62.1.2254.60552",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36 Edg/98.0.1108.62",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36",
"Mozilla/5.0 (iPhone; CPU iPhone OS 15_2_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.2 Mobile/15E148 Safari/604.1",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36 Edg/99.0.1150.30",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.82 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36",
"Mozilla/5.0 (Linux; U; Android 10; TECNO KD6a Build/QP1A.190711.020; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/80.0.3987.99 Mobile Safari/537.36 OPR/62.3.2254.60988",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.109 Safari/537.36",
}
- useragent/useragent_generator.go
package ggenerator
import (
"math/rand"
"time"
)
// init seeds the global math/rand source once at start-up. Go releases before
// 1.20 start that source from a fixed seed; newer releases seed it themselves
// and this call is then merely redundant.
func init() {
	rand.Seed(time.Now().UnixNano())
}

// RandomRange returns a pseudo-random integer in the closed interval
// [min, max].
//
// The bounds may be given in either order; they are swapped when max < min
// instead of panicking. If the width of the interval does not fit in an int,
// min is returned. The generator is no longer reseeded on every call, which
// was costly and could repeat values for calls made within the same clock tick.
func RandomRange(min, max int) int {
	if max < min {
		min, max = max, min
	}
	span := max - min + 1
	if span <= 0 {
		// The interval width overflowed int; rand.Intn would panic.
		return min
	}
	return rand.Intn(span) + min
}
// randomSelectElements returns one element of selectableSet chosen at random
// with RandomRange. It returns "" when selectableSet is empty instead of
// panicking on an out-of-range index.
func randomSelectElements(selectableSet []string) string {
	if len(selectableSet) == 0 {
		return ""
	}
	return selectableSet[RandomRange(0, len(selectableSet)-1)]
}
// userAgentHandler returns a User-Agent value picked at random from
// userAgentList.
func userAgentHandler() string {
	return randomSelectElements(userAgentList)
}
// HeaderGenerator builds header values for the requested header names.
//
// Only USER_AGENT is recognised, for which a random User-Agent value is
// generated; any other name is ignored. The returned map is never nil.
func HeaderGenerator(headers []string) map[string]string {
	generated := map[string]string{}
	for i := range headers {
		if headers[i] != USER_AGENT {
			continue
		}
		generated[USER_AGENT] = userAgentHandler()
	}
	return generated
}
关于 uule
https://site-analyzer.pro/services-seo/uule/
- uule.proto
syntax = "proto3";
package kratos.api;
option go_package = "uule";
// Uule models the payload of a Google "uule" location parameter.
// NOTE(review): the field meanings are not defined anywhere in this document;
// presumably they follow the uule format described at the link above — confirm.
message Uule {
int32 role = 1;
int32 producer = 2;
// Field number 3 is not used by this message.
string canonical_name =4;
}