Skip to content

Commit

Permalink
Merge pull request #85 from YellowBloomKnapsack/feature/smartad
Browse files Browse the repository at this point in the history
Feature/smartad
  • Loading branch information
aparsak authored Jul 31, 2024
2 parents 954656b + 828b432 commit 6aeaf6c
Show file tree
Hide file tree
Showing 17 changed files with 530 additions and 134 deletions.
192 changes: 192 additions & 0 deletions adserver/crawler/crawler.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,192 @@
package crawler

import (
"YellowBloomKnapsack/mini-yektanet/adserver/kvstorage"
"bytes"
"fmt"
"io/ioutil"
"log"
"net/http"
"os"
"regexp"
"sort"
"strconv"
"strings"
"sync"
"time"

"golang.org/x/net/html"
)

type Crawler struct {
kvstorage kvstorage.KVStorageInterface
persianStopWords map[string]bool
numOftopwords int
urls map[string]int
}

// Publisher IDs.
var publisherIDs = map[string]int{
"varzesh3": 1,
"digikala": 2,
"zoomit": 3,
"sheypoor": 4,
"filimo": 5,
}

func NewCrawler(kvstorage kvstorage.KVStorageInterface) *Crawler {
baseUrl := "http://" + os.Getenv("PUBLISHER_WEBSITE_HOSTNAME") + ":" + os.Getenv("PUBLISHER_WEBSITE_PORT")
return &Crawler{
kvstorage: kvstorage,
persianStopWords: map[string]bool{
"و": true, "در": true, "به": true, "از": true, "که": true, "این": true, "را": true, "اینجا": true,
"با": true, "برای": true, "است": true, "آن": true, "یک": true, "تا": true, "هم": true, "کنیم": true,
"می": true, "بر": true, "بود": true, "شد": true, "یا": true, "وی": true, "اما": true, "داریم": true, "اولین": true,
"اگر": true, "هر": true, "من": true, "ما": true, "شما": true, "او": true, "آنها": true, "دهیم": true, "آخرین": true,
"ایشان": true, "بودن": true, "باشند": true, "نیز": true, "چون": true, "چه": true, "نیست": true, "های": true,
"هیچ": true, "همین": true, "چیزی": true, "دارند": true, "کنند": true, "خواهد": true, "آیا": true, "ها": true,
"کنید": true, "بدانید": true, "خوش": true, "آمدید": true, "خود": true, "زیاد": true, "کم": true, "زیادی": true,
},
numOftopwords: 5,
urls: map[string]int{
baseUrl + "/varzesh3": 1,
baseUrl + "/digikala": 2,
baseUrl + "/zoomit": 3,
baseUrl + "/sheypoor": 4,
baseUrl + "/filimo": 5,
},
}
}

func (c *Crawler) readHTML(url string) ([]byte, error) {
resp, err := http.Get(url)
if err != nil {
return nil, fmt.Errorf("error fetching URL: %v", err)
}
defer resp.Body.Close()

if resp.StatusCode != http.StatusOK {
return nil, fmt.Errorf("error fetching URL: received status code %d", resp.StatusCode)
}

content, err := ioutil.ReadAll(resp.Body)
if err != nil {
return nil, fmt.Errorf("error reading response body: %v", err)
}

return content, nil
}

// extractText extracts text content from the HTML node tree.
func (crawler *Crawler) extractText(n *html.Node) string {
if n.Type == html.TextNode {
return n.Data
}

var buf bytes.Buffer
for c := n.FirstChild; c != nil; c = c.NextSibling {
buf.WriteString(crawler.extractText(c))
}
return buf.String()
}

func (c *Crawler) normalizeText(text string) string {
text = strings.ReplaceAll(text, "ي", "ی") // Arabic Yeh to Persian Yeh
text = strings.ReplaceAll(text, "ك", "ک") // Arabic Kaf to Persian Kaf

// Remove punctuation using a regex.
reg, err := regexp.Compile("[^\\p{L}\\p{N}\\s]+")
if err != nil {
fmt.Println("Error compiling regex:", err)
return ""
}
normalizedText := reg.ReplaceAllString(text, " ")

return normalizedText
}

func (c *Crawler) findTopWords(text string) []string {
words := strings.Fields(text)
wordFreq := make(map[string]int)

for _, word := range words {
if !c.persianStopWords[word] {
wordFreq[word]++
}
}

type wordPair struct {
word string
count int
}
var pairs []wordPair
for word, count := range wordFreq {
pairs = append(pairs, wordPair{word, count})
}

sort.Slice(pairs, func(i, j int) bool {
return pairs[i].count > pairs[j].count
})

// Extract the top words.
topWords := []string{}
for i, pair := range pairs {
if i >= c.numOftopwords {
break
}
topWords = append(topWords, pair.word)
}

return topWords
}

func (c *Crawler) Crawl() {
c.crawlOnce()

ticker := time.NewTicker(1 * time.Minute)
defer ticker.Stop()

go func() {
for {
select {
case <-ticker.C:
c.crawlOnce()
}
}
}()
}

func (c *Crawler) crawlOnce() {
var wg sync.WaitGroup

for filePath, publisherID := range c.urls {
wg.Add(1)

go func(filePath string, publisherID int) {
defer wg.Done()

content, err := c.readHTML(filePath)
if err != nil {
log.Printf("%d+%s: Failed to read file: %v", publisherID, filePath, err)
}

node, err := html.Parse(bytes.NewReader(content))
if err != nil {
log.Printf("%d+%s: Failed to parse HTML: %v", publisherID, filePath, err)
}

rawText := c.extractText(node)
normalizedText := c.normalizeText(rawText)

topWords := c.findTopWords(normalizedText)
publisherIDStr := strconv.Itoa(int(publisherID))

c.kvstorage.Set(publisherIDStr, strings.Join(topWords, ","))
log.Printf("%d+%s: %s", publisherID, filePath, strings.Join(topWords, ", "))
}(filePath, publisherID)
}

go func() {
wg.Wait()
}()
}
6 changes: 3 additions & 3 deletions adserver/handlers/handlers.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,9 @@ func (h *AdServerHandler) GetAd(c *gin.Context) {
timer := prometheus.NewTimer(grafana.AdRequestDuration)
defer timer.ObserveDuration()

chosenAd, err := h.logicService.GetBestAd()
publisherId, _ := strconv.Atoi(c.Param("publisherId"))

chosenAd, err := h.logicService.GetBestAd(uint(publisherId))
if err != nil {
c.JSON(http.StatusNotFound, gin.H{})
return
Expand All @@ -47,8 +49,6 @@ func (h *AdServerHandler) GetAd(c *gin.Context) {
clickReqPath := os.Getenv("CLICK_REQ_PATH")
impressionReqPath := os.Getenv("IMPRESSION_REQ_PATH")

publisherId, _ := strconv.Atoi(c.Param("publisherId"))

privateKey := os.Getenv("PRIVATE_KEY")
key, _ := base64.StdEncoding.DecodeString(privateKey)

Expand Down
4 changes: 2 additions & 2 deletions adserver/handlers/handlers_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ type MockLogicService struct {
Err error
}

func (m *MockLogicService) GetBestAd() (*dto.AdDTO, error) {
func (m *MockLogicService) GetBestAd(publisherId uint) (*dto.AdDTO, error) {
return m.BestAd, m.Err
}

Expand Down Expand Up @@ -108,7 +108,7 @@ type MockLogicService_Brake struct {
BrakeAdDuration time.Duration
}

func (m *MockLogicService_Brake) GetBestAd() (*dto.AdDTO, error) {
func (m *MockLogicService_Brake) GetBestAd(publisherId uint) (*dto.AdDTO, error) {
return nil, nil
}

Expand Down
52 changes: 52 additions & 0 deletions adserver/kvstorage/kvstorage.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
package kvstorage

import (
"context"
"fmt"
"log"

"github.com/redis/go-redis/v9"
)

var ctx = context.Background()

type KVStorageInterface interface {
Get(key string) (string, error)
Set(key string, value string) error
}

type KVStorage struct {
client *redis.Client
}

// NewKVStorage creates a new KVStorage instance with a connected Redis client
func NewKVStorage(addr string) *KVStorage {
client := redis.NewClient(&redis.Options{
Addr: addr,
Password: "", // no password set
DB: 1, // use default DB
})

// Ping to check if connection is successful
_, err := client.Ping(ctx).Result()
if err != nil {
log.Printf("Failed to connect to Redis at %s for key-value storage: %v", addr, err)
}

return &KVStorage{client: client}
}

func (kvs *KVStorage) Get(key string) (string, error) {
val, err := kvs.client.Get(ctx, key).Result()
if err == redis.Nil {
return "", fmt.Errorf("could not find key %s", key) // Key does not exist
} else if err != nil {
return "", err
}
return val, nil
}

func (kvs *KVStorage) Set(key string, value string) error {
err := kvs.client.Set(ctx, key, value, 0).Err()
return err
}
56 changes: 46 additions & 10 deletions adserver/logic/logic.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,15 +9,17 @@ import (
"net/http"
"os"
"strconv"
"strings"
"time"

"YellowBloomKnapsack/mini-yektanet/adserver/grafana"
"YellowBloomKnapsack/mini-yektanet/adserver/kvstorage"
"YellowBloomKnapsack/mini-yektanet/common/cache"
"YellowBloomKnapsack/mini-yektanet/common/dto"
)

type LogicInterface interface {
GetBestAd() (*dto.AdDTO, error)
GetBestAd(publisherId uint) (*dto.AdDTO, error)
StartTicker()
BrakeAd(adId uint)
}
Expand All @@ -26,18 +28,20 @@ type LogicService struct {
visitedAds []*dto.AdDTO
unvisitedAds []*dto.AdDTO
brakedAdsCache cache.CacheInterface
kvStorage kvstorage.KVStorageInterface
getAdsAPIPath string
interval int
firstChanceMaxImpressions int
}

func NewLogicService(cache cache.CacheInterface) LogicInterface {
func NewLogicService(cache cache.CacheInterface, kvStorage kvstorage.KVStorageInterface) LogicInterface {
interval, _ := strconv.Atoi(os.Getenv("ADS_FETCH_INTERVAL_SECS"))
firstChanceMaxImpressions, _ := strconv.Atoi(os.Getenv("FIRST_CHANCE_MAX_IMPRESSIONS"))
return &LogicService{
visitedAds: make([]*dto.AdDTO, 0),
unvisitedAds: make([]*dto.AdDTO, 0),
brakedAdsCache: cache,
kvStorage: kvStorage,
getAdsAPIPath: "http://" + os.Getenv("PANEL_HOSTNAME") + ":" + os.Getenv("PANEL_PORT") + os.Getenv("GET_ADS_API"),
interval: interval,
firstChanceMaxImpressions: firstChanceMaxImpressions,
Expand Down Expand Up @@ -71,24 +75,56 @@ func (ls *LogicService) randomOn(ads []*dto.AdDTO) *dto.AdDTO {
return ads[rand.IntN(len(ads))]
}

func (ls *LogicService) isValid(ad *dto.AdDTO) bool {
return !ls.brakedAdsCache.IsPresent(strconv.FormatUint(uint64(ad.ID), 10))
func (ls *LogicService) hasIntersection(lhs, rhs []string) bool {
// Create a map to store elements of lhs
elements := make(map[string]struct{})
for _, item := range lhs {
elements[item] = struct{}{}
}

// Check if any element in rhs exists in the map
for _, item := range rhs {
if _, exists := elements[item]; exists {
return true
}
}

return false
}

func (ls *LogicService) isValid(ad *dto.AdDTO, publisherId uint) bool {
isBraked := ls.brakedAdsCache.IsPresent(strconv.FormatUint(uint64(ad.ID), 10))
if isBraked {
return false
}

if len(ad.Keywords) == 0 {
return true
}

publisherIdStr := strconv.FormatUint(uint64(publisherId), 10)
publisherKeywords, err := ls.kvStorage.Get(publisherIdStr)
if err != nil {
return true
}

return ls.hasIntersection(ad.Keywords, strings.Split(publisherKeywords, ","))
}

func (ls *LogicService) validsOn(ads []*dto.AdDTO) []*dto.AdDTO {
func (ls *LogicService) validsOn(ads []*dto.AdDTO, publisherId uint) []*dto.AdDTO {
result := make([]*dto.AdDTO, 0)
for _, ad := range ads {
if ls.isValid(ad) {
if ls.isValid(ad, publisherId) {
result = append(result, ad)
}
}

return result
}

func (ls *LogicService) GetBestAd() (*dto.AdDTO, error) {
validVisitedsAds := ls.validsOn(ls.visitedAds)
validUnvisitedsAds := ls.validsOn(ls.unvisitedAds)
func (ls *LogicService) GetBestAd(publisherId uint) (*dto.AdDTO, error) {
validVisitedsAds := ls.validsOn(ls.visitedAds, publisherId)
validUnvisitedsAds := ls.validsOn(ls.unvisitedAds, publisherId)

if len(validUnvisitedsAds) == 0 && len(validVisitedsAds) == 0 {
log.Println("No ad was found")
Expand Down Expand Up @@ -172,7 +208,7 @@ func (ls *LogicService) updateAdsList() error {
}

func (ls *LogicService) StartTicker() {
log.Println("Starting ticker...")
log.Println("Starting ads fetcher ticker...")
ls.updateAdsList()
go func() {
ticker := time.NewTicker(time.Duration(ls.interval) * time.Second)
Expand Down
Loading

0 comments on commit 6aeaf6c

Please sign in to comment.