Skip to content

Commit

Permalink
Merge pull request #8 from Parker-Kasiewicz/hydrator_updates
Browse files Browse the repository at this point in the history
adding hashtag and url extraction for posts
  • Loading branch information
lxcode authored Nov 25, 2024
2 parents b358b11 + cae0f70 commit bfe9547
Show file tree
Hide file tree
Showing 3 changed files with 75 additions and 0 deletions.
31 changes: 31 additions & 0 deletions pkg/hydrator/hydrator.go
Original file line number Diff line number Diff line change
Expand Up @@ -255,6 +255,29 @@ func (h *Hydrator) flattenFullProfile(profile *bsky.ActorDefs_ProfileViewDetaile
return
}

// flattenFacets collects the hashtags and link URLs carried by a post's
// rich-text facets.
//
// Every feature of every facet is inspected in order: a feature with a
// non-nil RichtextFacet_Tag contributes its Tag to hashtags, and a feature
// with a non-nil RichtextFacet_Link contributes its Uri to urls. Nil facets
// and nil features are skipped rather than dereferenced.
//
// Both results are always non-nil; they are empty slices when facets is nil,
// empty, or contains no tag/link features.
func (h *Hydrator) flattenFacets(facets []*bsky.RichtextFacet) (hashtags []string, urls []string) {
	// Start from empty (not nil) slices so callers always receive a concrete
	// list. NOTE(review): presumably this keeps serialized output as [] rather
	// than null for the repeated output fields; confirm with the writers.
	hashtags = []string{}
	urls = []string{}

	// Ranging over a nil slice is a no-op, so no explicit nil check is needed.
	for _, facet := range facets {
		if facet == nil {
			continue
		}
		for _, feature := range facet.Features {
			// Guard against nil entries before touching the feature's fields.
			if feature == nil {
				continue
			}
			if tag := feature.RichtextFacet_Tag; tag != nil {
				hashtags = append(hashtags, tag.Tag)
			}
			if link := feature.RichtextFacet_Link; link != nil {
				urls = append(urls, link.Uri)
			}
		}
	}
	return hashtags, urls
}

func (h *Hydrator) flattenPostView(post *bsky.FeedDefs_PostView) (result map[string]interface{}) {
if post == nil {
return nil
Expand Down Expand Up @@ -287,6 +310,10 @@ func (h *Hydrator) flattenPostView(post *bsky.FeedDefs_PostView) (result map[str
result["Embed"] = h.flattenEmbed(rec.Embed)
}

hashtags, urls := h.flattenFacets(rec.Facets)
result["Hashtags"] = hashtags
result["URLs"] = urls

return
}

Expand Down Expand Up @@ -316,6 +343,10 @@ func (h *Hydrator) flattenPost(post *bsky.FeedPost) (result map[string]interface
result["Embed"] = h.flattenEmbed(post.Embed)
}

hashtags, urls := h.flattenFacets(post.Facets)
result["Hashtags"] = hashtags
result["URLs"] = urls

return
}

Expand Down
14 changes: 14 additions & 0 deletions pkg/output/bq/bq.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import (
"encoding/json"
"errors"
"fmt"
"sort"
"strings"

"cloud.google.com/go/bigquery"
Expand Down Expand Up @@ -52,8 +53,21 @@ func New(ctx context.Context, tablePath string, outputChannel chan map[string]in
return &bq, nil
}

// sortSchema reorders the fields of schema by ascending Name, in place, and
// then does the same for the nested Schema of every field whose Type is
// bigquery.RecordFieldType.
func sortSchema(schema bigquery.Schema) {
	byName := func(a, b int) bool {
		return schema[a].Name < schema[b].Name
	}
	sort.Slice(schema, byName)

	for idx := range schema {
		entry := schema[idx]
		if entry.Type != bigquery.RecordFieldType {
			continue
		}
		// Record fields carry their own sub-schema; sort it recursively.
		sortSchema(entry.Schema)
	}
}

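// schemasAreEqual is a helper that compares two schemas. Note that both
// arguments are sorted in place (via sortSchema) before they are compared.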
func schemasAreEqual(schema1, schema2 bigquery.Schema) bool {
sortSchema(schema1)
sortSchema(schema2)
if len(schema1) != len(schema2) {
return false
}
Expand Down
30 changes: 30 additions & 0 deletions pkg/output/bq/schema/schema.go
Original file line number Diff line number Diff line change
Expand Up @@ -276,6 +276,16 @@ func GetSchema() bigquery.Schema {
"name": "RepostCount",
"type": "INTEGER"
},
{
"mode": "REPEATED",
"name": "Hashtags",
"type": "STRING"
},
{
"mode": "REPEATED",
"name": "URLs",
"type": "STRING"
},
{
"name": "Text",
"type": "STRING"
Expand Down Expand Up @@ -399,6 +409,16 @@ func GetSchema() bigquery.Schema {
"name": "ReplyParentCID",
"type": "STRING"
},
{
"mode": "REPEATED",
"name": "Hashtags",
"type": "STRING"
},
{
"mode": "REPEATED",
"name": "URLs",
"type": "STRING"
},
{
"name": "Text",
"type": "STRING"
Expand Down Expand Up @@ -566,6 +586,16 @@ func GetSchema() bigquery.Schema {
"name": "RepostCount",
"type": "INTEGER"
},
{
"mode": "REPEATED",
"name": "Hashtags",
"type": "STRING"
},
{
"mode": "REPEATED",
"name": "URLs",
"type": "STRING"
},
{
"name": "Text",
"type": "STRING"
Expand Down

0 comments on commit bfe9547

Please sign in to comment.