Skip to content

Commit

Permalink
Support 50 more websites (#389)
Browse files Browse the repository at this point in the history
  • Loading branch information
reaper47 authored Jul 9, 2024
1 parent 8d1cf5c commit 3366bf1
Show file tree
Hide file tree
Showing 100 changed files with 107,651 additions and 142 deletions.
4 changes: 2 additions & 2 deletions .idea/dataSources.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions internal/models/schema-recipe.go
Original file line number Diff line number Diff line change
Expand Up @@ -637,6 +637,7 @@ func (i *Instructions) UnmarshalJSON(data []byte) error {
case string:
parts := strings.Split(strings.TrimSpace(x), "\n")
for _, s := range parts {
s = strings.TrimSpace(s)
if s != "" {
i.Values = append(i.Values, NewHowToStep(strings.TrimSpace(s)))
}
Expand Down
37 changes: 37 additions & 0 deletions internal/scraper/aldi.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
package scraper

import (
"github.com/PuerkitoBio/goquery"
"github.com/reaper47/recipya/internal/models"
"github.com/reaper47/recipya/internal/utils/regex"
"strings"
)

// scrapeAldi builds a recipe schema from an Aldi recipe page.
//
// The category is read from the active navigation tab; the name, image and
// instructions come from #main-content; the yield, ingredients and times come
// from #sidebar. The returned error is always nil.
func scrapeAldi(root *goquery.Document) (models.RecipeSchema, error) {
	rs := models.NewRecipeSchema()

	// Drop the first "Recipes" and the first "selected" from the active tab's
	// text; what remains, trimmed, is used as the category.
	category := root.Find(".tab-nav--item.dropdown--list--item.m-active").Text()
	category = strings.Replace(category, "Recipes", "", 1)
	category = strings.Replace(category, "selected", "", 1)
	rs.Category.Value = strings.TrimSpace(category)

	body := root.Find("#main-content")
	aside := root.Find("#sidebar")

	rs.Description.Value = getNameContent(root, "description")
	rs.Image.Value = body.Find("img").First().AttrOr("src", "")
	rs.Name = strings.TrimSpace(body.Find("h1").Text())

	servings := aside.Find("p:contains('Serves')").Text()
	rs.Yield.Value = findYield(servings)

	getIngredients(&rs, aside.Find("ul li"))
	getInstructions(&rs, body.Find("ol li"))
	// NOTE(review): the trailing `true` presumably selects the prep time
	// field; confirm against getTime.
	getTime(&rs, aside.Find("p:contains('Prep Time')"), true)

	// The cook time is assembled by hand: first digit match of the paragraph,
	// suffixed with minutes when the text mentions "min", hours otherwise.
	if cookText := strings.TrimSpace(aside.Find("p:contains('Cook Time')").Text()); cookText != "" {
		unit := "H"
		if strings.Contains(cookText, "min") {
			unit = "M"
		}
		rs.CookTime = "PT" + regex.Digit.FindString(cookText) + unit
	}

	return rs, nil
}
34 changes: 34 additions & 0 deletions internal/scraper/alittlebityummy.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
package scraper

import (
"github.com/PuerkitoBio/goquery"
"github.com/reaper47/recipya/internal/models"
"strings"
)

// scrapeALittleBitYummy builds a recipe schema for a "A Little Bit Yummy"
// page. It starts from whatever parseWebsite extracts, then re-reads the
// instructions from the page markup, clears the total time, and appends the
// equipment and ingredient lines found in the HTML.
//
// An error from parseWebsite is returned together with an empty schema.
func scrapeALittleBitYummy(root *goquery.Document) (models.RecipeSchema, error) {
	rs, err := parseWebsite(root)
	if err != nil {
		return models.RecipeSchema{}, err
	}

	getInstructions(&rs, root.Find("#method ol li"))
	rs.TotalTime = ""

	// Tools may be absent from the parsed schema; make sure there is a
	// container to append to.
	if rs.Tools == nil {
		rs.Tools = &models.Tools{}
	}
	equipment := root.Find("ul.recipe-equipment-list").First().Find("ul li")
	equipment.Each(func(_ int, item *goquery.Selection) {
		tool := models.NewHowToTool(item.Text())
		rs.Tools.Values = append(rs.Tools.Values, tool)
	})

	// Only <h4> and <div> children of the ingredients tab are kept; their
	// text is collapsed to single-space-separated words.
	rows := root.Find("div.ingredients-tab-content").First().Children()
	rows.Each(func(_ int, row *goquery.Selection) {
		switch goquery.NodeName(row) {
		case "h4", "div":
			line := strings.Join(strings.Fields(row.Text()), " ")
			rs.Ingredients.Values = append(rs.Ingredients.Values, line)
		}
	})

	return rs, nil
}
64 changes: 64 additions & 0 deletions internal/scraper/aniagotuje.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
package scraper

import (
"github.com/PuerkitoBio/goquery"
"github.com/reaper47/recipya/internal/models"
"github.com/reaper47/recipya/internal/utils/regex"
"strings"
)

// scrapeAniagotuje builds a recipe schema from an aniagotuje.pl page.
//
// Description, keywords, name and image come from meta tags. Prep and cook
// times are read from the text node following the matching <strong> label in
// .recipe-info. Ingredients come from every .recipe-ing-list (preceded by its
// header when one exists) and instructions from the elements between the first
// <h3> and the next one. The returned error is always nil.
func scrapeAniagotuje(root *goquery.Document) (models.RecipeSchema, error) {
	rs := models.NewRecipeSchema()
	rs.Description.Value = getNameContent(root, "description")
	rs.Keywords.Values = getNameContent(root, "keywords")
	rs.Name = getPropertyContent(root, "og:title")
	rs.Image.Value = getPropertyContent(root, "og:image")

	// toDuration turns label text such as "30 min" into "PT30M". Text without
	// "min" is treated as hours. Blank input yields "".
	toDuration := func(raw string) string {
		raw = strings.TrimSpace(raw)
		if raw == "" {
			return ""
		}
		unit := "H"
		if strings.Contains(raw, "min") {
			unit = "M"
		}
		return "PT" + regex.Digit.FindString(raw) + unit
	}

	info := root.Find(".recipe-info")

	// Prep time: sibling of the first matching label.
	if labels := info.Find("strong:contains('przygo')").Nodes; len(labels) > 0 {
		if sib := labels[0].NextSibling; sib != nil {
			if d := toDuration(sib.Data); d != "" {
				rs.PrepTime = d
			}
		}
	}

	// Cook time: sibling of the last matching label.
	if labels := info.Find("strong:contains('gotow')").Nodes; len(labels) > 0 {
		if sib := labels[len(labels)-1].NextSibling; sib != nil {
			if d := toDuration(sib.Data); d != "" {
				rs.CookTime = d
			}
		}
	}

	root.Find(".recipe-ing-list").Each(func(_ int, list *goquery.Selection) {
		// The element just before the list, when it has text, is kept as a
		// section header line.
		if title := strings.TrimSpace(list.Prev().Text()); title != "" {
			rs.Ingredients.Values = append(rs.Ingredients.Values, title)
		}

		list.Children().Each(func(_ int, item *goquery.Selection) {
			entry := strings.TrimSpace(item.Text())
			rs.Ingredients.Values = append(rs.Ingredients.Values, entry)
		})
	})

	steps := root.Find("h3").First().NextUntil("h3")
	steps.Each(func(_ int, el *goquery.Selection) {
		text := strings.TrimSpace(el.Text())
		if text == "" {
			return
		}
		rs.Instructions.Values = append(rs.Instructions.Values, models.NewHowToStep(text))
	})

	return rs, nil
}
34 changes: 34 additions & 0 deletions internal/scraper/antilliaans-eten.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
package scraper

import (
"github.com/PuerkitoBio/goquery"
"github.com/reaper47/recipya/internal/models"
"strings"
)

// scrapeAntilliaansEten builds a recipe schema from an antilliaans-eten.nl
// page.
//
// Description, dates and keywords come from meta tags; the name and image come
// from the post header; prep and cook times come from the datetime attribute
// of the matching <time itemprop="..."> elements. Ingredients are the children
// of the element following the "Ingrediënten" heading, and instructions are
// the siblings after that element up to the next <hr>. The returned error is
// always nil.
func scrapeAntilliaansEten(root *goquery.Document) (models.RecipeSchema, error) {
	// dropZeroHours removes the first "0H" from a duration, but only when it
	// is a standalone zero-hour component. A blind replace would also hit
	// durations such as "PT10H30M" and turn them into "PT130M".
	dropZeroHours := func(d string) string {
		i := strings.Index(d, "0H")
		if i == -1 {
			return d
		}
		if i > 0 && d[i-1] >= '0' && d[i-1] <= '9' {
			// The zero is the last digit of a larger hour count.
			return d
		}
		return d[:i] + d[i+len("0H"):]
	}

	rs := models.NewRecipeSchema()
	rs.Description.Value = getPropertyContent(root, "og:description")
	rs.DateModified = getPropertyContent(root, "article:modified_time")
	rs.DatePublished = getPropertyContent(root, "article:published_time")
	rs.Image.Value = root.Find(".post-image img").AttrOr("src", "")
	rs.Name = strings.TrimSpace(root.Find("h1.post-title").Text())

	rs.PrepTime = dropZeroHours(root.Find(`time[itemprop="prepTime"]`).AttrOr("datetime", ""))
	rs.CookTime = dropZeroHours(root.Find(`time[itemprop="cookTime"]`).AttrOr("datetime", ""))

	ul := root.Find("h2:contains('Ingrediënten')").First().Next()
	getIngredients(&rs, ul.Children())
	getInstructions(&rs, ul.NextUntil("hr"))

	// One article:tag meta element per keyword; join them comma-separated.
	tags := root.Find(`meta[property="article:tag"]`).Map(func(_ int, sel *goquery.Selection) string {
		return sel.AttrOr("content", "")
	})
	rs.Keywords.Values = strings.Join(tags, ",")

	return rs, nil
}
23 changes: 22 additions & 1 deletion internal/scraper/brianlagerstrom.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@ import (

func scrapeBrianLagerstrom(root *goquery.Document) (models.RecipeSchema, error) {
rs := models.NewRecipeSchema()

rs.Description.Value = getItempropContent(root, "description")
rs.Image.Value = getItempropContent(root, "thumbnailUrl")

Expand Down Expand Up @@ -79,6 +78,28 @@ func scrapeBrianLagerstrom(root *goquery.Document) (models.RecipeSchema, error)
}...)
}

if len(rs.Ingredients.Values) == 0 {
root.Find("p:contains('▪')").Each(func(_ int, sel *goquery.Selection) {
s := strings.TrimPrefix(sel.Text(), "▪")
rs.Ingredients.Values = append(rs.Ingredients.Values, strings.TrimSpace(s))
})
}

if len(rs.Instructions.Values) == 0 {
root.Find("p").Each(func(_ int, sel *goquery.Selection) {
s := strings.TrimSpace(sel.Text())
if s == "" {
return
}

dotIndex := strings.Index(s, ".")
if dotIndex != -1 && dotIndex < 3 {
_, after, _ := strings.Cut(s, ".")
rs.Instructions.Values = append(rs.Instructions.Values, models.NewHowToStep(after))
}
})
}

node := root.Find("p:contains('portion')").First()
if node != nil {
rs.Yield.Value = findYield(node.Text())
Expand Down
95 changes: 95 additions & 0 deletions internal/scraper/britishbakels.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
package scraper

import (
"github.com/PuerkitoBio/goquery"
"github.com/reaper47/recipya/internal/models"
"strings"
)

func scrapeBritishBakels(root *goquery.Document) (models.RecipeSchema, error) {
rs := models.NewRecipeSchema()
rs.Description.Value = getNameContent(root, "description")
rs.DateModified = getPropertyContent(root, "article:modified_time")
rs.Image.Value = getPropertyContent(root, "og:image")
rs.Name = strings.TrimSpace(root.Find(".content-header ").Text())
rs.Category.Value = root.Find("h4:contains('Category')").Next().Text()

var (
isHeader bool
unit string
ingredient string
numRight int
)

root.Find("div.card:contains('Ingredients')").Find("[id^=tab-ingredients] div").Each(func(_ int, sel *goquery.Selection) {
class := sel.AttrOr("class", "")
if !strings.Contains(class, "text-xs-") {
return
}

s := strings.TrimSpace(sel.Text())
if strings.HasPrefix(s, "Group") || s == "" {
rs.Ingredients.Values = append(rs.Ingredients.Values, s)
return
}

if strings.HasPrefix(s, "Ingredient") {
isHeader = true
return
}

isLeft := strings.Contains(class, "text-xs-left")
if isLeft {
numRight = 0
} else {
numRight += 1

Check warning on line 45 in internal/scraper/britishbakels.go

View workflow job for this annotation

GitHub Actions / build

should replace numRight += 1 with numRight++
}

if numRight == 2 {
return
}

if isHeader {
if unit == "" && !isLeft {
unit = s
return
} else if isLeft {
isHeader = false
} else {
return
}
}

if isLeft {
ingredient = s
}

if numRight == 1 {
rs.Ingredients.Values = append(rs.Ingredients.Values, s+" "+unit+" "+ingredient)
}
})

content := root.Find("div.card:contains('Method')")
nodes := content.Find("ol")
if nodes.Length() == 0 {
text := strings.TrimSpace(content.Find(".card-body").Text())
for _, s := range strings.Split(text, "\n") {
rs.Instructions.Values = append(rs.Instructions.Values, models.NewHowToStep(s))
}
} else {
nodes.Each(func(_ int, sel *goquery.Selection) {
name := strings.TrimSpace(sel.Prev().Text())
var h models.HowToItem
if name != "" {
h.Name = name
}

sel.Children().Each(func(_ int, li *goquery.Selection) {
h.Text = strings.TrimSpace(li.Text())
rs.Instructions.Values = append(rs.Instructions.Values, h)
})
})
}

return rs, nil
}
38 changes: 38 additions & 0 deletions internal/scraper/chetnamakan.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
package scraper

import (
"github.com/PuerkitoBio/goquery"
"github.com/reaper47/recipya/internal/models"
"strings"
)

// scrapeChetnamakan builds a recipe schema from a chetnamakan.co.uk page.
//
// Description, image and dates come from meta tags and the name from the
// headline <h1>. Ingredients and instructions are the sibling nodes that follow
// the <strong> elements containing "Ingredients" and "Method" respectively;
// <br> elements between them are skipped. A page missing either label simply
// yields no entries for that section. The returned error is always nil.
func scrapeChetnamakan(root *goquery.Document) (models.RecipeSchema, error) {
	rs := models.NewRecipeSchema()
	rs.Description.Value = getNameContent(root, "description")
	rs.Image.Value = getPropertyContent(root, "og:image")
	rs.DatePublished = getPropertyContent(root, "article:published_time")
	rs.DateModified = getPropertyContent(root, "article:modified_time")
	rs.Name = root.Find(`h1[itemprop=headline]`).Text()

	// eachSibling calls visit with the trimmed Data of every node following
	// the first match of selector, skipping <br> elements. Find never returns
	// nil, so the match count must be checked before indexing Nodes; indexing
	// an empty selection would panic.
	eachSibling := func(selector string, visit func(text string)) {
		matches := root.Find(selector).Nodes
		if len(matches) == 0 {
			return
		}
		for c := matches[0].NextSibling; c != nil; c = c.NextSibling {
			if s := strings.TrimSpace(c.Data); s != "br" {
				visit(s)
			}
		}
	}

	eachSibling("strong:contains('Ingredients')", func(text string) {
		rs.Ingredients.Values = append(rs.Ingredients.Values, text)
	})

	eachSibling("strong:contains('Method')", func(text string) {
		// Steps are written with a leading en dash on the page.
		step := models.NewHowToStep(strings.TrimPrefix(text, "–"))
		rs.Instructions.Values = append(rs.Instructions.Values, step)
	})

	return rs, nil
}
17 changes: 17 additions & 0 deletions internal/scraper/chinesecookingdemystified.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package scraper

import (
"encoding/json"
"github.com/PuerkitoBio/goquery"
"github.com/reaper47/recipya/internal/models"
"github.com/reaper47/recipya/internal/utils/regex"
Expand Down Expand Up @@ -31,5 +32,21 @@ func scrapeChinesecookingdemystified(root *goquery.Document) (models.RecipeSchem

getInstructions(&rs, root.Find("h3:contains('Process')").NextUntil(":not(p)"))

video := root.Find(".youtube-wrap").AttrOr("data-attrs", "")
if video != "" {
var m map[string]string
err = json.Unmarshal([]byte(video), &m)
if err == nil {
v, ok := m["videoId"]
if ok {
rs.Video.Values = append(rs.Video.Values, models.VideoObject{
AtType: "VideoObject",
EmbedURL: "https://youtube.com/embed/" + v,
IsIFrame: true,
})
}
}
}

return rs, nil
}
Loading

0 comments on commit 3366bf1

Please sign in to comment.