Skip to content

Commit

Permalink
Improve jamieoliver.com parser
Browse files Browse the repository at this point in the history
  • Loading branch information
reaper47 committed Jul 11, 2024
1 parent ecf26cc commit ae309ad
Show file tree
Hide file tree
Showing 5 changed files with 1,399 additions and 0 deletions.
1 change: 1 addition & 0 deletions docs/website/content/about/changelog/v1.2.0.md
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ weight: 1
- 🗜️ **Image compression**: Fixed re-compressing images after every edit.
- 📏 **Ingredient conversion**: Fixed incorrect ingredient conversion.
- ⚖️ **Ingredient scaling**: Fixed some ingredient scaling issues.
- 🥳 **jamieoliver.com**: Improved the jamieoliver.com scraper.
- 👑 **kingarthurbaking**: Refined parsing of kingarthurbaking.com.
- 🔨 **Maangchi.com**: Fixed and improved parsing of maangchi.com.
- 🔩 **Missing parenthesis**: Fixed the missing closing parenthesis in the first column header of the nutrition table.
Expand Down
121 changes: 121 additions & 0 deletions internal/scraper/jamieoliver.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
package scraper

import (
"github.com/PuerkitoBio/goquery"
"github.com/reaper47/recipya/internal/models"
"github.com/reaper47/recipya/internal/utils/regex"
"slices"
"strings"
)

func scrapeJamieOliver(root *goquery.Document) (models.RecipeSchema, error) {
rs, err := parseWebsite(root)
if err == nil {
return rs, nil
}

rs = models.NewRecipeSchema()
rs.Description.Value = getNameContent(root, "description")
rs.Keywords.Values = getNameContent(root, "keywords")
rs.Image.Value = getPropertyContent(root, "og:image")
rs.Yield.Value = findYield(strings.TrimSpace(root.Find(".recipe-detail.serves").Text()))
rs.Name = root.Find("h1").First().Text()

getIngredients(&rs, root.Find("ul.ingred-list").Children(), []models.Replace{
{"useFields", ""},
}...)

root.Find("div.method-p > div").Each(func(i int, sel *goquery.Selection) {

Check warning on line 28 in internal/scraper/jamieoliver.go

View workflow job for this annotation

GitHub Actions / build

parameter 'i' seems to be unused, consider removing or renaming it as _
if sel.HasClass("promo-banner") || sel.HasClass("avocado-desktop-foot") ||
sel.HasClass("mobile-related-recipes") {
return
}

if sel.HasClass("tip") {
sel.Find("p").Each(func(_ int, p *goquery.Selection) {
h, _ := p.Html()
h = strings.ReplaceAll(h, "<br/>", "\n")
h = strings.ReplaceAll(h, " ", "")
rs.Instructions.Values = append(rs.Instructions.Values, models.NewHowToStep(h))
})

} else {
h, _ := sel.Html()
parts := strings.Split(h, "<br/>")
for _, part := range parts {
v := strings.TrimSpace(part)
if v != "" {
rs.Instructions.Values = append(rs.Instructions.Values, models.NewHowToStep(v))
}
}
}
})

_, after, ok := strings.Cut(root.Find("link[rel=canonical]").AttrOr("href", ""), "https://www.jamieoliver.com/recipes/")
if ok {
cat, _, _ := strings.Cut(after, "/")
rs.Category.Value = strings.TrimSpace(strings.ReplaceAll(strings.TrimSuffix(cat, "recipes"), "-", " "))
}

cooksIn := strings.TrimSpace(strings.ToLower(root.Find(".recipe-detail.time").Text()))
if cooksIn != "" {
var (
prep string
cook string
)

if strings.Contains(cooksIn, "prep") {
_, prep, _ = strings.Cut(cooksIn, "prep")
}

if strings.Contains(prep, "cook") {
_, cook, _ = strings.Cut(prep, "cook")
}

if prep != "" {
rs.PrepTime = "PT" + regex.Digit.FindString(prep)
if strings.Contains(prep, "min") {
rs.PrepTime += "M"
} else {
rs.PrepTime += "H"
}
} else {
matches := regex.Time.FindAllString(prep, 2)
if matches != nil {
matches = slices.DeleteFunc(matches, func(s string) bool { return s == "" })
}

if len(matches) == 2 {
rs.PrepTime += regex.Digit.FindString(matches[0]) + "H" + regex.Digit.FindString(matches[1]) + "M"
}
}

if cook != "" {
rs.CookTime = "PT" + regex.Digit.FindString(cook)
if strings.Contains(cook, "min") {
rs.CookTime += "M"
} else {
rs.CookTime += "H"
}
}
}

rs.NutritionSchema = &models.NutritionSchema{
Calories: regex.Digit.FindString(root.Find("li[title=Calories] span.top").Text()),
Carbohydrates: regex.Digit.FindString(root.Find("li[title=Carbs] span.top").Text()),
Fat: regex.Digit.FindString(root.Find("li[title=Fat] span.top").Text()),
Fiber: regex.Digit.FindString(root.Find("li[title=Fibre] span.top").Text()),
Protein: regex.Digit.FindString(root.Find("li[title=Protein] span.top").Text()),
SaturatedFat: regex.Digit.FindString(root.Find("li[title=Saturates] span.top").Text()),
Sodium: regex.Digit.FindString(root.Find("li[title=Salt] span.top").Text()),
}

nodes := root.Find(".tags-list a")
xk := make([]string, 0, nodes.Length())
nodes.Each(func(_ int, sel *goquery.Selection) {
xk = append(xk, sel.Text())
})
rs.Keywords.Values = strings.Join(xk, ",")

return rs, nil
}
58 changes: 58 additions & 0 deletions internal/scraper/scraper_J_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,64 @@ func TestScraper_J(t *testing.T) {
Yield: &models.Yield{Value: 6},
},
},
{
name: "jamieoliver2.com",
in: "https://www.jamieoliver.com/recipes/pork-recipes/outrageous-pulled-pork/",
want: models.RecipeSchema{
AtContext: "https://schema.org",
AtType: &models.SchemaType{Value: "Recipe"},
Category: &models.Category{Value: "pork"},
CookTime: "PT5H",
CookingMethod: &models.CookingMethod{},
Cuisine: &models.Cuisine{},
Description: &models.Description{Value: "A batch cook beauty, Jamie's outrageously good pulled pork is the gift that keeps on giving."},
Keywords: &models.Keywords{Values: "Pork,One-pan recipes"},
Image: &models.Image{Value: anUploadedImage.String()},
Ingredients: &models.Ingredients{
Values: []string{
"½ a pork shoulder, bone in (4.5kg)",
"1 whole nutmeg , for grating",
"4 red onions",
"4 large carrots",
"4 eating apples",
"1 bunch of sage (20g)",
"1 bulb of garlic",
},
},
Instructions: &models.Instructions{
Values: []models.HowToItem{
{
Text: "Preheat the oven to 200°C/400°F/gas 6. It’s important to use a snug-fitting roasting tray for this recipe. Sit the pork shoulder in the tray and drizzle with 2 tablespoons each of olive oil and red wine vinegar, finely grate over the whole nutmeg, season generously with sea salt and black pepper, and rub well. Peel and halve the onions. Wash, trim and halve the carrots lengthways. Quarter and core the apples. Pick the sage leaves. Lift up the pork and sit the onions, carrots, apples, sage leaves and whole unpeeled garlic bulb underneath the meat. Roast for 2 hours, then reduce the heat to 150°C/300°F/gas 2 and cook for another 3 hours, or until the meat effortlessly pulls apart, adding splashes of water occasionally to prevent it from drying out, if needed.",
Type: "HowToStep",
},
{
Text: "Lift off all the crackling and put aside. Remove the pork to a board to rest. Spoon the excess fat off the tray into a jam jar (save for tasty cooking another day), then squeeze the soft garlic cloves out of the skins, and mush up the carrots, onions and apples. Return the pork to the tray, then shred and pull apart with two forks, removing any gristly bits and bones. Mix well until beautifully dressed in all the outrageous tray juices, then season to perfection. Snap up the crackling and place back on top, then enjoy as is or hold in the oven until needed. Batch up the extra portions to stash in the fridge or freezer for future meals.",
Type: "HowToStep",
},
{
Text: "Here are my favourite ways to celebrate it:\n\n\n\nPULLED PORK BAP\n\nIn a pan, heat up some pork with a splash of water until piping hot and lightly golden, then pile into a bap or bun with a gesture of wholegrain mustard, some cheese and onion crisps, a wodge of rocket and a few cornichons or pickles.\n\n\n\nPORK NOODLE CUPS\n\nIn a frying pan, heat up some pork with a splash of water, sliced fresh chilli and peanuts till crisp and piping hot, then toss with red wine vinegar, runny honey, and cooked noodles. Serve in an iceberg lettuce cup with a wedge of lime.\n\n\n\nPORK FRIED RICE\n\nIn a frying pan, heat up some pork with a splash of water, hoisin sauce, frozen peas and cooked rice until piping hot, then crack and toss in an egg until cooked. Serve sprinkled with sliced spring onion, with a wedge of orange.\n\n\n\nPORK &amp; BEANS\n\nIn a frying pan, heat up some pork with a splash of water till crisp and piping hot, then remove. Next, simmer tinned cannellini beans, juice and all, until the liquid has reduced. Serve with fresh soft herbs and finely chopped quick pickled red onion.\n\n\n\nPORK, BENEDICT-STYLE\n\nIn a frying pan, heat up some pork with a splash of water until crisp and piping hot, then fry an egg and wilt some baby spinach alongside. Mix a little English mustard into crème fraîche, then layer it all into a toasted and buttered English muffin.\n\n\n\nPULLED PORK PIZZA\n\nEmbellish a margherita pizza by scattering over some pork, sliced fresh chilli and tenderstem broccoli (halved lengthways), before cooking according to the packet instructions, or make your own. Serve with a drizzle of extra virgin olive oil.",
Type: "HowToStep",
},
},
},
Name: "Outrageous pulled pork",
NutritionSchema: &models.NutritionSchema{
Calories: "376",
Carbohydrates: "7.2",
Fat: "26.2",
Fiber: "1.6",
Protein: "28.6",
SaturatedFat: "8.4",
Sodium: "0.4",
},
PrepTime: "PT15M",
ThumbnailURL: &models.ThumbnailURL{},
Tools: &models.Tools{Values: []models.HowToItem{}},
Yield: &models.Yield{Value: 14},
URL: "https://www.jamieoliver.com/recipes/pork-recipes/outrageous-pulled-pork/",
Video: &models.Videos{},
},
},
{
name: "jaimyskitchen.nl",
in: "https://jaimyskitchen.nl/recepten/sajoer-lodeh-indonesisch-groente-recept-met-tofu",
Expand Down
1,217 changes: 1,217 additions & 0 deletions internal/scraper/testdata/jamieoliver2.html

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions internal/scraper/websites.go
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,8 @@ func scrapeWebsite(doc *goquery.Document, host string) (models.RecipeSchema, err
}
case 'j':
switch host {
case "jamieoliver":
return scrapeJamieOliver(doc)
case "jaimyskitchen":
return scrapeJaimysKitchen(doc)
case "juliegoodwin":
Expand Down

0 comments on commit ae309ad

Please sign in to comment.