Skip to content

Commit

Permalink
feat: Add scrapping script to get icon list from font awesome website
Browse files Browse the repository at this point in the history
  • Loading branch information
jimbiscuit committed Mar 30, 2023
1 parent 9ebce68 commit 7c1bcfc
Show file tree
Hide file tree
Showing 4 changed files with 89 additions and 0 deletions.
20 changes: 20 additions & 0 deletions scrapping/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Scraping icon list

This script is used to scrape the icon list, with each icon's category, from the Font Awesome website and generate the iconList used for search queries.

## Install
1. Init a new virtual env
```bash
virtualenv-3.8 .
```
2. Install requirements
```bash
bin/pip install -r requirements.txt
```
3. Launch the script
```bash
bin/python main.py
```
A json file is created with the js object for iconList.js

TODO: Generate the JS file directly
1 change: 1 addition & 0 deletions scrapping/iconList.json

Large diffs are not rendered by default.

66 changes: 66 additions & 0 deletions scrapping/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup

import json

# Font Awesome search page filtered to free, solid-style icons ("m=free", "s=solid"),
# sorted alphabetically ("o=a").
FA_SEARCH_URL = "https://fontawesome.com/search?o=a&m=free&s=solid"
# Same search narrowed to one category; {id} is the tokenized category slug.
FA_CAT_URL = FA_SEARCH_URL+"&c={id}"

def tokenize_title(title):
    """Convert a Font Awesome category title into its URL slug.

    A few categories use an irregular slug on the website, so those are
    special-cased; every other title is lowercased with " + " and spaces
    collapsed to hyphens (e.g. "Alphabet + Numbers" -> "alphabet-numbers").
    """
    irregular_slugs = {
        "Disaster + Crisis": "disaster",
        "Genders": "gender",
    }
    if title in irregular_slugs:
        return irregular_slugs[title]

    return title.lower().replace(" + ", "-").replace(" ", "-")

def main():
    """Scrape the Font Awesome search pages and write iconList.json.

    Loads the main search page to discover every icon category, then
    fetches each category's result page and collects the icon names.
    The resulting mapping {slug: {"title": ..., "list": [{"name": ...}]}}
    is dumped to ``iconList.json`` in the current directory.
    """
    options = Options()
    options.add_argument('--headless')
    options.add_argument('--disable-gpu')
    driver = webdriver.Chrome(options=options)
    # try/finally guarantees the headless browser is quit even when a wait
    # times out or the page structure changes mid-scrape (the original code
    # leaked the Chrome process on any exception).
    try:
        driver.get(FA_SEARCH_URL)

        # The page is rendered client-side; wait until the category facet
        # list exists before reading page_source.
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, "wrap-icons-facet-input"))
        )

        soup = BeautifulSoup(driver.page_source, "html.parser")

        categories = (
            soup.find("div", {"class": "wrap-icons-facets-menu"})
            .find("ul", {"class": "icons-facets-group-categories"})
            .find_all("li", {"class": "wrap-icons-facet-input"})
        )

        cat_list = [cat.find("span", {"class": "text-capitalize"}).string for cat in categories]

        icon_list = {
            tokenize_title(cat): {
                "title": cat,
                "list": []
            }
            for cat in cat_list
        }

        for count, cat in enumerate(cat_list):
            icon_token = tokenize_title(cat)
            driver.get(FA_CAT_URL.format(id=icon_token))
            print(f"Accessing {icon_token} page ({count+1}/{len(cat_list)})")
            # Category pages render more slowly; allow a longer wait.
            WebDriverWait(driver, 20).until(
                EC.presence_of_element_located((By.CLASS_NAME, "wrap-icon"))
            )
            soup = BeautifulSoup(driver.page_source, "html.parser")
            icons = soup.find_all("article", {"class": "wrap-icon"})
            icon_list[icon_token]["list"] = [
                {"name": icon.find("span", {"class": "icon-name"}).string}
                for icon in icons
            ]
    finally:
        driver.quit()

    with open('iconList.json', 'w') as fp:
        json.dump(icon_list, fp)


if __name__ == "__main__":
    main()
2 changes: 2 additions & 0 deletions scrapping/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
selenium==4.8.3
beautifulsoup4==4.12.0

0 comments on commit 7c1bcfc

Please sign in to comment.