viv/examples/scrape.py

#!/usr/bin/env python3
"""A Simple Script for Extracting Data from a Webpage
This script allows the user to extract data from a webpage
and then display the data in a terminal table with column(s).
modified from:
https://medium.com/analytics-vidhya/a-super-easy-python-script-for-web-scraping-that-anybody-can-use-d3bd6ab86c89
"""

__import__("viv").use("requests", "bs4", "rich")  # noqa

import requests
from bs4 import BeautifulSoup
from rich import box
from rich.console import Console
from rich.table import Table

# Put your URL here
url = "https://www.nytimes.com/books/best-sellers/combined-print-and-e-book-nonfiction/"

# Fetch the html. The timeout keeps the script from hanging indefinitely on a
# stalled connection, and raise_for_status() reports HTTP errors (403, 404, ...)
# right here instead of letting an error page be parsed as if it were the list.
r = requests.get(url, timeout=30)
r.raise_for_status()

# Parse the html
parse = BeautifulSoup(r.content, "html.parser")

# Provide html elements' attributes to extract the data.
# NOTE(review): these CSS class names are tied to the page's current markup
# (presumably auto-generated); verify them against the live page if nothing
# is found.
text1 = [
    e.get_text().strip() for e in parse.find_all("h3", attrs={"class": "css-5pe77f"})
]

# Strip only a *leading* "by " from each byline. A blanket
# str.replace("by ", "") would also mangle names that happen to contain that
# sequence (e.g. "by Ruby Smith" -> "RuSmith").
prefix = "by "
bylines = (
    e.get_text().strip() for e in parse.find_all("p", attrs={"class": "css-hjukut"})
)
text2 = [
    byline[len(prefix):] if byline.startswith(prefix) else byline
    for byline in bylines
]

# Bail out with a readable message when the selectors matched nothing, rather
# than printing an empty table.
if not text1:
    raise SystemExit(f"No titles found at {url}; the page markup may have changed.")

print()
table = Table(title="NY Times Best Sellers", box=box.ROUNDED, title_justify="left")
table.add_column(
    "Title",
    justify="right",
    style="cyan",
    no_wrap=True,
)
table.add_column("Author", style="magenta")

# zip() stops at the shorter list, so a title without a matching author
# (or vice versa) is silently dropped from the table.
for col1, col2 in zip(text1, text2):
    table.add_row(col1, col2)

console = Console()
console.print(table)
add some usage examples for viv 2022-12-19 23:22:23 -06:00			`#!/usr/bin/env python3`
			`"""A Simple Script for Extracting Data from a Webpage`
			`This script allows the user to extract data from a webapge`
			`and then export the data to a csv file with column(s).`
			`modified from:`
			`https://medium.com/analytics-vidhya/a-super-easy-python-script-for-web-scraping-that-anybody-can-use-d3bd6ab86c89`
			`"""`

refactor!: activate -> use 2023-03-15 13:16:09 -05:00			`__import__("viv").use("requests", "bs4", "rich") # noqa`
add some usage examples for viv 2022-12-19 23:22:23 -06:00
			`import requests`
			`from bs4 import BeautifulSoup`
			`from rich import box`
			`from rich.console import Console`
			`from rich.table import Table`

			`# Put your URL here`
			`url = "https://www.nytimes.com/books/best-sellers/combined-print-and-e-book-nonfiction/"`

			`# Fetching the html`
			`r = requests.get(url)`

			`# Parsing the html`
			`parse = BeautifulSoup(r.content, "html.parser")`

			`# Provide html elements' attributes to extract the data`
			`text1 = list(`
			`e.get_text().strip() for e in parse.find_all("h3", attrs={"class": "css-5pe77f"})`
			`)`
			`text2 = list(`
			`e.get_text().strip().replace("by ", "")`
			`for e in parse.find_all("p", attrs={"class": "css-hjukut"})`
			`)`
			`max_len = max((len(txt) for txt in text1))`

			`print()`
			`table = Table(title="NY Times Best Sellers", box=box.ROUNDED, title_justify="left")`
			`table.add_column(`
			`"Title",`
			`justify="right",`
			`style="cyan",`
			`no_wrap=True,`
			`)`
			`table.add_column("Author", style="magenta")`

			`for col1, col2 in zip(text1, text2):`
			`table.add_row(col1, col2)`

			`console = Console()`
			`console.print(table)`