Automated Goodreads End of Year Summary

#python #scrapers

At the end of each calendar year, I create a summary of all the books I read. If you're interested, you can check them out here. Much as I like creating these summaries, they're a bit of a chore, as I used to compile them manually (it was only a once-a-year thing). However, having now done this for the past seven years, I figured I really should automate the process. 🙄

Goodreads used to have a publicly accessible API, but they're no longer accepting requests for new API keys, so this route was unfortunately a no-go. This leaves me with web scraping, which I've done before in Ruby, but not Python... so this was a nice exercise to learn something new too.

If you're in a hurry, here's the full script.

# goodreads_scraper.py

import time
from datetime import datetime

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

# Update these inputs
YEAR = '2022'
ROUGH_NO_PAGES_FOR_THIS_YEAR_TO_SCROLL = 3

# Constants
OUTPUT_MD_FILE_PATH = f'books-read-in-{YEAR}.md'
INTRO_PARA_OF_BLOG = f'Here\'s a quick rundown of all the books I read in {YEAR}. Books I particularly enjoyed and would recommend are in bold. Total books read: x. Non-fiction: x. Fiction: x.'
MAIN_URL = 'https://www.goodreads.com/review/list/1736497-jules?order=d&ref=nav_mybooks&shelf=read&sort=date_read&utf8=%E2%9C%93'


def get_html(url):
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
    driver.get(url)

    # handle infinite scroll: scroll to the bottom, wait for new books
    # to load, and repeat for the configured number of "pages"
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    page = 0
    while page < ROUGH_NO_PAGES_FOR_THIS_YEAR_TO_SCROLL:
        time.sleep(3)
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        page += 1

    # After getting all scrolled pages, extract the source code.
    html_string = driver.page_source
    driver.quit()

    return html_string


def get_rating_from_text(rating_text):
    if not rating_text:  # covers both '' and None
        return '0'

    rating_dict = {'did not like it': '1',
                   'it was ok': '2',
                   'liked it': '3',
                   'really liked it': '4',
                   'it was amazing': '5'}

    return rating_dict.get(rating_text, '0')  # default to '0' for unexpected text


def get_books_data(html_string):
    soup = BeautifulSoup(html_string, 'lxml')

    table = soup.find_all('table', {'id': 'books'})[0]
    table_rows = table.find_all('tr')
    book_list = []

    for tr in table_rows[1:]:
        book_dict = {}

        # parse cover_url
        td = tr.find_all('td', {'class': 'field cover'})[0]
        img = td.find_all('img')[0]
        book_dict['cover_url'] = img['src']

        # parse title and book's url
        td = tr.find_all('td', {'class': 'field title'})[0]
        a_link = td.find_all('a')[0]
        book_dict['title'] = a_link.get('title')
        book_dict['book_url'] = a_link.get('href')

        # parse author and author_url
        td = tr.find_all('td', {'class': 'field author'})[0]
        a_link = td.find_all('a')[0]
        author_name_reversed = a_link.text
        author_name = ' '.join(author_name_reversed.split(', ')[::-1])
        book_dict['author_name'] = author_name
        book_dict['author_url'] = a_link.get('href')

        # parse rating
        td = tr.find_all('td', {'class': 'field rating'})[0]
        span = td.find_all('span', {'class': 'staticStars notranslate'})[0]
        rating_text = span.get('title')
        rating = get_rating_from_text(rating_text)
        book_dict['rating'] = rating

        # parse review (not every row has one)
        review = ''
        td = tr.find_all('td', {'class': 'field review'})
        if td:
            spans = td[0].find_all('span')
            if spans:
                # the last span holds the full review text
                lines = [str(i) for i in spans[-1].contents]
                review = ' '.join(lines)
        book_dict['review'] = review

        # parse date_read
        td = tr.find_all('td', {'class': 'field date_read'})[0]
        span = td.find_all('span', {'class': 'date_read_value'})[0]
        date_read = span.text
        book_dict['date_read'] = date_read

        # parse isbn / asin (used later to build the Amazon links)
        td = tr.find_all('td', {'class': 'field isbn'})[0]
        div = td.find_all('div', {'class': 'value'})[0]
        isbn = div.text
        book_dict['isbn'] = isbn.strip()

        td = tr.find_all('td', {'class': 'field asin'})[0]
        div = td.find_all('div', {'class': 'value'})[0]
        asin = div.text
        book_dict['asin'] = asin.strip()

        book_list.append(book_dict)

    return book_list


def validate_date_format(date_text):
    try:
        return datetime.strptime(date_text, '%b %d, %Y').date()
    except ValueError:
        split_date = date_text.split(' ')
        corrected_date = f'{split_date[0]} 01, {split_date[1]}'
        return datetime.strptime(corrected_date, '%b %d, %Y').date()


def filter_and_sort_books(book_list, year):
    filtered_list = [i for i in book_list if year in i['date_read']]
    sorted_list = sorted(
        filtered_list, key=lambda k: validate_date_format(k['date_read']), reverse=False)
    return sorted_list


def create_markdown(filtered_and_sorted_book_list, year, intro_para, md_file_path):

    with open(md_file_path, 'w') as f:
        f.write('---\n')
        f.write(f'title: Books Read in {year}\n')
        f.write(f'description: Quick reviews of the books I read in {year}\n')
        f.write(f'publishedDate: {year}/12/31\n')
        f.write('category: books\n')
        f.write('---\n\n')
        f.write(f'{intro_para}\n\n')

        # loop over book list
        for curr_book in filtered_and_sorted_book_list:
            # use the ISBN if available, fall back to the ASIN, else flag for manual fill-in
            if curr_book['isbn'] != '':
                amazon_book_id = curr_book['isbn']
            elif curr_book['asin'] != '':
                amazon_book_id = curr_book['asin']
            else:
                amazon_book_id = 'to_fill_in'

            amazon_link = f"https://www.amazon.co.uk/dp/{amazon_book_id}/ref=nosim?tag=bionicjulia-21"
            if curr_book['rating'] == '5':
                f.write(
                    f"- **[{curr_book['title']}]({amazon_link}) by {curr_book['author_name']}**  \n")
            else:
                f.write(
                    f"- [{curr_book['title']}]({amazon_link}) by {curr_book['author_name']}  \n")

            f.write(f"  {curr_book['review']}\n\n")
    print('Markdown file created')


if __name__ == '__main__':
    html_string = get_html(MAIN_URL)
    book_list = get_books_data(html_string)
    filtered_and_sorted_book_list = filter_and_sort_books(book_list, YEAR)
    create_markdown(filtered_and_sorted_book_list, YEAR, INTRO_PARA_OF_BLOG,
                    OUTPUT_MD_FILE_PATH)

Let's break it down. The main steps are to:

  1. Use a scraping package (Selenium) to get the HTML string for my personal Goodreads "read" page.
  2. Use the Beautiful Soup package to pull all of the individual books data out of the HTML string.
  3. Sort the books by date read.
  4. Create a markdown file.

You can see all of these steps in the main block that runs when the script is invoked with python goodreads_scraper.py. Be sure to substitute MAIN_URL with the URL of your own Goodreads "read" shelf; you'll also need to make your profile public in your privacy settings for this to work.

MAIN_URL = 'https://www.goodreads.com/review/list/1736497-jules?order=d&ref=nav_mybooks&shelf=read&sort=date_read&utf8=%E2%9C%93'

if __name__ == '__main__':
    html_string = get_html(MAIN_URL)
    book_list = get_books_data(html_string)
    filtered_and_sorted_book_list = filter_and_sort_books(book_list, YEAR)
    create_markdown(filtered_and_sorted_book_list, YEAR, INTRO_PARA_OF_BLOG,
                    OUTPUT_MD_FILE_PATH)

Using Selenium to scrape Goodreads

Install the Selenium package for Python. I'm using the Chrome browser (managed by webdriver-manager) to access my Goodreads page. Goodreads uses infinite scroll by default to gradually load in the books, so I had to add a JavaScript snippet that scrolls to the bottom of the page three times (set by the constant ROUGH_NO_PAGES_FOR_THIS_YEAR_TO_SCROLL = 3). Amend this to however many scrolls you need to cover all the books you read in the year.

I added a time.sleep(3) to give Goodreads time to load in the new books.

import time

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

ROUGH_NO_PAGES_FOR_THIS_YEAR_TO_SCROLL = 3

def get_html(url):
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
    driver.get(url)

    # handle infinite scroll: scroll to the bottom, wait for new books
    # to load, and repeat for the configured number of "pages"
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

    page = 0
    while page < ROUGH_NO_PAGES_FOR_THIS_YEAR_TO_SCROLL:
        time.sleep(3)
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        page += 1

    # After getting all scrolled pages, extract the source code.
    html_string = driver.page_source
    driver.quit()

    return html_string
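
As an aside, instead of hard-coding the number of scrolls, you could keep scrolling until the page height stops changing. Here's a minimal sketch of that idea (scroll_to_end, pause and max_scrolls are my own hypothetical names, not part of the script above):

def scroll_to_end(driver, pause=3, max_scrolls=20):
    # Scroll until document.body.scrollHeight stops growing, i.e. the
    # infinite scroll has run out of new books to load.
    last_height = driver.execute_script("return document.body.scrollHeight;")
    for _ in range(max_scrolls):  # max_scrolls is a safety cap
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause)
        new_height = driver.execute_script("return document.body.scrollHeight;")
        if new_height == last_height:
            break  # nothing new loaded; we've reached the end
        last_height = new_height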

Get each book's data

To do this, I used the Beautiful Soup package. The code below should be fairly self-explanatory (I had never used Beautiful Soup before this, and it was easy to get my head around). Essentially, we find the HTML table containing all the books, loop through its rows, scrape the data we need for each book (storing it in a dictionary called book_dict), and append each book to an overall list called book_list. I scrape a few more fields than I actually use, just for reference.

Note that Goodreads doesn't return the star rating as a number, but as text. I therefore added a get_rating_from_text function to convert it to a number.

from bs4 import BeautifulSoup

def get_rating_from_text(rating_text):
    if not rating_text:  # covers both '' and None
        return '0'

    rating_dict = {'did not like it': '1',
                   'it was ok': '2',
                   'liked it': '3',
                   'really liked it': '4',
                   'it was amazing': '5'}

    return rating_dict.get(rating_text, '0')  # default to '0' for unexpected text


def get_books_data(html_string):
    soup = BeautifulSoup(html_string, 'lxml')

    table = soup.find_all('table', {'id': 'books'})[0]
    table_rows = table.find_all('tr')
    book_list = []

    for tr in table_rows[1:]:
        book_dict = {}

        # parse cover_url
        td = tr.find_all('td', {'class': 'field cover'})[0]
        img = td.find_all('img')[0]
        book_dict['cover_url'] = img['src']

        # parse title and book's url
        td = tr.find_all('td', {'class': 'field title'})[0]
        a_link = td.find_all('a')[0]
        book_dict['title'] = a_link.get('title')
        book_dict['book_url'] = a_link.get('href')

        # parse author and author_url
        td = tr.find_all('td', {'class': 'field author'})[0]
        a_link = td.find_all('a')[0]
        author_name_reversed = a_link.text
        author_name = ' '.join(author_name_reversed.split(', ')[::-1])
        book_dict['author_name'] = author_name
        book_dict['author_url'] = a_link.get('href')

        # parse rating
        td = tr.find_all('td', {'class': 'field rating'})[0]
        span = td.find_all('span', {'class': 'staticStars notranslate'})[0]
        rating_text = span.get('title')
        rating = get_rating_from_text(rating_text)
        book_dict['rating'] = rating

        # parse review (not every row has one)
        review = ''
        td = tr.find_all('td', {'class': 'field review'})
        if td:
            spans = td[0].find_all('span')
            if spans:
                # the last span holds the full review text
                lines = [str(i) for i in spans[-1].contents]
                review = ' '.join(lines)
        book_dict['review'] = review

        # parse date_read
        td = tr.find_all('td', {'class': 'field date_read'})[0]
        span = td.find_all('span', {'class': 'date_read_value'})[0]
        date_read = span.text
        book_dict['date_read'] = date_read

        # parse isbn / asin (used later to build the Amazon links)
        td = tr.find_all('td', {'class': 'field isbn'})[0]
        div = td.find_all('div', {'class': 'value'})[0]
        isbn = div.text
        book_dict['isbn'] = isbn.strip()

        td = tr.find_all('td', {'class': 'field asin'})[0]
        div = td.find_all('div', {'class': 'value'})[0]
        asin = div.text
        book_dict['asin'] = asin.strip()

        book_list.append(book_dict)

    return book_list
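
One tip while iterating on the parsing: the Selenium step is slow, so it can help to cache the HTML to disk once and re-parse offline. A quick sketch, assuming the same functions as above (the file name is just an example):

# scrape once and cache the HTML locally
html_string = get_html(MAIN_URL)
with open('goodreads_read_shelf.html', 'w') as f:
    f.write(html_string)

# re-parse as many times as needed without hitting Goodreads again
with open('goodreads_read_shelf.html') as f:
    book_list = get_books_data(f.read())
print(book_list[0]['title'], book_list[0]['date_read'])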

Sort books

At this point, we've got a list of dictionaries of book data. There's a bug in Goodreads whereby, if a book was read on the 1st of the month, only the month and year are returned. I therefore added a validate_date_format function to check and correct for this. I wanted my books in chronological order of date read, which is what the filter_and_sort_books function handles.

from datetime import datetime

def validate_date_format(date_text):
    try:
        return datetime.strptime(date_text, '%b %d, %Y').date()
    except ValueError:
        split_date = date_text.split(' ')
        corrected_date = f'{split_date[0]} 01, {split_date[1]}'
        return datetime.strptime(corrected_date, '%b %d, %Y').date()


def filter_and_sort_books(book_list, year):
    filtered_list = [i for i in book_list if year in i['date_read']]
    sorted_list = sorted(
        filtered_list, key=lambda k: validate_date_format(k['date_read']), reverse=False)

    return sorted_list
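
To illustrate, both the full and the truncated Goodreads date formats now parse to proper dates:

print(validate_date_format('Mar 15, 2022'))  # 2022-03-15
print(validate_date_format('Mar 2022'))      # 2022-03-01 (day defaults to the 1st)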

Create markdown document

The final step is creating the markdown document. My summaries take the same format each year, with a bunch of static text, followed by the list of books.

There's no easy way to count the number of fiction vs. non-fiction books, so this still needs to be done manually. I apply bold formatting to any book I've rated 5 stars, so the create_markdown function takes this into account, in addition to applying my Amazon affiliate link (which is why I pull the ISBN / ASIN for each book).

YEAR = '2022'
OUTPUT_MD_FILE_PATH = f'books-read-in-{YEAR}.md'
INTRO_PARA_OF_BLOG = f'Here\'s a quick rundown of all the books I read in {YEAR}. Books I particularly enjoyed and would recommend are in bold. Total books read: x. Non-fiction: x. Fiction: x.'

def create_markdown(filtered_and_sorted_book_list, year, intro_para, md_file_path):

    with open(md_file_path, 'w') as f:
        f.write('---\n')
        f.write(f'title: Books Read in {year}\n')
        f.write(f'description: Quick reviews of the books I read in {year}\n')
        f.write(f'publishedDate: {year}/12/31\n')
        f.write('category: books\n')
        f.write('---\n\n')
        f.write(f'{intro_para}\n\n')

        # loop over book list
        for curr_book in filtered_and_sorted_book_list:
            # use the ISBN if available, fall back to the ASIN, else flag for manual fill-in
            if curr_book['isbn'] != '':
                amazon_book_id = curr_book['isbn']
            elif curr_book['asin'] != '':
                amazon_book_id = curr_book['asin']
            else:
                amazon_book_id = 'to_fill_in'

            amazon_link = f"https://www.amazon.co.uk/dp/{amazon_book_id}/ref=nosim?tag=bionicjulia-21"

            if curr_book['rating'] == '5':
                f.write(
                    f"- **[{curr_book['title']}]({amazon_link}) by {curr_book['author_name']}**  \n")
            else:
                f.write(
                    f"- [{curr_book['title']}]({amazon_link}) by {curr_book['author_name']}  \n")

            f.write(f"  {curr_book['review']}\n\n")

    print('Markdown file created')
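
As a possible refinement (not something the script above does), the total book count could be substituted into the intro paragraph automatically before writing, since the filtered list is already available:

# replace the 'x' placeholder for the total; the fiction/non-fiction
# split still needs a human eye
total = len(filtered_and_sorted_book_list)
intro_para = intro_para.replace('Total books read: x.',
                                f'Total books read: {total}.')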

And that's it. From here, it's just a final check for any missing ISBN / ASIN numbers and filling in the fiction vs. non-fiction counts. Happy reading!
