Automated Goodreads End of Year Summary
At the end of each calendar year, I create a summary of all the books I read. If you're interested, you can check them out here. Much as I like creating these summaries, they're a bit of a chore, as I used to put them together manually (it was only a once-a-year thing, after all). Having now done this for the past 7 years, though, I figured I really should just automate the process. 🙄
Goodreads used to have a publicly accessible API, but they're no longer accepting requests for new API keys, so that route was unfortunately a no-go. This left me with web scraping, which I've done before in Ruby, but not in Python... so it was a nice exercise to learn something new too.
If you're in a hurry, here's the full script.
# goodreads_scraper.py
import time
from datetime import datetime

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

# Update these inputs
YEAR = '2022'
ROUGH_NO_PAGES_FOR_THIS_YEAR_TO_SCROLL = 3

# Constants
OUTPUT_MD_FILE_PATH = f'books-read-in-{YEAR}.md'
INTRO_PARA_OF_BLOG = f"Here's a quick rundown of all the books I read in {YEAR}. Books I particularly enjoyed and would recommend are in bold. Total books read: x. Non-fiction: x. Fiction: x."
MAIN_URL = 'https://www.goodreads.com/review/list/1736497-jules?order=d&ref=nav_mybooks&shelf=read&sort=date_read&utf8=%E2%9C%93'


def get_html(url):
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
    driver.get(url)
    # Handle infinite scroll: jump to the bottom of the page, wait for the
    # next batch of books to load, and repeat.
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    page = 0
    while page < ROUGH_NO_PAGES_FOR_THIS_YEAR_TO_SCROLL:
        time.sleep(3)  # give Goodreads time to load in the new books
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        page += 1
    # After getting all scrolled pages, extract the source code.
    html_string = driver.page_source
    driver.quit()
    return html_string


def get_rating_from_text(rating_text):
    # Goodreads reports star ratings as text, not numbers.
    if not rating_text:
        return '0'
    rating_dict = {'did not like it': '1',
                   'it was ok': '2',
                   'liked it': '3',
                   'really liked it': '4',
                   'it was amazing': '5'}
    return rating_dict.get(rating_text, '0')  # default to '0' for unexpected text


def get_books_data(html_string):
    soup = BeautifulSoup(html_string, 'lxml')
    table = soup.find_all('table', {'id': 'books'})[0]
    table_rows = table.find_all('tr')
    book_list = []
    for tr in table_rows[1:]:  # skip the header row
        book_dict = {}
        # parse cover_url
        td = tr.find_all('td', {'class': 'field cover'})[0]
        img = td.find_all('img')[0]
        book_dict['cover_url'] = img['src']
        # parse title and book's url
        td = tr.find_all('td', {'class': 'field title'})[0]
        a_link = td.find_all('a')[0]
        book_dict['title'] = a_link.get('title')
        book_dict['book_url'] = a_link.get('href')
        # parse author and author_url ('Surname, First Name' -> 'First Name Surname')
        td = tr.find_all('td', {'class': 'field author'})[0]
        a_link = td.find_all('a')[0]
        author_name_reversed = a_link.text
        author_name = ' '.join(author_name_reversed.split(', ')[::-1])
        book_dict['author_name'] = author_name
        book_dict['author_url'] = a_link.get('href')
        # parse rating
        td = tr.find_all('td', {'class': 'field rating'})[0]
        span = td.find_all('span', {'class': 'staticStars notranslate'})[0]
        rating_text = span.get('title')
        book_dict['rating'] = get_rating_from_text(rating_text)
        # parse review (the last span holds the full review text)
        review = ''
        td = tr.find_all('td', {'class': 'field review'})
        if td:
            spans = td[0].find_all('span')
            if spans:
                lines = [str(i) for i in spans[-1].contents]
                review = ' '.join(lines)
        book_dict['review'] = review
        # parse date_read
        td = tr.find_all('td', {'class': 'field date_read'})[0]
        span = td.find_all('span', {'class': 'date_read_value'})[0]
        book_dict['date_read'] = span.text
        # parse isbn / asin
        td = tr.find_all('td', {'class': 'field isbn'})[0]
        div = td.find_all('div', {'class': 'value'})[0]
        book_dict['isbn'] = div.text.strip()
        td = tr.find_all('td', {'class': 'field asin'})[0]
        div = td.find_all('div', {'class': 'value'})[0]
        book_dict['asin'] = div.text.strip()
        book_list.append(book_dict)
    return book_list


def validate_date_format(date_text):
    try:
        return datetime.strptime(date_text, '%b %d, %Y').date()
    except ValueError:
        # Goodreads drops the day when a book was read on the 1st of the
        # month (e.g. 'Dec 2022'), so fill the day back in.
        split_date = date_text.split(' ')
        corrected_date = f'{split_date[0]} 01, {split_date[1]}'
        return datetime.strptime(corrected_date, '%b %d, %Y').date()


def filter_and_sort_books(book_list, year):
    filtered_list = [i for i in book_list if year in i['date_read']]
    sorted_list = sorted(
        filtered_list, key=lambda k: validate_date_format(k['date_read']), reverse=False)
    return sorted_list


def create_markdown(filtered_and_sorted_book_list, year, intro_para, md_file_path):
    with open(md_file_path, 'w') as f:
        f.write('---\n')
        f.write(f'title: Books Read in {year}\n')
        f.write(f'description: Quick reviews of the books I read in {year}\n')
        f.write(f'publishedDate: {year}/12/31\n')
        f.write('category: books\n')
        f.write('---\n\n')
        f.write(f'{intro_para}\n\n')
        # loop over book list
        for curr_book in filtered_and_sorted_book_list:
            # check whether an ISBN/ASIN exists for the Amazon affiliate link
            if curr_book['isbn'] != '':
                amazon_book_id = curr_book['isbn']
            elif curr_book['asin'] != '':
                amazon_book_id = curr_book['asin']
            else:
                amazon_book_id = 'to_fill_in'
            amazon_link = f"https://www.amazon.co.uk/dp/{amazon_book_id}/ref=nosim?tag=bionicjulia-21"
            # bold any book rated 5 stars
            if curr_book['rating'] == '5':
                f.write(
                    f"- **[{curr_book['title']}]({amazon_link}) by {curr_book['author_name']}** \n")
            else:
                f.write(
                    f"- [{curr_book['title']}]({amazon_link}) by {curr_book['author_name']} \n")
            f.write(f" {curr_book['review']}\n\n")
    print('Markdown file created')


if __name__ == '__main__':
    html_string = get_html(MAIN_URL)
    book_list = get_books_data(html_string)
    filtered_and_sorted_book_list = filter_and_sort_books(book_list, YEAR)
    create_markdown(filtered_and_sorted_book_list, YEAR, INTRO_PARA_OF_BLOG,
                    OUTPUT_MD_FILE_PATH)
Let's break it down. The main steps are to:
- Use a scraping package (Selenium) to get the HTML string for my personal Goodreads "read" page.
- Use the Beautiful Soup package to pull each individual book's data out of the HTML string.
- Sort the books by date read.
- Create a markdown file.
You can see all of these steps in the main block that runs when the script is invoked with python goodreads_scraper.py. Be sure to substitute MAIN_URL with the URL of your own Goodreads profile. You'll also need to make your profile public in your privacy settings for this to work.
MAIN_URL = 'https://www.goodreads.com/review/list/1736497-jules?order=d&ref=nav_mybooks&shelf=read&sort=date_read&utf8=%E2%9C%93'

if __name__ == '__main__':
    html_string = get_html(MAIN_URL)
    book_list = get_books_data(html_string)
    filtered_and_sorted_book_list = filter_and_sort_books(book_list, YEAR)
    create_markdown(filtered_and_sorted_book_list, YEAR, INTRO_PARA_OF_BLOG,
                    OUTPUT_MD_FILE_PATH)
Using Selenium to scrape Goodreads
Install the Python packages the script imports: selenium, webdriver-manager and beautifulsoup4 (plus lxml, which Beautiful Soup uses as its parser here). I'm using the Chrome browser to access my Goodreads page. Goodreads uses an infinite scroll by default to gradually load in the books. Because of this, I added a JavaScript snippet that scrolls to the bottom of the page three times (set by the constant ROUGH_NO_PAGES_FOR_THIS_YEAR_TO_SCROLL = 3). Amend this to whatever you need to ensure the infinite scroll covers all the books you've read in the year. I also added a time.sleep(3) after each scroll to give Goodreads time to load in the new books.
import time

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

ROUGH_NO_PAGES_FOR_THIS_YEAR_TO_SCROLL = 3


def get_html(url):
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
    driver.get(url)
    # Handle infinite scroll: jump to the bottom of the page, wait for the
    # next batch of books to load, and repeat.
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    page = 0
    while page < ROUGH_NO_PAGES_FOR_THIS_YEAR_TO_SCROLL:
        time.sleep(3)  # give Goodreads time to load in the new books
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        page += 1
    # After getting all scrolled pages, extract the source code.
    html_string = driver.page_source
    driver.quit()
    return html_string
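If you'd rather not guess at the number of pages to scroll, a common variation is to keep scrolling until the page height stops changing. Here's a minimal sketch of that approach (a hypothetical alternative I didn't use, built from the same Selenium calls; note it will scroll through your entire "read" shelf, not just the current year):

def get_html_scroll_until_stable(url):
    # Hypothetical alternative to get_html: scroll until the page height
    # stops growing, i.e. until no more books load in.
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
    driver.get(url)
    last_height = driver.execute_script('return document.body.scrollHeight;')
    while True:
        driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
        time.sleep(3)  # give Goodreads time to load in the next batch of books
        new_height = driver.execute_script('return document.body.scrollHeight;')
        if new_height == last_height:
            break  # no new books loaded, so we've reached the end of the shelf
        last_height = new_height
    html_string = driver.page_source
    driver.quit()
    return html_string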
Get each book's data
To do this, I used the Beautiful Soup package. The code below should be fairly self-explanatory (I had never used Beautiful Soup before this, and it was easy to get my head around). What we're essentially doing is finding the HTML table that contains all the books, looping through its rows, scraping the data we need for each book out of the HTML (storing it in a dictionary called book_dict), and appending each book to an overall list called book_list. I've extracted more fields here than I actually use, just for reference.
Note that Goodreads doesn't return the star rating as a number, but as text. I therefore added a get_rating_from_text function to convert it to a number.
from bs4 import BeautifulSoup


def get_rating_from_text(rating_text):
    # Goodreads reports star ratings as text, not numbers.
    if not rating_text:
        return '0'
    rating_dict = {'did not like it': '1',
                   'it was ok': '2',
                   'liked it': '3',
                   'really liked it': '4',
                   'it was amazing': '5'}
    return rating_dict.get(rating_text, '0')  # default to '0' for unexpected text


def get_books_data(html_string):
    soup = BeautifulSoup(html_string, 'lxml')
    table = soup.find_all('table', {'id': 'books'})[0]
    table_rows = table.find_all('tr')
    book_list = []
    for tr in table_rows[1:]:  # skip the header row
        book_dict = {}
        # parse cover_url
        td = tr.find_all('td', {'class': 'field cover'})[0]
        img = td.find_all('img')[0]
        book_dict['cover_url'] = img['src']
        # parse title and book's url
        td = tr.find_all('td', {'class': 'field title'})[0]
        a_link = td.find_all('a')[0]
        book_dict['title'] = a_link.get('title')
        book_dict['book_url'] = a_link.get('href')
        # parse author and author_url ('Surname, First Name' -> 'First Name Surname')
        td = tr.find_all('td', {'class': 'field author'})[0]
        a_link = td.find_all('a')[0]
        author_name_reversed = a_link.text
        author_name = ' '.join(author_name_reversed.split(', ')[::-1])
        book_dict['author_name'] = author_name
        book_dict['author_url'] = a_link.get('href')
        # parse rating
        td = tr.find_all('td', {'class': 'field rating'})[0]
        span = td.find_all('span', {'class': 'staticStars notranslate'})[0]
        rating_text = span.get('title')
        book_dict['rating'] = get_rating_from_text(rating_text)
        # parse review (the last span holds the full review text)
        review = ''
        td = tr.find_all('td', {'class': 'field review'})
        if td:
            spans = td[0].find_all('span')
            if spans:
                lines = [str(i) for i in spans[-1].contents]
                review = ' '.join(lines)
        book_dict['review'] = review
        # parse date_read
        td = tr.find_all('td', {'class': 'field date_read'})[0]
        span = td.find_all('span', {'class': 'date_read_value'})[0]
        book_dict['date_read'] = span.text
        # parse isbn / asin
        td = tr.find_all('td', {'class': 'field isbn'})[0]
        div = td.find_all('div', {'class': 'value'})[0]
        book_dict['isbn'] = div.text.strip()
        td = tr.find_all('td', {'class': 'field asin'})[0]
        div = td.find_all('div', {'class': 'value'})[0]
        book_dict['asin'] = div.text.strip()
        book_list.append(book_dict)
    return book_list
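For reference, each entry appended to book_list ends up with this shape (all of the values below are made up for illustration):

{
    'cover_url': 'https://images.gr-assets.com/books/example.jpg',
    'title': 'Example Book Title',
    'book_url': '/book/show/12345.Example_Book_Title',
    'author_name': 'Jane Doe',
    'author_url': 'https://www.goodreads.com/author/show/678.Jane_Doe',
    'rating': '4',
    'review': 'A couple of sentences of review text.',
    'date_read': 'Mar 14, 2022',
    'isbn': '0123456789',
    'asin': '',
}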
Sort books
At this point, we've got a list of dictionaries of book data. There's a bug in Goodreads whereby if a book's date read is the 1st of the month, only the month and year are returned. I therefore added a validate_date_format function to check and correct for this. I also wanted my books listed in chronological order, which is what the filter_and_sort_books function does.
from datetime import datetime


def validate_date_format(date_text):
    try:
        return datetime.strptime(date_text, '%b %d, %Y').date()
    except ValueError:
        # Goodreads drops the day when a book was read on the 1st of the
        # month (e.g. 'Dec 2022'), so fill the day back in.
        split_date = date_text.split(' ')
        corrected_date = f'{split_date[0]} 01, {split_date[1]}'
        return datetime.strptime(corrected_date, '%b %d, %Y').date()


def filter_and_sort_books(book_list, year):
    filtered_list = [i for i in book_list if year in i['date_read']]
    sorted_list = sorted(
        filtered_list, key=lambda k: validate_date_format(k['date_read']), reverse=False)
    return sorted_list
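To make the date quirk concrete, here's how validate_date_format handles both formats (the dates themselves are just illustrative):

validate_date_format('Nov 15, 2022')  # -> datetime.date(2022, 11, 15)
validate_date_format('Dec 2022')      # -> datetime.date(2022, 12, 1), via the ValueError fallback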
Create markdown document
The final step is creating the markdown document. My summaries take the same format each year, with a bunch of static text, followed by the list of books.
There's no easy way to count the number of fiction vs. non-fiction books, so this still needs to be done manually. I apply bold formatting to any book I've rated 5 stars, so the create_markdown function takes this into account, in addition to applying my Amazon affiliate link (which is why I pull the ISBN / ASIN for each book).
YEAR = '2022'
OUTPUT_MD_FILE_PATH = f'books-read-in-{YEAR}.md'
INTRO_PARA_OF_BLOG = f"Here's a quick rundown of all the books I read in {YEAR}. Books I particularly enjoyed and would recommend are in bold. Total books read: x. Non-fiction: x. Fiction: x."


def create_markdown(filtered_and_sorted_book_list, year, intro_para, md_file_path):
    with open(md_file_path, 'w') as f:
        f.write('---\n')
        f.write(f'title: Books Read in {year}\n')
        f.write(f'description: Quick reviews of the books I read in {year}\n')
        f.write(f'publishedDate: {year}/12/31\n')
        f.write('category: books\n')
        f.write('---\n\n')
        f.write(f'{intro_para}\n\n')
        # loop over book list
        for curr_book in filtered_and_sorted_book_list:
            # check whether an ISBN/ASIN exists for the Amazon affiliate link
            if curr_book['isbn'] != '':
                amazon_book_id = curr_book['isbn']
            elif curr_book['asin'] != '':
                amazon_book_id = curr_book['asin']
            else:
                amazon_book_id = 'to_fill_in'
            amazon_link = f"https://www.amazon.co.uk/dp/{amazon_book_id}/ref=nosim?tag=bionicjulia-21"
            # bold any book rated 5 stars
            if curr_book['rating'] == '5':
                f.write(
                    f"- **[{curr_book['title']}]({amazon_link}) by {curr_book['author_name']}** \n")
            else:
                f.write(
                    f"- [{curr_book['title']}]({amazon_link}) by {curr_book['author_name']} \n")
            f.write(f" {curr_book['review']}\n\n")
    print('Markdown file created')
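For a 5-star book with a known ISBN, the generated list entry looks something like this (title, author and ISBN made up for illustration):

- **[Example Book Title](https://www.amazon.co.uk/dp/0123456789/ref=nosim?tag=bionicjulia-21) by Jane Doe**
 A couple of sentences of review text.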
And that's it. From here, it's just a final check for any missing ISBN / ASIN numbers, and filling in the fiction vs. non-fiction counts. Happy reading!
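If you want to automate that last ISBN / ASIN check too, here's a minimal sketch of a hypothetical helper (not part of the script above) that flags any books whose Amazon link will need filling in by hand:

def print_books_missing_ids(book_list):
    # Hypothetical helper: list books that will get the 'to_fill_in'
    # placeholder in their Amazon link.
    for book in book_list:
        if book['isbn'] == '' and book['asin'] == '':
            print(f"Missing ISBN/ASIN: {book['title']} by {book['author_name']}")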