---
title: "Introduction to `guardianapi`"
author: "Evan Odell"
date: "`r Sys.Date()`"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{Introduction to guardianapi}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

```{r setup, include = FALSE}
# Merge each chunk's source and output into one block, prefixing output lines
# with "#>".
knitr::opts_chunk$set(collapse = TRUE, comment = "#>")

# Dummy value for the `gu.API.key` option; the chunks below that call the API
# are marked eval=FALSE, so no real key is needed to build this document.
options(gu.API.key = "test")
```


# Functions

`guardianapi` contains functions to search and retrieve articles, tags and editions from the [Guardian open data platform](https://open-platform.theguardian.com/documentation/).


Let's look at a few reviewers. For example, I noticed that comedy critic Brian Logan seemed to give out very few five-star or one-star reviews, so I wanted to see if that was true. I've included all his reviews from 2002--2018.

```{r logan-example, eval=FALSE}
library(guardianapi)
library(dplyr)
library(lubridate)
library(ggplot2)

# Fetch the items returned for Brian Logan's contributor profile.
logan_search <- gu_items(query = "profile/brianlogan")

# Work with ratings as numbers; values that cannot be converted become NA and
# are dropped below (presumably items that are not starred reviews).
logan_search$star_rating <- as.numeric(logan_search$star_rating)

# Keep rated reviews from 2002-2018. Filtering on the calendar year works
# whether `web_publication_date` is a Date or a date-time, and keeps reviews
# published at any time of day on 31 December 2018.
logan_reviews <- logan_search %>%
  filter(!is.na(star_rating),
         between(year(web_publication_date), 2002, 2018))

logan_reviews$year <- as.factor(year(logan_reviews$web_publication_date))

# Count reviews per year and rating, then express each count as a share of
# that year's reviews. Regrouping by year explicitly makes the denominator of
# `perc` obvious instead of relying on summarise() dropping one grouping level.
logan_summary <- logan_reviews %>%
  group_by(year, star_rating) %>%
  summarise(count = n()) %>%
  group_by(year) %>%
  mutate(perc = count / sum(count)) %>%
  ungroup() %>%
  # Order the levels from 5 down to 1 so the legend lists the best rating first.
  mutate(star_rating = factor(star_rating, levels = c(5, 4, 3, 2, 1)))

# One line per star rating showing the number of reviews in each year.
p_logan <- ggplot(data = logan_summary,
                  aes(x = year, y = count, group = star_rating)) +
  geom_line(aes(colour = star_rating), size = 1, alpha = 0.9) +
  scale_colour_viridis_d(name = "Rating") +
  labs(x = "Year", y = "Number of Reviews with Rating") +
  theme(axis.text.x = element_text(angle = 45, vjust = 0.5))

p_logan

```

```{r logan-example-plot, echo=FALSE, out.width = '100%'}
# Embed the pre-rendered image; the plotting chunk above is not evaluated.
knitr::include_graphics(path = "logan-plot.png")
```


```{r logan-area, eval=FALSE}
# Stacked area chart of each rating's share of the year's reviews (`perc`).
p_logan_area <- ggplot(data = logan_summary,
                       aes(x = year, y = perc, group = star_rating)) +
  geom_area(aes(fill = star_rating)) +
  scale_y_continuous(labels = scales::percent) +
  scale_fill_viridis_d(name = "Rating") +
  # The y axis is a percentage of the year's reviews, not a count.
  labs(x = "Year", y = "Share of Reviews with Rating") +
  theme(axis.text.x = element_text(angle = 45, vjust = 0.5))

p_logan_area
```

```{r logan-area-plot, echo=FALSE, out.width = '100%'}
# Embed the pre-rendered image; the plotting chunk above is not evaluated.
knitr::include_graphics(path = "logan-area.png")
```

As you can see here, Brian Logan is pretty stingy with five-star reviews, and didn't give out a single five-star rating in all of 2017. Likewise, he hasn't completely panned any act with a single star since 2014.

Now let's take a look at film critic Peter Bradshaw. I've used the same time span, and I've removed the single [0-star rating](https://www.theguardian.com/culture/2002/oct/04/artsfeatures8) given to the 2002 film [Boat Trip](https://www.imdb.com/title/tt0285462/). There are more than four times as many film reviews from Peter Bradshaw as there are comedy reviews from Brian Logan over the same time period.


```{r bradshaw-example, eval=FALSE}
library(dplyr)
library(lubridate)
library(ggplot2)

# Fetch the items returned for Peter Bradshaw's contributor profile.
bradshaw_search <- gu_items(query = "profile/peterbradshaw")

# Work with ratings as numbers; values that cannot be converted become NA and
# are dropped below (presumably items that are not starred reviews).
bradshaw_search$star_rating <- as.numeric(bradshaw_search$star_rating)

# Keep rated reviews from 2002-2018 and drop the lone 0-star review.
# Filtering on the calendar year works whether `web_publication_date` is a
# Date or a date-time, and keeps reviews published at any time of day on
# 31 December 2018.
bradshaw_reviews <- bradshaw_search %>%
  filter(!is.na(star_rating), star_rating != 0,
         between(year(web_publication_date), 2002, 2018))

bradshaw_reviews$year <- as.factor(year(bradshaw_reviews$web_publication_date))

# Count reviews per year and rating, then express each count as a share of
# that year's reviews. Regrouping by year explicitly makes the denominator of
# `perc` obvious instead of relying on summarise() dropping one grouping level.
bradshaw_summary <- bradshaw_reviews %>%
  group_by(year, star_rating) %>%
  summarise(count = n()) %>%
  group_by(year) %>%
  mutate(perc = count / sum(count)) %>%
  ungroup() %>%
  # Order the levels from 5 down to 1 so the legend lists the best rating first.
  mutate(star_rating = factor(star_rating, levels = c(5, 4, 3, 2, 1)))

# One line per star rating showing the number of reviews in each year.
p_bradshaw <- ggplot(data = bradshaw_summary,
                     aes(x = year, y = count, group = star_rating)) +
  geom_line(aes(colour = star_rating), size = 1, alpha = 0.9) +
  scale_colour_viridis_d(name = "Rating") +
  labs(x = "Year", y = "Number of Reviews with Rating") +
  theme(axis.text.x = element_text(angle = 45, vjust = 0.5))

p_bradshaw

```

```{r bradshaw-example-plot, echo=FALSE, out.width = '100%'}
# Embed the pre-rendered image; the plotting chunk above is not evaluated.
knitr::include_graphics(path = "bradshaw-plot.png")
```


```{r bradshaw-area, eval=FALSE}
# Stacked area chart of each rating's share of the year's reviews (`perc`).
p_bradshaw_area <- ggplot(data = bradshaw_summary,
                          aes(x = year, y = perc, group = star_rating)) +
  geom_area(aes(fill = star_rating)) +
  scale_y_continuous(labels = scales::percent) +
  scale_fill_viridis_d(name = "Rating") +
  # The y axis is a percentage of the year's reviews, not a count.
  labs(x = "Year", y = "Share of Reviews with Rating") +
  theme(axis.text.x = element_text(angle = 45, vjust = 0.5))

p_bradshaw_area
```

```{r bradshaw-area-plot, echo=FALSE, out.width = '100%'}
# Embed the pre-rendered image; the plotting chunk above is not evaluated.
knitr::include_graphics(path = "bradshaw-area.png")
```


We can compare the distributions of ratings given by the two critics.

```{r comp-hist, eval=FALSE}

# Label each critic's reviews so they can be told apart once combined.
bradshaw_reviews$byline <- "Peter Bradshaw"
logan_reviews$byline <- "Brian Logan"

comp_df <- bind_rows(logan_reviews, bradshaw_reviews) %>%
  mutate(star_rating = as.numeric(star_rating))

# Share of each critic's reviews at each star rating, used for the line
# overlay. Ungroup at the end so no grouping leaks into later use.
comp_df2 <- comp_df %>%
  group_by(star_rating, byline) %>%
  summarise(count = n()) %>%
  group_by(byline) %>%
  mutate(perc = count / sum(count)) %>%
  ungroup()

# Dodged density histogram per critic with the per-critic shares drawn on top.
# NOTE(review): `..density..` still works but newer ggplot2 releases prefer
# `after_stat(density)`; switch once the minimum ggplot2 version allows it.
comp_p <- ggplot(comp_df,
                 aes(x = star_rating, y = ..density.., fill = byline)) +
  geom_histogram(position = "dodge", bins = 5, alpha = 0.5) +
  scale_y_continuous(labels = scales::percent) +
  scale_fill_viridis_d(end = 0.9, option = "inferno") +
  labs(x = "Star Rating", y = "", fill = "") +
  theme(legend.position = "bottom") +
  geom_line(aes(x = star_rating, y = perc,
                colour = byline, group = byline), data = comp_df2,
            size = 1) +
  scale_colour_viridis_d(end = 0.9, option = "inferno") +
  # Hide the colour legend, which would repeat the fill legend. "none" is the
  # supported spelling; `guides(colour = FALSE)` is deprecated.
  guides(colour = "none")

comp_p
```

```{r bradshaw-logan-comp, echo=FALSE, out.width = '100%'}
# Embed the pre-rendered image; the plotting chunk above is not evaluated.
knitr::include_graphics(path = "logan-bradshaw-comp.png")
```


We can also use `gu_content()` for more general queries. For example, here are all the articles returned for "relationships" between the two given dates:

```{r relationships-demo, eval=FALSE}
# Search for "relationships" within a one-month window.
relations <- gu_content(
  query = "relationships",
  from_date = "2018-11-30",
  to_date = "2018-12-30"
)

# Show the returned columns compactly.
tibble::glimpse(relations)
```

```{r relations-read, echo=FALSE, message=TRUE, warning=TRUE}
# Load the cached result of the query shown above (that chunk is not run).
relations <- readr::read_rds("relations.rds")

# Mirror the displayed code, which calls glimpse(), so the rendered output
# matches what the reader sees.
tibble::glimpse(relations)
```


Use the `tag` parameter to limit articles to particular sections:


```{r relations-sex-demo, eval=FALSE}
# Same search as before, restricted to one section via `tag`.
relations_sex <- gu_content(
  query = "relationships",
  from_date = "2018-11-30",
  to_date = "2018-12-30",
  tag = "lifeandstyle/sex"
)

relations_sex
```

```{r relations-sex-read, echo=FALSE, message=TRUE, warning=TRUE}
# Load the cached result of the query shown above (that chunk is not run).
relations_sex <- readr::read_rds("relations_sex.rds")

# Mirror the displayed code, which prints the tibble, so the rendered output
# matches what the reader sees.
relations_sex
```