Crime Analysis Using R

Crime Analysis Using R

·

15 min read

The tutorial will work through practical examples showing you how to conduct crime analysis using R.

Introduction

Objectives

  • Visualizing spatial and temporal trends of criminal activity
  • Analyzing factors that may affect unlawful behavior

Study Area

Software Needed

Download R and RStudio

First, you'll download and set up R and RStudio, a free integrated development environment for R. RStudio helps you work in R by providing a coding platform with access to CRAN, the Comprehensive R Archive Network, which contains thousands of R libraries, a built-in viewer for charts and graphs, and other useful features.

  • If necessary, download R 3.4.0 or later. Accept all defaults in the installation wizard.
  • If necessary, download RStudio Desktop. Accept all defaults in the installation wizard.

Install R libraries

if(!require(readr)) install.packages("readr")
if(!require(dplyr)) install.packages("dplyr")
if(!require(DT)) install.packages("DT")
if(!require(ggrepel)) install.packages("ggrepel")
if(!require(leaflet)) install.packages("leaflet")

Data Preprocessing

Read the data

Load the data using readr and read_csv().

library(readr)
# path <- "http://spatial.binghamton.edu/projects/crime/data/SF_Crime_2007_2016.csv"
path <- "D:\\Data\\Crime\\SF_Crime_2007_2016.csv"
df <- read_csv(path)

Display Data

Display the data using DT and datatable().

library(DT)
df_sub <- df[1:100,]  # display the first 100 rows
df_sub$Time <- as.character(df_sub$Time) 
datatable(df_sub, options = list(pageLength = 5,scrollX='400px'))

sprintf("Number of Rows in Dataframe: %s", format(nrow(df),big.mark = ","))

Preprocess Data

The All-Caps text is difficult to read. Let's force the text in the appropriate columns into proper case.

# str(df)
proper_case <- function(x) {
  return (gsub("\\b([A-Z])([A-Z]+)", "\\U\\1\\L\\2" , x, perl=TRUE))
}

library(dplyr)
df <- df %>% mutate(Category = proper_case(Category),
                    Descript = proper_case(Descript),
                    PdDistrict = proper_case(PdDistrict),
                    Resolution = proper_case(Resolution),
                    Time = as.character(Time))
df_sub <- df[1:100,]  # display the first 100 rows
datatable(df_sub, options = list(pageLength = 5,scrollX='400px'))

image

Visualize Data

Crime across space

Display crime incident locations on the map using leaflet. Click icons on the map to show incident details.

library(leaflet)

data <- df[1:10000,] # display the first 10,000 rows
data$popup <- paste("<b>Incident #: </b>", data$IncidntNum, "<br>", "<b>Category: </b>", data$Category,
                    "<br>", "<b>Description: </b>", data$Descript,
                    "<br>", "<b>Day of week: </b>", data$DayOfWeek,
                    "<br>", "<b>Date: </b>", data$Date,
                    "<br>", "<b>Time: </b>", data$Time,
                    "<br>", "<b>PD district: </b>", data$PdDistrict,
                    "<br>", "<b>Resolution: </b>", data$Resolution,
                    "<br>", "<b>Address: </b>", data$Address,
                    "<br>", "<b>Longitude: </b>", data$X,
                    "<br>", "<b>Latitude: </b>", data$Y)

leaflet(data, width = "100%") %>% addTiles() %>%
  addTiles(group = "OSM (default)") %>%
  addProviderTiles(provider = "Esri.WorldStreetMap",group = "World StreetMap") %>%
  addProviderTiles(provider = "Esri.WorldImagery",group = "World Imagery") %>%
  # addProviderTiles(provider = "NASAGIBS.ViirsEarthAtNight2012",group = "Nighttime Imagery") %>%
  addMarkers(lng = ~X, lat = ~Y, popup = data$popup, clusterOptions = markerClusterOptions()) %>%
  addLayersControl(
    baseGroups = c("OSM (default)","World StreetMap", "World Imagery"),
    options = layersControlOptions(collapsed = FALSE)
  )

image

Crime Over Time

library(dplyr)

df_crime_daily <- df %>%
  mutate(Date = as.Date(Date, "%m/%d/%Y")) %>%
  group_by(Date) %>%
  summarize(count = n()) %>%
  arrange(Date)

library(ggplot2)
library(scales)
plot <- ggplot(df_crime_daily, aes(x = Date, y = count)) +
  geom_line(color = "#F2CA27", size = 0.1) +
  geom_smooth(color = "#1A1A1A") +
  # fte_theme() +
  scale_x_date(breaks = date_breaks("1 year"), labels = date_format("%Y")) +
  labs(x = "Date of Crime", y = "Number of Crimes", title = "Daily Crimes in San Francisco from 2007 – 2016")
plot

image

Aggregate Data

Summarize the data by incident category.

df_category <- sort(table(df$Category),decreasing = TRUE)
df_category <- data.frame(df_category[df_category > 10000])
colnames(df_category) <- c("Category", "Frequency")
df_category$Percentage <- df_category$Frequency / sum(df_category$Frequency)
datatable(df_category, options = list(scrollX='400px'))

image

Create a bar plot based on the incident category.

library(ggplot2)
library(ggrepel)
bp<-ggplot(df_category, aes(x=Category, y=Frequency, fill=Category)) + geom_bar(stat="identity") + 
  theme(axis.text.x=element_blank()) + geom_text_repel(data=df_category, aes(label=Category))
bp

image

Create a pie chart based on the incident category.

bp<-ggplot(df_category, aes(x="", y=Percentage, fill=Category)) + geom_bar(stat="identity") 
pie <- bp + coord_polar("y") 
pie

image

Theft Over Time

Create a chart of crimes (Larceny/Theft) over time.

df_theft <- df %>% filter(grepl("Larceny/Theft", Category))

df_theft_daily <- df_theft %>%
  mutate(Date = as.Date(Date, "%m/%d/%Y")) %>%
  group_by(Date) %>%
  summarize(count = n()) %>%
  arrange(Date)

library(ggplot2)
library(scales)
plot <- ggplot(df_theft_daily, aes(x = Date, y = count)) +
  geom_line(color = "#F2CA27", size = 0.1) +
  geom_smooth(color = "#1A1A1A") +
  # fte_theme() +
  scale_x_date(breaks = date_breaks("1 year"), labels = date_format("%Y")) +
  labs(x = "Date of Theft", y = "Number of Thefts", title = "Daily Thefts in San Francisco from 2007 – 2016")
plot

image

Theft Time Heatmap

Aggregate counts of thefts by Day-of-Week and Time to create heat map. Fortunately, the Day-Of-Week part is pre-derived, but Hour is slightly harder.

get_hour <- function(x) {
  return (as.numeric(strsplit(x,":")[[1]][1]))
}

df_theft_time <- df_theft %>%
  mutate(Hour = sapply(Time, get_hour)) %>%
  group_by(DayOfWeek, Hour) %>%
  summarize(count = n())
# df_theft_time %>% head(10)
datatable(df_theft_time, options = list(scrollX='400px'))

image

Reorder and format Factors.

dow_format <- c("Sunday","Monday","Tuesday","Wednesday","Thursday","Friday","Saturday")
hour_format <- c(paste(c(12,1:11),"AM"), paste(c(12,1:11),"PM"))

df_theft_time$DayOfWeek <- factor(df_theft_time$DayOfWeek, level = rev(dow_format))
df_theft_time$Hour <- factor(df_theft_time$Hour, level = 0:23, label = hour_format)

# df_theft_time %>% head(10)
datatable(df_theft_time, options = list(scrollX='400px'))

image

Create Time Heatmap

plot <- ggplot(df_theft_time, aes(x = Hour, y = DayOfWeek, fill = count)) +
  geom_tile() +
  # fte_theme() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.6), legend.title = element_blank(), legend.position="top", legend.direction="horizontal", legend.key.width=unit(2, "cm"), legend.key.height=unit(0.25, "cm"), legend.margin=unit(-0.5,"cm"), panel.margin=element_blank()) +
  labs(x = "Hour of Theft (Local Time)", y = "Day of Week of Theft", title = "Number of Thefts in San Francisco from 2007 – 2016, by Time of Theft") +
  scale_fill_gradient(low = "white", high = "#27AE60", labels = comma)
plot

image

Hmm, why is there a surge at 6-7PM on weekdays?

Arrest Over Time

Create a chart of arrests over time.

df_arrest <- df %>% filter(grepl("Arrest", Resolution))

df_arrest_daily <- df_arrest %>%
  mutate(Date = as.Date(Date, "%m/%d/%Y")) %>%
  group_by(Date) %>%
  summarize(count = n()) %>%
  arrange(Date)

library(ggplot2)
library(scales)
plot <- ggplot(df_arrest_daily, aes(x = Date, y = count)) +
  geom_line(color = "#F2CA27", size = 0.1) +
  geom_smooth(color = "#1A1A1A") +
  # fte_theme() +
  scale_x_date(breaks = date_breaks("1 year"), labels = date_format("%Y")) +
  labs(x = "Date of Arrest", y = "# of Police Arrests", title = "Daily Police Arrests in San Francisco from 2007 – 2016")
plot

image

get_hour <- function(x) {
  return (as.numeric(strsplit(x,":")[[1]][1]))
}

df_arrest_time <- df_arrest %>%
  mutate(Hour = sapply(Time, get_hour)) %>%
  group_by(DayOfWeek, Hour) %>%
  summarize(count = n())

dow_format <- c("Sunday","Monday","Tuesday","Wednesday","Thursday","Friday","Saturday")
hour_format <- c(paste(c(12,1:11),"AM"), paste(c(12,1:11),"PM"))

df_arrest_time$DayOfWeek <- factor(df_arrest_time$DayOfWeek, level = rev(dow_format))
df_arrest_time$Hour <- factor(df_arrest_time$Hour, level = 0:23, label = hour_format)

plot <- ggplot(df_arrest_time, aes(x = Hour, y = DayOfWeek, fill = count)) +
  geom_tile() +
  # fte_theme() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.6), legend.title = element_blank(), legend.position="top", legend.direction="horizontal", legend.key.width=unit(2, "cm"), legend.key.height=unit(0.25, "cm"), legend.margin=unit(-0.5,"cm"), panel.margin=element_blank()) +
  labs(x = "Hour of Arrest (Local Time)", y = "Day of Week of Arrest", title = "Number of Police Arrests in San Francisco from 2007 – 2016, by Time of Arrest") +
  scale_fill_gradient(low = "white", high = "#27AE60", labels = comma)
plot

image

Hmm, why is there a surge on Wednesday afternoon, and at 4-5PM on all days? Let's look at subgroups to verify there isn't a latent factor.

Correlation Analysis

Factor by Crime Category

Certain types of crime may be more time dependent. (e.g., more traffic violations when people leave work)

df_top_crimes <- df_arrest %>%
  group_by(Category) %>% 
  summarize(count = n()) %>%
  arrange(desc(count))

datatable(df_top_crimes, options = list(pageLength = 10,scrollX='400px'))

image

df_arrest_time_crime <- df_arrest %>%
  filter(Category %in% df_top_crimes$Category[2:19]) %>%
  mutate(Hour = sapply(Time, get_hour)) %>%
  group_by(Category, DayOfWeek, Hour) %>% 
  summarize(count = n())

df_arrest_time_crime$DayOfWeek <- factor(df_arrest_time_crime$DayOfWeek, level = rev(dow_format))
df_arrest_time_crime$Hour <- factor(df_arrest_time_crime$Hour, level = 0:23, label = hour_format)

datatable(df_arrest_time_crime, options = list(pageLength = 10,scrollX='400px'))

image

plot <- ggplot(df_arrest_time_crime, aes(x = Hour, y = DayOfWeek, fill = count)) +
  geom_tile() +
  # fte_theme() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.6, size = 4)) +
  labs(x = "Hour of Arrest (Local Time)", y = "Day of Week of Arrest", title = "Number of Police Arrests in San Francisco from 2007 – 2016, by Category and Time of Arrest") +
  scale_fill_gradient(low = "white", high = "#2980B9") +
  facet_wrap(~ Category, nrow = 6)
plot

image

Good, but the gradients aren't helpful because they are not normalized. We need to normalize the range on each facet. (unfortunately, this makes the value of the gradient unhelpful)

df_arrest_time_crime <- df_arrest_time_crime %>%
  group_by(Category) %>%
  mutate(norm = count/sum(count))

datatable(df_arrest_time_crime, options = list(pageLength = 10,scrollX='400px'))

image

plot <- ggplot(df_arrest_time_crime, aes(x = Hour, y = DayOfWeek, fill = norm)) +
  geom_tile() +
  # fte_theme() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.6, size = 4)) +
  labs(x = "Hour of Arrest (Local Time)", y = "Day of Week of Arrest", title = "Police Arrests in San Francisco from 2007 – 2016 by Time of Arrest, Normalized by Type of Crime") +
  scale_fill_gradient(low = "white", high = "#2980B9") +
  facet_wrap(~ Category, nrow = 6)
plot

image

Factor by Police District

Same as above, but with a different facet.

df_arrest_time_district <- df_arrest %>%
  mutate(Hour = sapply(Time, get_hour)) %>%
  group_by(PdDistrict, DayOfWeek, Hour) %>% 
  summarize(count = n()) %>%
  group_by(PdDistrict) %>%
  mutate(norm = count/sum(count))

df_arrest_time_district$DayOfWeek <- factor(df_arrest_time_district$DayOfWeek, level = rev(dow_format))
df_arrest_time_district$Hour <- factor(df_arrest_time_district$Hour, level = 0:23, label = hour_format)

datatable(df_arrest_time_district, options = list(pageLength = 10,scrollX='400px'))

image

plot <- ggplot(df_arrest_time_district, aes(x = Hour, y = DayOfWeek, fill = norm)) +
  geom_tile() +
  # fte_theme() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.6, size = 4)) +
  labs(x = "Hour of Arrest (Local Time)", y = "Day of Week of Arrest", title = "Police Arrests in San Francisco from 2007 – 2016 by Time of Arrest, Normalized by Station") +
  scale_fill_gradient(low = "white", high = "#8E44AD") +
  facet_wrap(~ PdDistrict, nrow = 5)
plot

image

Factor by Month

If crime is tied to activities, the period at which activies end may impact.

df_arrest_time_month <- df_arrest %>%
  mutate(Month = format(as.Date(Date, "%m/%d/%Y"), "%B"), Hour = sapply(Time, get_hour)) %>%
  group_by(Month, DayOfWeek, Hour) %>% 
  summarize(count = n()) %>%
  group_by(Month) %>%
  mutate(norm = count/sum(count))

df_arrest_time_month$DayOfWeek <- factor(df_arrest_time_month$DayOfWeek, level = rev(dow_format))
df_arrest_time_month$Hour <- factor(df_arrest_time_month$Hour, level = 0:23, label = hour_format)

# Set order of month facets by chronological order instead of alphabetical
df_arrest_time_month$Month <- factor(df_arrest_time_month$Month,
                                     level = c("January","February","March","April","May","June","July","August","September","October","November","December"))

plot <- ggplot(df_arrest_time_month, aes(x = Hour, y = DayOfWeek, fill = norm)) +
  geom_tile() +
  # fte_theme() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.6, size = 4)) +
  labs(x = "Hour of Arrest (Local Time)", y = "Day of Week of Arrest", title = "Police Arrests in San Francisco from 2007 – 2016 by Time of Arrest, Normalized by Month") +
  scale_fill_gradient(low = "white", high = "#E74C3C") +
  facet_wrap(~ Month, nrow = 4)
plot

image

Factor By Year

Perhaps things changed overtime?

df_arrest_time_year <- df_arrest %>%
  mutate(Year = format(as.Date(Date, "%m/%d/%Y"), "%Y"), Hour = sapply(Time, get_hour)) %>%
  group_by(Year, DayOfWeek, Hour) %>% 
  summarize(count = n()) %>%
  group_by(Year) %>%
  mutate(norm = count/sum(count))

df_arrest_time_year$DayOfWeek <- factor(df_arrest_time_year$DayOfWeek, level = rev(dow_format))
df_arrest_time_year$Hour <- factor(df_arrest_time_year$Hour, level = 0:23, label = hour_format)

plot <- ggplot(df_arrest_time_year, aes(x = Hour, y = DayOfWeek, fill = norm)) +
  geom_tile() +
  # fte_theme() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.6, size = 4)) +
  labs(x = "Hour of Arrest (Local Time)", y = "Day of Week of Arrest", title = "Police Arrests in San Francisco from 2007 – 2016 by Time of Arrest, Normalized by Year") +
  scale_fill_gradient(low = "white", high = "#E67E22") +
  facet_wrap(~ Year, nrow = 6)
plot

img

Analyze Crime Hot Spots

  • Create Space Time Cube By Aggregating Points
  • Emerging Hot Spot Analysis

Enhance Data Layer

Add additional attributes to the dataset.

Optimized Hot Spot Analysis

Identify areas with unusually high crime rates.

Correlation Analysis

References

Did you find this article valuable?

Support Qiusheng Wu by becoming a sponsor. Any amount is appreciated!