---
title: "DSjobtracker"
output:
  rmarkdown::html_vignette:
    fig_width: 7
    fig_height: 7
vignette: >
  %\VignetteIndexEntry{DSjobtracker}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

```{r, include = FALSE}
# Chunk defaults for the whole vignette: collapsed output, "#>" prefix, and
# warnings/messages hidden from the rendered document.
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  warning = FALSE,
  message = FALSE
)
```

```{r setup, echo=FALSE}
library(DSjobtracker)
data("DStidy_2020")
```

# Getting started with DSjobtracker

The package contains two datasets

1. `DSraw` : Raw dataset with `r nrow(DSraw)` rows and `r ncol(DSraw)` columns
2. `DStidy` : Cleaned tidy dataset with `r nrow(DStidy)` rows and `r ncol(DStidy)` columns

Both of these datasets contain information about job vacancies related to data
science, which were collected over the span of a month, by searching for
specific `Search_Term` and then following the search results to gather data
manually.

# Usage

1. Install the package from GitHub

```{r,eval=FALSE}
# install devtools if not already installed
# install.packages("devtools")
devtools::install_github("thiyangt/DSjobtracker")
```

2. Load the package

```{r,eval=FALSE}
library(DSjobtracker)
```

3. Load the dataset into your environment

```{r,eval=FALSE}
data("DStidy_2020")
```

# Overview of columns

```{r}
tibble::glimpse(DStidy_2020)
```

More information on the meanings of the column names can be accessed through
the help

```{r,eval=FALSE}
?DStidy_2020
```

# Examples

## Barplot of top twenty skills required for data science jobs

```{r,warning=FALSE,message=FALSE,dev='png'}
library(tidyr)
library(magrittr)
library(dplyr)
library(ggplot2)
library(wordcloud2)
library(viridis)
library(forcats)

theme_set(theme_minimal())

# One row per skill column with the sum of its indicator values, ascending.
# The conversion assumes the indicator columns are factors whose level labels
# are numeric (presumably "0"/"1") -- confirm against ?DStidy_2020.
skills_long <- DStidy_2020 %>%
  select(R:Bahasa_Malaysia) %>%
  pivot_longer(everything(), values_to = "Value", names_to = "Name") %>%
  mutate(Value = as.numeric(levels(Value))[Value]) %>%
  group_by(Name) %>%
  summarize(Total = sum(Value)) %>%
  arrange(Total)

skills_long %>%
  # Freeze the ascending order as factor levels so the flipped bars are sorted.
  mutate(Name = factor(Name, levels = .$Name)) %>%
  # slice_max() replaces the superseded top_n() and names the ranking column
  # explicitly; ties at the cut-off are kept, as before.
  slice_max(Total, n = 20) %>%
  ggplot(aes(x = Name, y = Total)) +
  geom_bar(stat = "identity") +
  geom_label(aes(label = Total),
    nudge_y = -10, size = 3.25,
    label.padding = unit(0.125, "lines")
  ) +
  coord_flip() +
  labs(
    title = "Top twenty skills required for data science jobs",
    x = "Skill Required",
    y = "No of job vacancies"
  )
```

## Wordcloud of software skills

```{r}
# Indicator columns that are excluded when only software skills are wanted.
not_software_columns <- c(
  "Presentation_Skills", "Data_visualization", "Spreadsheets",
  "BigData", "Communication", "Data_warehouse",
  "cloud_storage", "Google_Cloud", "Machine_Learning",
  "Computer_vision", "Deep_Learning", "RDBMS",
  "web_design_and_development_tools", "AI",
  "Natural_Language_Processing(NLP)", "graphics_and_design_skills",
  "Data_marketing", "SEO", "Content_Management",
  "Data_Pipelines", "MPP_Platforms", "agile_execution",
  "Data_management", "Data_mining", "Data_science",
  "Web_Analytic_tools", "IOT", "Numerical_Analysis",
  "Finance_Knowledge", "Economic", "Investment_Knowledge",
  "Problem_Solving", "Korean_language", "Team_Handling",
  "Debtor_reconcilation", "Payroll_management", "Bayesian",
  "Optimization", "Bahasa_Malaysia"
)
```

```{r}
indicators <- DStidy_2020 %>%
  select(R:Bahasa_Malaysia)

# any_of() drops the listed columns and ignores names that are not present.
software_indicators <- indicators %>%
  select(-any_of(not_software_columns))

software_indicators_long <- software_indicators %>%
  pivot_longer(everything(), values_to = "Value", names_to = "Name") %>%
  mutate(Value = as.numeric(levels(Value))[Value]) %>%
  group_by(Name) %>%
  summarize(Total = sum(Value)) %>%
  arrange(Total)

# log(0) is -Inf, so skills that never occur are removed before the log is
# taken (the same guard the job-category plot below applies).
wordcloud_data <- software_indicators_long %>%
  filter(Total > 0) %>%
  transmute(word = Name, freq = log(Total))

wordcloud2(
  wordcloud_data,
  size = 0.35,
  minRotation = pi / 2,
  maxRotation = pi / 2,
  # One colour per word that is actually drawn.
  color = viridis(nrow(wordcloud_data)),
  fontFamily = "Montserrat"
)
```

*The log of the counts was used to visualize them better.*

## Required experience and education

```{r}
# Number of vacancies for every experience / education combination; rows
# without an education category are left out.
count_data <- DStidy_2020 %>%
  filter(!is.na(Edu_Category)) %>%
  count(Experience_Category, Edu_Category)

# Largest count over all combinations.
# NOTE(review): the annotation text and the arrow coordinates below are
# hard-coded for the "less than 2 years / BSc" point -- confirm that this is
# the combination that attains max_vacancies.
max_vacancies <- max(count_data$n)

count_data %>%
  ggplot(aes(
    x = fct_rev(Experience_Category), y = n,
    color = Edu_Category, size = n
  )) +
  geom_point() +
  geom_curve(
    data = tibble(
      x1 = c(4.4), x2 = c(4),
      y1 = c(62.5), y2 = c(58)
    ),
    aes(x = x1, y = y1, xend = x2, yend = y2),
    arrow = arrow(length = unit(0.07, "inch")), size = 0.4,
    color = "gray20", curvature = -0.3
  ) +
  coord_flip() +
  lims(y = c(0, 100)) +
  annotate("text",
    x = 4.65, y = 60, size = 2.8,
    label = paste("With less than 2 years of experience\n and a BSc degree \n you can still apply for", max_vacancies, "job vacancies")
  ) +
  scale_color_brewer(type = "qual", palette = "Set1") +
  scale_size(guide = "none") +
  labs(
    x = "Years of Experience needed",
    y = "Number of job vacancies",
    color = "Minimum education qualification"
  ) +
  theme(legend.position = "bottom") +
  guides(color = guide_legend(nrow = 2, title.position = "top"))
```

## Software Skills needed for each Job Category

```{r,fig.height=14}
# Radar-style plot: one polar panel per job category, one spoke per skill.

# Indicator total per (job category, skill); pairs with a zero total are
# dropped so that the log is finite.
job_skill_data <- DStidy_2020 %>%
  select(R:Bahasa_Malaysia, Job_Category) %>%
  filter(Job_Category != "Unimportant") %>%
  pivot_longer(R:Bahasa_Malaysia, names_to = "Name", values_to = "Value") %>%
  mutate(Value = as.numeric(levels(Value))[Value]) %>%
  group_by(Job_Category, Name) %>%
  summarize(Total = sum(Value)) %>%
  ungroup() %>%
  filter(Total > 0) %>%
  mutate(logTotal = log(Total))

# Software skills that occur in exactly three job categories.
# NOTE(review): 3 is presumably the number of categories left after dropping
# "Unimportant" -- confirm, or derive it from the data.
common_skills <- job_skill_data %>%
  count(Name) %>%
  filter(n == 3 & !(Name %in% not_software_columns)) %>%
  .$Name

# Map each skill to its position in common_skills. Passing the vector as
# `levels` (not `labels`) ties the numeric code to that position directly, so
# the axis labels below cannot drift if factor()'s locale-dependent default
# level order differs from the order returned by count().
plot_data <- job_skill_data %>%
  filter(Name %in% common_skills) %>%
  mutate(Name = as.numeric(factor(Name, levels = common_skills)))

plot_data %>%
  ggplot(aes(
    x = Name, y = logTotal,
    fill = Job_Category, color = Job_Category
  )) +
  geom_area(size = 0, position = position_dodge(width = 0.9), alpha = 0.1) +
  geom_point(size = 0.5) +
  geom_segment(aes(xend = Name, yend = logTotal, alpha = logTotal),
    y = 0, size = 1.25
  ) +
  # seq_along() stays empty when no skill qualifies; 1:length() would be c(1, 0).
  scale_x_continuous(
    labels = common_skills,
    breaks = seq_along(common_skills)
  ) +
  theme(
    axis.text.y = element_blank(),
    legend.position = "none"
  ) +
  labs(x = NULL, y = NULL) +
  scale_fill_brewer(palette = "Set1", type = "qual") +
  coord_polar() +
  facet_wrap(~Job_Category, ncol = 1)
```