From aa0693a77fbf4d012225e31124c98a310bd5b270 Mon Sep 17 00:00:00 2001
From: Richard Torkar <richard.torkar@gmail.com>
Date: Tue, 30 Mar 2021 12:24:36 +0200
Subject: [PATCH 1/4] Update README.md

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 466779a..f275033 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,4 @@
+[![DOI](https://zenodo.org/badge/234703541.svg)](https://zenodo.org/badge/latestdoi/234703541)
 # Feature selection in requirements engineering
 
 ## Docker

From 539cfdb1eae1ad84d36db660ea0778cf53de4b3d Mon Sep 17 00:00:00 2001
From: Richard Torkar <richard.torkar@gmail.com>
Date: Thu, 8 Apr 2021 09:08:41 +0200
Subject: [PATCH 2/4] Minor changes.

---
 docs/index.Rmd | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/index.Rmd b/docs/index.Rmd
index 9f35b00..4da166f 100644
--- a/docs/index.Rmd
+++ b/docs/index.Rmd
@@ -800,7 +800,7 @@ d2 %>%
   row_spec(c(1:3,5:6,8,9,13:14,16:17,19:20), bold = TRUE)
 ```
 
-There are two things to note. First, due to us modeling category-specific effects (right table) we see that we have several estimates of interest in `arch`, which is not even a significant parameter in $\mathcal{M}_4$, whcih is lacking category-specific effects. Second, for all category-specific effects we receive a much more fine-grained view of precisely which categories in each predictor are making a difference. 
+There are two things to note. First, due to us modeling category-specific effects (right table) we see that we have several estimates of interest in `arch`, which is not even a significant parameter in $\mathcal{M}_4$, which is lacking category-specific effects. Second, for all category-specific effects we receive a much more fine-grained view of precisely which categories in each predictor are making a difference. 
 
 To start with, let's focus on the right-hand side table consisting of estimates from $\mathcal{M}$, and turn our attention to the other parameters. The `prio_s` parameter, `r round(fixef(M)[6,1], 2)`, would then become `r round(inv_logit_scaled(fixef(M)[6,1]), 2)` when transformed with the inverse logit, i.e., 
 

From 1a2066d665ca1d7188a85f4336f6820cd04a787a Mon Sep 17 00:00:00 2001
From: Richard Torkar <richard.torkar@gmail.com>
Date: Wed, 28 Apr 2021 13:41:02 +0200
Subject: [PATCH 3/4] Update README.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index f275033..e3751b6 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-[![DOI](https://zenodo.org/badge/234703541.svg)](https://zenodo.org/badge/latestdoi/234703541)
+[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.4646845.svg)](https://doi.org/10.5281/zenodo.4646845)
 # Feature selection in requirements engineering
 
 ## Docker

From a2eae9f500dcda0bdf0a4cb8f20310624db3c07d Mon Sep 17 00:00:00 2001
From: Richard Torkar <richard.torkar@gmail.com>
Date: Thu, 28 Oct 2021 10:36:58 +0200
Subject: [PATCH 4/4] - Cleanup - Fixed plots

---
 docs/index.Rmd | 69 ++++++++++++++++++++++++++++++++------------------
 1 file changed, 44 insertions(+), 25 deletions(-)

diff --git a/docs/index.Rmd b/docs/index.Rmd
index 4da166f..b0c86ed 100644
--- a/docs/index.Rmd
+++ b/docs/index.Rmd
@@ -156,6 +156,8 @@ First, we see that for the outcome `State` approximately as many features are re
 ```{r echo=FALSE}
 ggplot(d, aes(x=as.factor(State))) + 
   geom_bar() +
+  stat_count(aes(label=..count..), vjust=-1, 
+                          geom="text", position="identity") +
   xlab("") + 
   ylab("Num. features") +
   ggtitle("Predictor: State") +
@@ -192,12 +194,16 @@ Concerning `Business.value` and `Customer.value` they are fairly similar in thei
 ```{r echo=FALSE}
 ggplot(d, aes(x=Business.value)) + 
   geom_bar() +
+  stat_count(aes(label=..count..), vjust=-1, 
+                          geom="text", position="identity") +
   xlab("") + 
   ylab("Num. features") +
   ggtitle("Predictor: Business value") + 
   theme_tufte() + theme(text = element_text(size = 22))
 
 ggplot(d, aes(x=Customer.value)) + 
+  stat_count(aes(label=..count..), vjust=-1, 
+                          geom="text", position="identity") +
   geom_bar() +
   xlab("") + 
   ylab("Num. features") +
@@ -210,8 +216,10 @@ For `Stakeholder` and `Key.customers` we see a strong emphasis on lower numbers,
 ```{r echo=FALSE}
 ggplot(d, aes(x=as.factor(Stakeholders))) +
   geom_bar() +
+  stat_count(aes(label=..count..), vjust=-1, 
+                          geom="text", position="identity") +
   xlab("Num. stakeholders") +
-  ylab("Frequency") +
+  ylab("") +
   ggtitle("Predictor: Stakeholders") +
   theme_tufte() + theme(text = element_text(size = 22))
 
@@ -235,6 +243,8 @@ Finally, for `Architects.involvement` one can see that in the absolute majority
 ```{r echo=FALSE}
 ggplot(d, aes(x=Architects.involvement)) + 
   geom_bar() +
+  stat_count(aes(label=..count..), vjust=-1, 
+                          geom="text", position="identity") +
   xlab("") + 
   ylab("Num. features") +
   ggtitle("Predictor: Architects' involvement") +
@@ -443,16 +453,16 @@ M1 <- brm(State ~ 1 + prio_s + crit + b_val + c_val + sh_s + kc_s + dep + arch,
 
 ```{r M1-diagnostics, echo=TRUE}
 # Check divergences, tree depth, energy
-rstan::check_hmc_diagnostics(eval(M0)$fit)
+rstan::check_hmc_diagnostics(eval(M1)$fit)
 
 # Check rhat and ESS
-if(max(rhat(eval(M0)), na.rm=T) >= 1.01) {
+if(max(rhat(eval(M1)), na.rm=T) >= 1.01) {
   print("Warning: Rhat >=1.01")
 } else {
   print("All Rhat <1.01")
 }
 
-if(min(neff_ratio(eval(M0)), na.rm=T) <= 0.2) {
+if(min(neff_ratio(eval(M1)), na.rm=T) <= 0.2) {
   print("Warning: ESS <=0.2")
 } else {
   print("All ESS >0.2")
@@ -518,16 +528,16 @@ M2 <- brm(State ~ 1 + prio_s + crit + mo(b_val) + mo(c_val) + sh_s + kc_s +
 
 ```{r M2-diagnostics, echo=TRUE}
 # Check divergences, tree depth, energy
-rstan::check_hmc_diagnostics(eval(M0)$fit)
+rstan::check_hmc_diagnostics(eval(M2)$fit)
 
 # Check rhat and ESS
-if(max(rhat(eval(M0)), na.rm=T) >= 1.01) {
+if(max(rhat(eval(M2)), na.rm=T) >= 1.01) {
   print("Warning: Rhat >=1.01")
 } else {
   print("All Rhat <1.01")
 }
 
-if(min(neff_ratio(eval(M0)), na.rm=T) <= 0.2) {
+if(min(neff_ratio(eval(M2)), na.rm=T) <= 0.2) {
   print("Warning: ESS <=0.2")
 } else {
   print("All ESS >0.2")
@@ -592,13 +602,13 @@ M3 <- brm(State ~ 1 + prio_s + crit + b_val + c_val + sh_s + kc_s + dep + arch,
 rstan::check_hmc_diagnostics(eval(M0)$fit)
 
 # Check rhat and ESS
-if(max(rhat(eval(M0)), na.rm=T) >= 1.01) {
+if(max(rhat(eval(M3)), na.rm=T) >= 1.01) {
   print("Warning: Rhat >=1.01")
 } else {
   print("All Rhat <1.01")
 }
 
-if(min(neff_ratio(eval(M0)), na.rm=T) <= 0.2) {
+if(min(neff_ratio(eval(M3)), na.rm=T) <= 0.2) {
   print("Warning: ESS <=0.2")
 } else {
   print("All ESS >0.2")
@@ -662,16 +672,16 @@ M4 <- brm(State ~ 1 + prio_s + crit + b_val + c_val + sh_s + kc_s + dep + arch,
 
 ```{r M4-diagnostics, echo=TRUE}
 # Check divergences, tree depth, energy
-rstan::check_hmc_diagnostics(eval(M0)$fit)
+rstan::check_hmc_diagnostics(eval(M4)$fit)
 
 # Check rhat and ESS
-if(max(rhat(eval(M0)), na.rm=T) >= 1.01) {
+if(max(rhat(eval(M4)), na.rm=T) >= 1.01) {
   print("Warning: Rhat >=1.01")
 } else {
   print("All Rhat <1.01")
 }
 
-if(min(neff_ratio(eval(M0)), na.rm=T) <= 0.2) {
+if(min(neff_ratio(eval(M4)), na.rm=T) <= 0.2) {
   print("Warning: ESS <=0.2")
 } else {
   print("All ESS >0.2")
@@ -734,16 +744,16 @@ M5 <- brm(State ~ 1 + prio_s + crit + cs(b_val) + cs(c_val) + sh_s +
 
 ```{r M5-diagnostics, echo=TRUE}
 # Check divergences, tree depth, energy
-rstan::check_hmc_diagnostics(eval(M0)$fit)
+rstan::check_hmc_diagnostics(eval(M5)$fit)
 
 # Check rhat and ESS
-if(max(rhat(eval(M0)), na.rm=T) >= 1.01) {
+if(max(rhat(eval(M5)), na.rm=T) >= 1.01) {
   print("Warning: Rhat >=1.01")
 } else {
   print("All Rhat <1.01")
 }
 
-if(min(neff_ratio(eval(M0)), na.rm=T) <= 0.2) {
+if(min(neff_ratio(eval(M5)), na.rm=T) <= 0.2) {
   print("Warning: ESS <=0.2")
 } else {
   print("All ESS >0.2")
@@ -872,37 +882,46 @@ One important question we would like to have an answer to is which independent v
 
 ```{r, echo = FALSE}
 plot(ce, plot = FALSE)[[1]] + 
-  scale_fill_colorblind() + 
+  scale_fill_colorblind(labels = c(seq(1,6))) + 
+  scale_color_colorblind(labels = c(seq(1,6))) +
   xlab("Priority (scaled)") + 
-  theme_tufte()
+  theme_tufte(base_size = 18) + 
+  theme(legend.position = "none")
 ```
 
 Concerning `Priority` we see that it has a very large effect for State $6$ (i.e., a feature being released). The higher the priority (the more to the right) the more probability mass is set on State $6$. In the end it has close to $70$% of the probability mass, while the other states are not even close. Also worth noting is how, for State $4$ (the hump), medium priorities seem to be the recipe for reaching this stage. 
 
 ```{r, echo = FALSE}
 plot(ce, plot = FALSE)[[2]] + 
-  scale_fill_colorblind() + 
-  scale_x_continuous(name = "Critical", breaks=c(0,1), labels = c("No", "Yes")) +
-  theme_tufte()
+  scale_fill_colorblind(labels = c(seq(1,6))) + 
+  scale_color_colorblind(labels = c(seq(1,6))) + 
+  scale_x_continuous(name = "Critical", breaks=c(0,1), 
+                     labels = c("No", "Yes")) +
+  theme_tufte(base_size = 18) + ylab("")
 ```
 
 For the predictor `Critical` we see some of the same effects, albeit the uncertainty increases. The clearest effect is visible for State $6$, i.e., going from No to Yes significantly increases the probability, while the opposite holds for States $1$--$3$ (logically so, since if it is critical then a requirement should be released with a higher probability, i.e., Stage $6$).
 
 ```{r, echo = FALSE, warning=FALSE}
 plot(ce, plot = FALSE)[[3]] + 
-  scale_fill_colorblind() + 
+  scale_fill_colorblind(labels = c(seq(1,6))) + 
+  scale_color_colorblind(labels = c(seq(1,6))) +
   xlab("Number of stakeholders") +
-  theme_tufte() +
-  xlim(0, 15)
+  scale_x_continuous(breaks=c(0,2,4,6,8,10) ) +
+  coord_cartesian(xlim=c(0,10)) +
+  theme_tufte(base_size = 18) + 
+  theme(legend.position = "none")
 ```
 
 Concerning `Number of stakeholders`, we see that virtually all states (except State $1$) has a lower probability with increasing number of stakeholders (and more uncertainty is visible). For State $1$, however, an increase in stakeholders leads to an increase in probability. One could claim this is natural since having stakeholders would lead to the requirement being considered in the first place.
 
 ```{r, echo = FALSE}
 plot(ce, plot = FALSE)[[5]] + 
-  scale_fill_colorblind() + 
+  scale_fill_colorblind(labels = c(seq(1,6))) + 
+  scale_color_colorblind(labels = c(seq(1,6))) + 
   scale_x_continuous(name = "Dependency", breaks=c(0,1), labels = c("No", "Yes")) +
-  theme_tufte()
+  theme_tufte(base_size = 18) + 
+  theme(legend.position = "none")  + ylab("")
 ```