From 3b546f5a765e314811926b91df9a121a760e8f99 Mon Sep 17 00:00:00 2001
From: Brian Pentland
Date: Sun, 11 Nov 2018 20:27:36 -0500
Subject: [PATCH 01/31] brute force effort at grabbing levels
---
 Data Prep/Comparison_functions.R | 117 +++++++++++++++++++++++++++++++
 1 file changed, 117 insertions(+)
 create mode 100644 Data Prep/Comparison_functions.R

diff --git a/Data Prep/Comparison_functions.R b/Data Prep/Comparison_functions.R
new file mode 100644
index 0000000..ba6c6d0
--- /dev/null
+++ b/Data Prep/Comparison_functions.R
@@ -0,0 +1,117 @@
+##########################################################################################################
+# THREADNET: Batch processing for larger data sets
+
+# (c) 2017 Michigan State University. This software may be used according to the terms provided in the
+# GNU General Public License (GPL-3.0) https://opensource.org/licenses/GPL-3.0?
+# Absolutely no warranty!
+##########################################################################################################
+
+
+library(dplyr)
+
+get_date_subsets <-function(c) {
+b1 <<- new_occ_VR %>% filter(Clinic==eval(c) & as.Date(tStamp) > '2016-01-01' & as.Date(tStamp) < '2016-06-06')
+b2 <<- new_occ_VR %>% filter(Clinic==eval(c) & as.Date(tStamp) > '2016-06-08' & as.Date(tStamp) < '2016-09-01')
+b3 <<- new_occ_VR %>% filter(Clinic==eval(c) & as.Date(tStamp) > '2016-09-02' & as.Date(tStamp) < '2017-04-15')
+b4 <<- new_occ_VR %>% filter(Clinic==eval(c) & as.Date(tStamp) > '2017-04-15' & as.Date(tStamp) < '2017-09-01')
+b5 <<- new_occ_VR %>% filter(Clinic==eval(c) & as.Date(tStamp) > '2017-09-02' & as.Date(tStamp) < '2017-12-31')
+
+rc1 <<- new_occ_VR %>% filter(Clinic=='REDCK' & as.Date(tStamp) > '2016-01-01' & as.Date(tStamp) < '2016-06-06')
+rc2 <<- new_occ_VR %>% filter(Clinic=='REDCK' & as.Date(tStamp) > '2016-06-08' & as.Date(tStamp) < '2016-09-01')
+rc3 <<- new_occ_VR %>% filter(Clinic=='REDCK' & as.Date(tStamp) > '2016-09-02' & as.Date(tStamp) < '2017-04-15')
+rc4 <<- new_occ_VR %>% filter(Clinic=='REDCK' & as.Date(tStamp) > '2017-04-15' & as.Date(tStamp) < '2017-09-01')
+rc5 <<- new_occ_VR %>% filter(Clinic=='REDCK' & as.Date(tStamp) > '2017-09-02' & as.Date(tStamp) < '2017-12-31')
+
+smh1 <<- new_occ_VR %>% filter(Clinic=='SMH' & as.Date(tStamp) > '2016-01-01' & as.Date(tStamp) < '2016-06-06')
+smh2 <<- new_occ_VR %>% filter(Clinic=='SMH' & as.Date(tStamp) > '2016-06-08' & as.Date(tStamp) < '2016-09-01')
+smh3 <<- new_occ_VR %>% filter(Clinic=='SMH' & as.Date(tStamp) > '2016-09-02' & as.Date(tStamp) < '2017-04-15')
+smh4 <<- new_occ_VR %>% filter(Clinic=='SMH' & as.Date(tStamp) > '2017-04-15' & as.Date(tStamp) < '2017-09-01')
+smh5 <<- new_occ_VR %>% filter(Clinic=='SMH' & as.Date(tStamp) > '2017-09-02' & as.Date(tStamp) < '2017-12-31')
+
+hh1 <<- new_occ_VR %>% filter(Clinic=='HHPOB' & as.Date(tStamp) > '2016-01-01' & as.Date(tStamp) < '2016-06-06')
+hh2 <<- new_occ_VR %>% filter(Clinic=='HHPOB' & as.Date(tStamp) > '2016-06-08' & as.Date(tStamp) < '2016-09-01')
+hh3 <<- new_occ_VR %>% filter(Clinic=='HHPOB' & as.Date(tStamp) > '2016-09-02' & as.Date(tStamp) < '2017-04-15')
+hh4 <<- new_occ_VR %>% filter(Clinic=='HHPOB' & as.Date(tStamp) > '2017-04-15' & as.Date(tStamp) < '2017-09-01')
+hh5 <<- new_occ_VR %>% filter(Clinic=='HHPOB' & as.Date(tStamp) > '2017-09-02' & as.Date(tStamp) < '2017-12-31')
+
+}
+
+# c is for the clinic
+# m is the metric
+compare_periods <-function(c,m){
+
+  # b1 <- new_occ_VR %>% filter(Clinic==c & as.Date(tStamp) > '2016-01-01' & as.Date(tStamp) < '2016-06-06')
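+  # (A sketch of how the five hard-coded windows here and in get_date_subsets() could
+  #  be generated from one table of period boundaries; the list layout is an assumption,
+  #  and get_range() is defined at the end of this file:
+  #
+  #   periods <- list(c('2016-01-01','2016-06-06'), c('2016-06-08','2016-09-01'),
+  #                   c('2016-09-02','2017-04-15'), c('2017-04-15','2017-09-01'),
+  #                   c('2017-09-02','2017-12-31'))
+  #   b_list <- lapply(periods, function(p)
+  #     get_range(new_occ_VR %>% filter(Clinic == eval(c)), p[1], p[2]))
+  # )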
+  # b2 <- new_occ_VR %>% filter(Clinic==c & as.Date(tStamp) > '2016-06-08' & as.Date(tStamp) < '2016-09-01')
+  # b3 <- new_occ_VR %>% filter(Clinic==c & as.Date(tStamp) > '2016-09-02' & as.Date(tStamp) < '2017-04-15')
+  # b4 <- new_occ_VR %>% filter(Clinic==c & as.Date(tStamp) > '2017-04-15' & as.Date(tStamp) < '2017-09-01')
+  # b5 <- new_occ_VR %>% filter(Clinic==c & as.Date(tStamp) > '2017-09-02' & as.Date(tStamp) < '2017-12-31')
+
+
+  a1=routine_metric(b1,'Action',m )
+  a2=routine_metric(b2,'Action',m )
+  a3=routine_metric(b3,'Action',m )
+  a4=routine_metric(b4,'Action',m )
+  a5=routine_metric(b5,'Action',m )
+
+  r1=routine_metric(b1,'Role',m )
+  r2=routine_metric(b2,'Role',m )
+  r3=routine_metric(b3,'Role',m )
+  r4=routine_metric(b4,'Role',m )
+  r5=routine_metric(b5,'Role',m )
+
+  ws1=routine_metric(b1,'Workstation',m )
+  ws2=routine_metric(b2,'Workstation',m )
+  ws3=routine_metric(b3,'Workstation',m )
+  ws4=routine_metric(b4,'Workstation',m )
+  ws5=routine_metric(b5,'Workstation',m )
+
+  rc_a1=routine_metric(rc1,'Action',m )
+  rc_a2=routine_metric(rc2,'Action',m )
+  rc_a3=routine_metric(rc3,'Action',m )
+  rc_a4=routine_metric(rc4,'Action',m )
+  rc_a5=routine_metric(rc5,'Action',m )
+
+  smh_a1=routine_metric(smh1,'Action',m )
+  smh_a2=routine_metric(smh2,'Action',m )
+  smh_a3=routine_metric(smh3,'Action',m )
+  smh_a4=routine_metric(smh4,'Action',m )
+  smh_a5=routine_metric(smh5,'Action',m )
+
+  hh_a1=routine_metric(hh1,'Action',m )
+  hh_a2=routine_metric(hh2,'Action',m )
+  hh_a3=routine_metric(hh3,'Action',m )
+  hh_a4=routine_metric(hh4,'Action',m )
+  hh_a5=routine_metric(hh5,'Action',m )
+
+  lex=data.frame(c(a1,a2,a3,a4,a5),
+                 c(hh_a1,hh_a2,hh_a3,hh_a4,hh_a5),
+                 c(rc_a1,rc_a2,rc_a3,rc_a4,rc_a5),
+                 c(smh_a1,smh_a2,smh_a3,smh_a4,smh_a5)
+  )
+
+  ws1=ws1[ws1>0]/nrow(b1)
+  ws2=ws2[ws2>0]/nrow(b2)
+  ws3=ws3[ws3>0]/nrow(b3)
+
+}
+
+
+# o is the list of occurrences
+# m is the metric we want to see
+routine_metric <- function(o, cf, m){
+
+  switch(m,
+         len = nrow(o),
+         lex_size = length(unique(o[[cf]])),
+         first_order = table(o[[cf]]),
+         first_order_sort = sort( table(o[[cf]]), decreasing = T)
+
+  )
+
+}
+
+get_range <- function(o, start_date, end_date){
+
+  o %>% filter(as.Date(tStamp) > start_date & as.Date(tStamp) < end_date)
+
+}

From d5fa0dda420b6af7652de85d7a5aa10b90d213fc Mon Sep 17 00:00:00 2001
From: Brian Pentland
Date: Thu, 15 Nov 2018 12:51:03 -0500
Subject: [PATCH 02/31] Just trying to learn how this works...
---
 ParallelTest/ptest.R | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)
 create mode 100644 ParallelTest/ptest.R

diff --git a/ParallelTest/ptest.R b/ParallelTest/ptest.R
new file mode 100644
index 0000000..3957931
--- /dev/null
+++ b/ParallelTest/ptest.R
@@ -0,0 +1,26 @@
+##########################################################################################################
+# THREADNET: Batch processing for larger data sets
+
+# (c) 2017 Michigan State University. This software may be used according to the terms provided in the
+# GNU General Public License (GPL-3.0) https://opensource.org/licenses/GPL-3.0?
+# Absolutely no warranty!
+########################################################################################################## + +library(parallel) + +ptest <-function(o) { + x<<-0 + oo<<-o + + l=1:nrow(o) + # return( sapply(l,f1 ) ) + + new_o = cbind(o, sapply(l,f1 ) ) + return(new_o) +} + +f1 <- function(r){ + x<<-x+1 + return(as.character(oo[r,5])) +} + From a7e985ed0b9c4ef3b700b68dfd2cdcac8cc1187f Mon Sep 17 00:00:00 2001 From: Brian Pentland Date: Thu, 15 Nov 2018 15:38:03 -0500 Subject: [PATCH 03/31] First draft... --- Data Prep/ACHR_Batch_PreProcess.R | 541 ++++++++++++++++++++++++++++++ 1 file changed, 541 insertions(+) create mode 100644 Data Prep/ACHR_Batch_PreProcess.R diff --git a/Data Prep/ACHR_Batch_PreProcess.R b/Data Prep/ACHR_Batch_PreProcess.R new file mode 100644 index 0000000..b02ee13 --- /dev/null +++ b/Data Prep/ACHR_Batch_PreProcess.R @@ -0,0 +1,541 @@ +########################################################################################################## +# THREADNET: Batch processing for larger data sets +# +# ACHR_Batch_PreProcess +# +# (c) 2018 Michigan State University. This software may be used according to the terms provided in the +# GNU General Public License (GPL-3.0) https://opensource.org/licenses/GPL-3.0? +# Brian Pentland +# Absolutely no warranty! +########################################################################################################## + + +# Separate out the two basicfunctions +# 1) reading/cleaning the occurrences +# 2) Threading the data by adding ThreadNum and SeqNum to the threads + + + +# this function reads the raw data from URMC +# It returns the data frame and also saves it as an Rdata file with the same name as the csv +read_ACHR_data <- function(fname){ + + library(tidyr) + library(data.table) + library(dplyr) + library(ThreadNet) + library(ngram) + library(lubridate) + library(expss) + + + # This code is tailored for reading in new data from URMC, October 2018 + # Assumes there is a column called 'Timestamps" in column 13 and "Visit_ID" + # read the file into data frame + d <- fread( paste0(fname, '.csv') ) + + # Sort by visit and timestamp + d <- arrange(d,desc(Visit_ID,asc(Timestamps))) + + # move and rename the columns, as needed + # Mainly need to get the Timestamps into the first column and rename them tStamp + cn=colnames(d) + cnx = cn[c(13,1:12,14:15)] + d=setcolorder(d,cnx) + setnames(d,'Timestamps','tStamp') + setnames(d,'X','seqn') + + # This converts numbers to char and replaces spaces with underscore + d = cleanOccBatch(d) + + save(d, file=paste0(fname, '.rData')) + + return(d) + +} + +# clean up the raw occurrence data +# Remove blanks for n-gram functionality +cleanOccBatch <- function(fileRows){ + + # extract tStamp + tStamp <- fileRows$tStamp + + # First get rid of any underscores pre-existing in the data, then + # confirm all spaces are converted to underscores in non tStamp columns; set as factors + cleanedCF <- data.frame(lapply(fileRows[2:ncol(fileRows)], function(x){ gsub("_","",x) + gsub(" ","_",x)})) + + # bind tStamp back to cleaned data + complete <- cbind(tStamp,cleanedCF) + + + # Old code for forcing tStamp into a "YMD_HMS" format -- not needed for URMC EMR data + # complete$tStamp <- as.character(complete$tStamp) + # complete$tStamp <- parse_date_time(complete$tStamp, c("dmy HMS", "dmY HMS", "ymd HMS","dmy HM", "dmY HM", "ymd HM")) + + # add weekday and month + # complete$weekday <- as.factor(weekdays(as.Date(complete$tStamp))) + # complete$month <- as.factor(months(as.Date(complete$tStamp))) + + 
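  # (For example, an Action recorded as 'MR VN CHIEF COMPLAINT' would come back from
  #  this cleaning step as 'MR_VN_CHIEF_COMPLAINT' -- the underscore form that later
  #  code, e.g. compute_wait_time() in ACHR_Batch_Aggregate_Data.R, greps for. The raw
  #  spelling with spaces is an illustrative assumption.)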
return(complete) +} + + +# This function adds columns for the the thread, if necessary +# Then it sorts by thread and tStamp and adds thread/sequence numbers to each thread. +# It returns the threaded set of occurrences and also saves it as Rdata. +thread_occurrences <- function(o,thread_CF,fname='emr'){ + + # call the functions that clean up the occurrences + # Column names are hard coded from the UMRC file + # EVENT_CF = c('Action','Workstation','Role','Clinic','Diagnosis_Group') + EVENT_CF = c('Action','Role','Workstation') + TN = 'Visit_ID' + + + + ### Code for Visit_Role ### + + # need to add Visit_Role column and use it to create threads. Each Visit_Role needs a unique threadNum and seqNum + # This is likely to be tricky + new_occ_VR = unite(new_occ, 'Visit_Role', c('Visit_ID','Role'),sep='_',remove = 'false') + + # add two columns to the data frame + new_occ_VR$threadNum = integer(nrow(new_occ_VR)) + new_occ_VR$seqNum = integer(nrow(new_occ_VR)) + new_occ_VR = new_occ_VR[order(new_occ_VR$Visit_Role,new_occ_VR$tStamp),] + tn<<-0 + + # new_occ_VR = new_occ_VR[1:1000,] + + pov_list = unique(new_occ_VR$Visit_Role) + + + start_row=1 + thrd=1 + for (p in pov_list){ + + # get the length of the thread + tlen = sum(new_occ_VR$Visit_Role==p) + + # guard against error + if (length(tlen)==0) tlen=0 + if (tlen>0){ + + #compute the index of the end row + end_row = start_row+tlen-1 + + # they all get the same thread number and incrementing seqNum + new_occ_VR[start_row:end_row, "threadNum"] <- as.matrix(rep(as.integer(thrd),tlen)) + new_occ_VR[start_row:end_row, "seqNum"] <- seq(tlen) + + # increment the counters for the next thread + start_row = end_row + 1 + thrd=thrd+1 + } + + } + + # # split occ data frame by threadNum to find earliest time value for that thread + # # then substract that from initiated relativeTime from above + # occ_split = lapply(split(occ, occ$threadNum), + # function(x) {x$relativeTime = difftime(x$relativeTime, min(lubridate::ymd_hms(x$tStamp)), units=timescale ); x}) + # + # # # row bind data frame back together + # occ= data.frame(do.call(rbind, occ_split)) + + save(new_occ_VR, file='auditfinal_VR_10292018.rData') + + + + # ThreadNet code assumes these columns will be there, so we need to add them + # do not use these for visit_role + # occ$threadNum = occ$Visit_ID + # occ$seqNum = occ$seqn + + + # because the occurrences are already sorted, and they have sequence numbers, we should be good to go. + # new_occ = combineContextFactors(occ,EVENT_CF,newColName(EVENT_CF))[1:100000,] + new_occ = combineContextFactors(occ,EVENT_CF,newColName(EVENT_CF)) + new_occ = combineContextFactors(new_occ,EVENT_CF[1:2],newColName(EVENT_CF[1:2])) + + # get the date only from the timestamp + new_occ$ymd <- format(as.POSIXct(new_occ$tStamp),"%Y-%m-%d") + + # Rename one of the clinics so it doesn't have an underscore. 
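  # (The underscore matters because '_' is the separator used when columns are pasted
  #  together -- e.g. the unite(..., sep='_') calls below -- so 'HH_POB' would split
  #  back into three tokens instead of two when a combined column is parsed apart.)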
+ new_occ$Clinic = gsub('HH_POB','HHPOB',new_occ$Clinic) + + # make new columns for clinic + day and events + new_occ = unite(new_occ, 'Clinic_ymd', c('Clinic','ymd'),sep='_',remove = 'false') + + + + # save this intermediate result for later use + save(new_occ, file='auditfinal_10022018.rData') + + + # RUN THESE TO BOIL/SAVE THE COMPLEXITY DATA + clinic_days = ACHR_batch_clinic_days(new_occ, TN, EVENT_CF) + save(clinic_days, file='ACHRV3_clinic_days.rData') + + visits = ACHR_batch_visits(new_occ, TN, EVENT_CF) + save(visits, file='ACHRV3_visits_with_timestamps.rData') + +} + + + + +################################################################### +################################################################### +ACHR_batch_clinic_days <- function(occ,TN, CFs) { + + # Name for column that has events -- three variations + DV1= newColName(CFs[1]) + DV2= newColName(CFs[1:2]) + DV3= newColName(CFs[1:3]) + + print(DV1) + print(DV2) + print(DV3) + + # name for column that defines the buckets + BU = 'Clinic_ymd' + + # first get the date only +# occ$ymd <- format(as.POSIXct(occ$tStamp),"%Y-%m-%d") + + # make new columns as needed for clinic + day and events +# occ = unite(occ, 'Clinic_ymd', c('Clinic','ymd'),sep='_',remove = 'false') + + + + # pick subsets -- one visit at a time in this version, but could be more + bucket_list <- unique(occ[['Clinic_ymd']]) + + # get the size (number of buckets) + N = length(bucket_list) + print(N) + + # pre-allocate the data.table. Tables are supposed to be faster. + ACHR = data.table(bucket=integer(N), + Clinic_date = character(N), + YMD_date = character(N), + NEvents = integer(N), + NumVisits = numeric(N), + A_NetComplexity=double(N), + AR_NetComplexity=double(N), + ARW_NetComplexity=double(N), + A_Nodes=double(N), + AR_Nodes=double(N), + ARW_Nodes=double(N), + A_Edges=double(N), + AR_Edges=double(N), + ARW_Edges=double(N), + A_CompressRatio = double(N), + AR_CompressRatio = double(N), + ARW_CompressRatio = double(N), + A_Entropy = double(N), + AR_Entropy = double(N), + ARW_Entropy = double(N), + Clinic = character(N), + NumUniqueProcedures = numeric(N), + NumUniqueDiagnosisGroups = numeric(N), + NumPhysicians = numeric(N), + Weekday = character(N), + Month = character(N) + ) + + # Now add columns for the IVs. There will be three for each IV + + # Add the IV columns + for (cf in CFs){ + + ACHR[, paste0(cf,"_count"):= double(N)] + ACHR[, paste0(cf,"_compression"):= double(N)] + ACHR[, paste0(cf,"_entropy"):= double(N)] + + } + + # loop through the buckets. 
Result will be data frame with one row per bucket + for (i in 1:N){ + + b = i # as.integer(bucket_list[i]) + + # print once every 10 buckets + if (b%%10==0) {print(b)} + + # select the threads that go in this bucket + df = occ[occ[[BU]] ==bucket_list[i],] + + # bucket number + ACHR[b,bucket := b] + + # length of the thread (number of rows) + ACHR[b,NEvents := nrow(df)] + + # only do the computations if there are more than two occurrences + if (nrow(df) > 2) { + + # compute the duration of the visit in hours + # ACHR[b,VisitDuration := difftime(max(lubridate::ymd_hms(df$tStamp)), min(lubridate::ymd_hms(df$tStamp)), units='hours') ] + + # compressibility of DV + ACHR[b,A_CompressRatio := compression_index(df,DV1)] + ACHR[b,AR_CompressRatio := compression_index(df,DV2)] + ACHR[b,ARW_CompressRatio := compression_index(df,DV3)] + + # NetComplexity of DV + # First get the network + n = threads_to_network_original(df,TN, DV1) + ACHR[b,A_NetComplexity := estimate_network_complexity( n )] + ACHR[b,A_Nodes := nrow(n$nodeDF) ] + ACHR[b,A_Edges := nrow(n$edgeDF) ] + + n = threads_to_network_original(df,TN, DV2) + ACHR[b,AR_NetComplexity := estimate_network_complexity( n )] + ACHR[b,AR_Nodes := nrow(n$nodeDF) ] + ACHR[b,AR_Edges := nrow(n$edgeDF) ] + + n = threads_to_network_original(df,TN, DV3) + ACHR[b,ARW_NetComplexity := estimate_network_complexity( n )] + ACHR[b,ARW_Nodes := nrow(n$nodeDF) ] + ACHR[b,ARW_Edges := nrow(n$edgeDF) ] + + # get the entropy for AR and ARW + ACHR[b, AR_Entropy := compute_entropy(table(df[[DV2]])[table(df[[DV2]])>0]) ] + ACHR[b, ARW_Entropy := compute_entropy(table(df[[DV3]])[table(df[[DV3]])>0]) ] + + + # compute stuff on each context factor + for (cf in CFs){ + + # Count the unique elements in each cf + ACHR[b, paste0(cf,"_count") := length(unique(df[[cf]])) ] + + # get the compression + ACHR[b, paste0(cf,"_compression") := compression_index(df,cf) ] + + # get the entropy + ACHR[b, paste0(cf,"_entropy") := compute_entropy(table(df[[cf]])[table(df[[cf]])>0]) ] + + } + } # kf nrows > 2 + + # Now copy in the rest of data + # this works because one visit is one bucket + # count the number of different diagnoses --> typical indicators of complexity + + + ACHR[b,'Clinic_date' := df[1,'Clinic_ymd']] + ACHR[b,'YMD_date' := df[1,'ymd']] + ACHR[b,'NumVisits' := length(unique(df[['Visit_ID']]))] + ACHR[b,'Clinic' := df[1,'Clinic']] + ACHR[b,'NumUniqueProcedures' := length(unique(df[['Proc']]))] + ACHR[b,'NumUniqueDiagnosisGroups' := length(unique(df[['Diagnosis_Group']]))] + ACHR[b,'NumPhysicians' := length(unique(df[['Physician']]))] + ACHR[b,'Weekday' := df[1,'Weekday']] + ACHR[b,'Month' := df[1,'Month']] + + + + } # loop thru buckets + + # copy this one for consistency + ACHR[,'A_Entropy' := ACHR[,'Action_entropy'] ] + + # return the table + return(ACHR) +} + +################################################################### +################################################################### + + + +# ACHR stands for Antecedents of Complexity in Healthcare Routines. +# This is function is set up to compute process parameters on thousands of patient visits. +ACHR_batch_visits <- function(occ,TN, CFs) { + + DV1= newColName(CFs[1]) + DV2= newColName(CFs[1:2]) + DV3= newColName(CFs[1:3]) + + print(DV1) + print(DV2) + print(DV3) + +# pick subsets -- one visit at a time in this version, but could be more +bucket_list <- make_buckets_1(occ, 'Visit_ID') + +# get the size (number of buckets) +N = length(bucket_list) +print(bucket_list) + + +# pre-allocate the data.table. 
Tables are supposed to be faster. +ACHR = data.table(bucket=integer(N), + NEvents = integer(N), + VisitStart= character(N), + VisitStartInt = integer(N), + VisitDuration=double(N), + A_NetComplexity=double(N), + AR_NetComplexity=double(N), + ARW_NetComplexity=double(N), + A_Nodes=double(N), + AR_Nodes=double(N), + ARW_Nodes=double(N), + A_Edges=double(N), + AR_Edges=double(N), + ARW_Edges=double(N), + A_CompressRatio = double(N), + AR_CompressRatio = double(N), + ARW_CompressRatio = double(N), + A_Entropy = double(N), + AR_Entropy = double(N), + ARW_Entropy = double(N), + Visit_ID = character(N), + Subject_ID = character(N), + Clinic = character(N), + Procedure = character(N), + Diagnosis = character(N), # diag in the raw data + Diagnosis_group = character(N), + Physician = character(N), + Weekday = character(N), + Month = character(N) + ) + +# Now add columns for the IVs. There will be three for each IV + +# Add the IV columns +for (cf in CFs){ + + ACHR[, paste0(cf,"_count"):= double(N)] + ACHR[, paste0(cf,"_compression"):= double(N)] + ACHR[, paste0(cf,"_entropy"):= double(N)] + +} + +# loop through the buckets. Result will be data frame with one row per bucket +for (i in 1:N){ + + b = i # as.integer(bucket_list[i]) + + # print once every 100 visits + if (b%%100==0) {print(b)} + + # select the threads that go in this bucket + df = occ[occ[[TN]] ==bucket_list[i],] + + # bucket number + ACHR[b,bucket := b] + + # length of the thread (number of rows) + ACHR[b,NEvents := nrow(df)] + + # only do the computations if there are more than two occurrences + if (nrow(df) > 2) { + + # compute the duration of the visit in hours + ACHR[b,VisitDuration := difftime(max(lubridate::ymd_hms(df$tStamp)), min(lubridate::ymd_hms(df$tStamp)), units='hours') ] + + # compressibility of DV + ACHR[b,A_CompressRatio := compression_index(df,DV1)] + ACHR[b,AR_CompressRatio := compression_index(df,DV2)] + ACHR[b,ARW_CompressRatio := compression_index(df,DV3)] + + # NetComplexity of DV + # First get the network + # NetComplexity of DV + # First get the network + n = threads_to_network_original(df,TN, DV1) + ACHR[b,A_NetComplexity := estimate_network_complexity( n )] + ACHR[b,A_Nodes := nrow(n$nodeDF) ] + ACHR[b,A_Edges := nrow(n$edgeDF) ] + + n = threads_to_network_original(df,TN, DV2) + ACHR[b,AR_NetComplexity := estimate_network_complexity( n )] + ACHR[b,AR_Nodes := nrow(n$nodeDF) ] + ACHR[b,AR_Edges := nrow(n$edgeDF) ] + + n = threads_to_network_original(df,TN, DV3) + ACHR[b,ARW_NetComplexity := estimate_network_complexity( n )] + ACHR[b,ARW_Nodes := nrow(n$nodeDF) ] + ACHR[b,ARW_Edges := nrow(n$edgeDF) ] + + # get the entropy for AR and ARW + ACHR[b, AR_Entropy := compute_entropy(table(df[[DV2]])[table(df[[DV2]])>0]) ] + ACHR[b, ARW_Entropy := compute_entropy(table(df[[DV3]])[table(df[[DV3]])>0]) ] + + + # compute stuff on each context factor + for (cf in CFs){ + + # Count the unique elements in each cf + ACHR[b, paste0(cf,"_count") := length(unique(df[[cf]])) ] + + # get the compression + ACHR[b, paste0(cf,"_compression") := compression_index(df,cf) ] + + # get the entropy + ACHR[b, paste0(cf,"_entropy") := compute_entropy(table(df[[cf]])[table(df[[cf]])>0]) ] + + } +} # kf nrows > 2 + + # Now copy in the rest of data + # this only works because one visit is one bucket + + # ACHR[b,'VisitStart' := as.POSIXct( df[1,'tStamp']) ] + ACHR[b,'VisitStartInt' := df[1,'tStamp'] ] + + ACHR[b,'VisitStart' := as.character( df[1,'tStamp'] ) ] + + ACHR[b,'Visit_ID' := df[1,'Visit_ID']] + ACHR[b,'Subject_ID' := 
df[1,'Subject_ID']] + ACHR[b,'Clinic' := df[1,'Clinic']] + ACHR[b,'Procedure' := df[1,'Proc']] + ACHR[b,'Diagnosis' := df[1,'Diag']] + ACHR[b,'Diagnosis_group' := df[1,'Diagnosis_Group']] + ACHR[b,'Physician' := df[1,'Physician']] + ACHR[b,'Weekday' := df[1,'Weekday']] + ACHR[b,'Month' := df[1,'Month']] + + + +} # loop thru buckets + +# copy this one for consistency +ACHR[,'A_Entropy' := ACHR[,'Action_entropy'] ] + +# return the table +return(ACHR) +} + + +# Each bucket is a list of thread numbers that can be used to subset the list of occurrences +make_buckets_1 <- function(o, criteria){ + + return( unique(o[[criteria]]) ) + +} + + + + + +# +# make_box_plots <- function(){ +# ggboxplot(ACHR_test[NEvents>100 & Clinic=='DRH'], x = "VisitMonth", y = "NetComplexity", +# color = "VisitDay", +# ylab = "Complexity", xlab = "Month (DRH)") +# } +# + + + + + +get_timeScale <- function(){'hr'} + From 6f18e3c3328982b2cb8f32c312e42ef0b2c41df5 Mon Sep 17 00:00:00 2001 From: Brian Pentland Date: Thu, 15 Nov 2018 16:03:41 -0500 Subject: [PATCH 04/31] Making progress... --- Data Prep/ACHR_Batch_PreProcess.R | 59 ++++++++++++++++--------------- 1 file changed, 30 insertions(+), 29 deletions(-) diff --git a/Data Prep/ACHR_Batch_PreProcess.R b/Data Prep/ACHR_Batch_PreProcess.R index b02ee13..a721d94 100644 --- a/Data Prep/ACHR_Batch_PreProcess.R +++ b/Data Prep/ACHR_Batch_PreProcess.R @@ -81,36 +81,45 @@ cleanOccBatch <- function(fileRows){ return(complete) } - -# This function adds columns for the the thread, if necessary +######################################################################################### +# This function adds columns for the the thread, as requested # Then it sorts by thread and tStamp and adds thread/sequence numbers to each thread. +# +# Typical column names from the UMRC file +# ALL_CF = c('Action','Workstation','Role','Clinic','Diagnosis_Group') +# thread_CF = c('Visit_ID', 'Role', 'Workstation') +# event_CF = c('Action','Role','Workstation') +# TN = 'Visit_ID' # It returns the threaded set of occurrences and also saves it as Rdata. -thread_occurrences <- function(o,thread_CF,fname='emr'){ +thread_occurrences <- function(occ,THREAD_CF,EVENT_CF, fname='emr'){ - # call the functions that clean up the occurrences - # Column names are hard coded from the UMRC file - # EVENT_CF = c('Action','Workstation','Role','Clinic','Diagnosis_Group') - EVENT_CF = c('Action','Role','Workstation') - TN = 'Visit_ID' - - + # these will be new column names + new_thread_name = newColName(THREAD_CF) + new_event_name = newColName(EVENT_CF) - ### Code for Visit_Role ### + # Add names for the new columns, if necessary + if (!(new_thread_name %in% colnames(occ))) { occ = combineContextFactors(occ,THREAD_CF,new_thread_name) } + if (!(new_event_name %in% colnames(occ))) { occ = combineContextFactors(occ,EVENT_CF,new_event_name) } - # need to add Visit_Role column and use it to create threads. 
Each Visit_Role needs a unique threadNum and seqNum - # This is likely to be tricky - new_occ_VR = unite(new_occ, 'Visit_Role', c('Visit_ID','Role'),sep='_',remove = 'false') - # add two columns to the data frame + # ThreadNet code assumes these columns will be there, so we need to add them new_occ_VR$threadNum = integer(nrow(new_occ_VR)) new_occ_VR$seqNum = integer(nrow(new_occ_VR)) - new_occ_VR = new_occ_VR[order(new_occ_VR$Visit_Role,new_occ_VR$tStamp),] - tn<<-0 - # new_occ_VR = new_occ_VR[1:1000,] + # Special case for Visit_ID, which is already filled in URMC EMR data + if (THREAD_CF == 'VISIT_ID') { + occ$threadNum = occ$Visit_ID + occ$seqNum = occ$seqn + } + + # Now sort the data set by the new threadNum and tStamp + occ = occ[order(occ[[new_thread_name]],occ$tStamp),] - pov_list = unique(new_occ_VR$Visit_Role) - + tn<<-0 + + # get the list of unique identifies for the threads. The length of this list is the number of threads + pov_list = unique(occ[[new_thread_name]]) + print(paste('Number of threads in this POV: ', length(pov_list))) start_row=1 thrd=1 @@ -148,17 +157,9 @@ thread_occurrences <- function(o,thread_CF,fname='emr'){ save(new_occ_VR, file='auditfinal_VR_10292018.rData') - - # ThreadNet code assumes these columns will be there, so we need to add them - # do not use these for visit_role - # occ$threadNum = occ$Visit_ID - # occ$seqNum = occ$seqn + - # because the occurrences are already sorted, and they have sequence numbers, we should be good to go. - # new_occ = combineContextFactors(occ,EVENT_CF,newColName(EVENT_CF))[1:100000,] - new_occ = combineContextFactors(occ,EVENT_CF,newColName(EVENT_CF)) - new_occ = combineContextFactors(new_occ,EVENT_CF[1:2],newColName(EVENT_CF[1:2])) # get the date only from the timestamp new_occ$ymd <- format(as.POSIXct(new_occ$tStamp),"%Y-%m-%d") From 92f6d47fba49b00871a0cd0bb4f2a55be6638e76 Mon Sep 17 00:00:00 2001 From: Brian Pentland Date: Thu, 15 Nov 2018 17:39:42 -0500 Subject: [PATCH 05/31] Making progress... --- Data Prep/ACHR_Batch_PreProcess.R | 104 ++++++++++++++++++------------ 1 file changed, 62 insertions(+), 42 deletions(-) diff --git a/Data Prep/ACHR_Batch_PreProcess.R b/Data Prep/ACHR_Batch_PreProcess.R index a721d94..4f99de3 100644 --- a/Data Prep/ACHR_Batch_PreProcess.R +++ b/Data Prep/ACHR_Batch_PreProcess.R @@ -14,7 +14,25 @@ # 1) reading/cleaning the occurrences # 2) Threading the data by adding ThreadNum and SeqNum to the threads +redo_ACHR_data_from_scratch <- function(fname){ + # Pick you point of view, or multiple POV... + THREAD_CF = c('VISIT_ID') + EVENT_CF = c('Action','Role','Workstation') + + # first read the data + o = read_ACHR_data( fname ) + + # Thread occurrences for each POV + occ = thread_occurrences( o, THREAD_CF, EVENT_CF ,fname ) + + # Now aggregate by clinic-day for each POV + clinic_days = ACHR_batch_clinic_days(occ, THREAD_CF, EVENT_CF, fname) + + # Aggregate by visit for each POV + visits = ACHR_batch_visits(occ, THREAD_CF, EVENT_CF, fname) + +} # this function reads the raw data from URMC # It returns the data frame and also saves it as an Rdata file with the same name as the csv @@ -93,14 +111,18 @@ cleanOccBatch <- function(fileRows){ # It returns the threaded set of occurrences and also saves it as Rdata. 
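A hypothetical call, to make the revised interface concrete (the column names follow
the "Typical column names" comment above; the fname is illustrative):

   occ <- thread_occurrences(occ,
                             THREAD_CF = c('Visit_ID', 'Role'),
                             EVENT_CF  = c('Action', 'Role', 'Workstation'),
                             fname     = 'emr')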
thread_occurrences <- function(occ,THREAD_CF,EVENT_CF, fname='emr'){ + print(paste('Number of occurrences: ', nrow(occ))) + # these will be new column names - new_thread_name = newColName(THREAD_CF) - new_event_name = newColName(EVENT_CF) + new_thread_col = newColName(THREAD_CF) + new_event_col = newColName(EVENT_CF) - # Add names for the new columns, if necessary - if (!(new_thread_name %in% colnames(occ))) { occ = combineContextFactors(occ,THREAD_CF,new_thread_name) } - if (!(new_event_name %in% colnames(occ))) { occ = combineContextFactors(occ,EVENT_CF,new_event_name) } + print(paste('new_thread_col: ',new_thread_col)) + print(paste('new_event_col: ', new_event_col)) + # Add names for the new columns, if necessary + if (!(new_thread_col %in% colnames(occ))) { occ = combineContextFactors(occ,THREAD_CF,new_thread_col) } + if (!(new_event_col %in% colnames(occ))) { occ = combineContextFactors(occ,EVENT_CF,new_event_col) } # ThreadNet code assumes these columns will be there, so we need to add them new_occ_VR$threadNum = integer(nrow(new_occ_VR)) @@ -110,15 +132,21 @@ thread_occurrences <- function(occ,THREAD_CF,EVENT_CF, fname='emr'){ if (THREAD_CF == 'VISIT_ID') { occ$threadNum = occ$Visit_ID occ$seqNum = occ$seqn - } + + # say how many there are... + print(paste('Number of threads (visits) in this POV: ', length(unique( occ$Visit_ID )))) + + # Now sort the data set by the new threadNum and tStamp + occ = occ[order(occ[[new_thread_col]],occ$tStamp),] + + } + else { # Now sort the data set by the new threadNum and tStamp - occ = occ[order(occ[[new_thread_name]],occ$tStamp),] - - tn<<-0 + occ = occ[order(occ[[new_thread_col]],occ$tStamp),] # get the list of unique identifies for the threads. The length of this list is the number of threads - pov_list = unique(occ[[new_thread_name]]) + pov_list = unique(occ[[new_thread_col]]) print(paste('Number of threads in this POV: ', length(pov_list))) start_row=1 @@ -126,7 +154,7 @@ thread_occurrences <- function(occ,THREAD_CF,EVENT_CF, fname='emr'){ for (p in pov_list){ # get the length of the thread - tlen = sum(new_occ_VR$Visit_Role==p) + tlen = sum(occ[[new_thread_col]]==p) # guard against error if (length(tlen)==0) tlen=0 @@ -136,16 +164,36 @@ thread_occurrences <- function(occ,THREAD_CF,EVENT_CF, fname='emr'){ end_row = start_row+tlen-1 # they all get the same thread number and incrementing seqNum - new_occ_VR[start_row:end_row, "threadNum"] <- as.matrix(rep(as.integer(thrd),tlen)) - new_occ_VR[start_row:end_row, "seqNum"] <- seq(tlen) + occ[start_row:end_row, "threadNum"] <- as.matrix(rep(as.integer(thrd),tlen)) + occ[start_row:end_row, "seqNum"] <- seq(tlen) # increment the counters for the next thread start_row = end_row + 1 thrd=thrd+1 + + if (thrd %% 1000 ==0) print(paste0('Thread count = ',thrd)) + } - } + } + + ### This is URMC specific: For all POV combinations, add the clinic and clinic_day + + # get the date only from the timestamp + occ$ymd <- format(as.POSIXct(new_occ$tStamp),"%Y-%m-%d") + + # make new columns for clinic + day and events + occ = unite(occ, 'Clinic_ymd', c('Clinic','ymd'),sep='_',remove = 'false') + + + # Now save the result for later and return it, as well. 
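A note on the '+'-separated file name built below: with fname = 'emrt', THREAD_CF =
'Visit_ID' and EVENT_CF = c('Role','Action'), it would come out as
'emrt+Visit_ID+Role_Action.Rdata' -- the pattern of the .gitignore entries added in
the commits that follow (assuming newColName() joins its inputs with '_').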
+  save(occ, file=paste0(paste(fname,new_thread_col,new_event_col,sep='+'), '.Rdata'))
+  print(paste('Saved threaded occurrences: ', nrow(occ)))
+
+  return(occ)
+
+}
+
+  ############ extra stuff not used #############
   # # split occ data frame by threadNum to find earliest time value for that thread
   # # then substract that from initiated relativeTime from above
   # occ_split = lapply(split(occ, occ$threadNum),
   #                    function(x) {x$relativeTime = difftime(x$relativeTime, min(lubridate::ymd_hms(x$tStamp)), units=timescale ); x})
   #
   # # # row bind data frame back together
   # occ= data.frame(do.call(rbind, occ_split))

From da487911dac226b84120a225f7258831b99adc48 Mon Sep 17 00:00:00 2001
From: Brian Pentland
Date: Sun, 18 Nov 2018 22:05:02 -0500
Subject: [PATCH 06/31] Making progress
---
 .gitignore                        |   1 +
 Data Prep/.gitignore              |   1 +
 Data Prep/ACHR_Batch_PreProcess.R | 448 ++----------
 Data Prep/ACHR_Batch_V2.R         |   8 +-
 4 files changed, 52 insertions(+), 406 deletions(-)
 create mode 100644 Data Prep/.gitignore

diff --git a/.gitignore b/.gitignore
index dd0b92d..62fdb39 100644
--- a/.gitignore
+++ b/.gitignore
@@ -26,3 +26,4 @@ vignettes/*.pdf
 rsconnect/
 >>>>>>> be0331537859f5addf650269855aa0a9d49ca5d5
 .Rproj.user
+emrt+Visit_ID+Role_Action.Rdata
diff --git a/Data Prep/.gitignore b/Data Prep/.gitignore
new file mode 100644
index 0000000..8ccf702
--- /dev/null
+++ b/Data Prep/.gitignore
@@ -0,0 +1 @@
+ACHR_Batch_Aggregate_Data.R
diff --git a/Data Prep/ACHR_Batch_PreProcess.R b/Data Prep/ACHR_Batch_PreProcess.R
index 4f99de3..ef43e23 100644
--- a/Data Prep/ACHR_Batch_PreProcess.R
+++ b/Data Prep/ACHR_Batch_PreProcess.R
@@ -7,12 +7,13 @@
 # GNU General Public License (GPL-3.0) https://opensource.org/licenses/GPL-3.0?
 # Brian Pentland
 # Absolutely no warranty!
-##########################################################################################################
-
-
-# Separate out the two basicfunctions
+#
+# Separate out the two basic functions
 # 1) reading/cleaning the occurrences
 # 2) Threading the data by adding ThreadNum and SeqNum to the threads
+#
+##########################################################################################################
+

+redo_ACHR_data_from_scratch <- function(fname){
+
+  # Pick your point of view, or multiple POV...
+  THREAD_CF = c('Visit_ID')
+  EVENT_CF = c('Action','Role','Workstation')
+
+  # first read the data
+  o = read_ACHR_data( fname )
+
+  # Thread occurrences for each POV
+  occ = thread_occurrences( o, THREAD_CF, fname )
+
+  # Now aggregate by clinic-day for each POV
+  clinic_days = ACHR_batch_clinic_days(occ, THREAD_CF, EVENT_CF, fname)
+
+  # Aggregate by visit for each POV
+  visits = ACHR_batch_visits(occ, THREAD_CF, EVENT_CF, fname)
+
+}

 # this function reads the raw data from URMC
 # It returns the data frame and also saves it as an Rdata file with the same name as the csv
+# Tested Nov 16.
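A hypothetical call (the csv base name comes from the fread() call in ACHR_Batch_V2.R):

   d <- read_ACHR_data('auditfinal_10022018')   # reads auditfinal_10022018.csv, saves auditfinal_10022018.rData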
read_ACHR_data <- function(fname){ library(tidyr) @@ -44,29 +46,44 @@ read_ACHR_data <- function(fname){ library(ThreadNet) library(ngram) library(lubridate) - library(expss) + library(anytime) # This code is tailored for reading in new data from URMC, October 2018 # Assumes there is a column called 'Timestamps" in column 13 and "Visit_ID" # read the file into data frame - d <- fread( paste0(fname, '.csv') ) + d <- fread( paste0(fname, '.csv') ) # Sort by visit and timestamp - d <- arrange(d,desc(Visit_ID,asc(Timestamps))) + d <- arrange(d,desc(Visit_ID,asc(Timestamps))) + ################################################################################## + # This stuff is URMC specific + # # move and rename the columns, as needed # Mainly need to get the Timestamps into the first column and rename them tStamp cn=colnames(d) cnx = cn[c(13,1:12,14:15)] d=setcolorder(d,cnx) setnames(d,'Timestamps','tStamp') - setnames(d,'X','seqn') - + setnames(d,'V1','seqn') + # This converts numbers to char and replaces spaces with underscore d = cleanOccBatch(d) - save(d, file=paste0(fname, '.rData')) + ### This is URMC specific: Add the clinic and clinic_day and fix the code for the highland clinic + + # get the date only from the timestamp + d$ymd <- format( as.POSIXct(d$tStamp, format="%Y-%m-%d"), "%Y-%m-%d" ) + + # make new columns for clinic + day and events + d = unite(d, 'Clinic_ymd', c('Clinic','ymd'),sep='_',remove = 'false') + + # Fix the name of the Highland clinic + d$Clinic = gsub('HH_POB','HHPOB',d$Clinic) + + # Save the result + save(d, file=paste0(fname, '.rData')) return(d) @@ -81,17 +98,15 @@ cleanOccBatch <- function(fileRows){ # First get rid of any underscores pre-existing in the data, then # confirm all spaces are converted to underscores in non tStamp columns; set as factors - cleanedCF <- data.frame(lapply(fileRows[2:ncol(fileRows)], function(x){ gsub("_","",x) - gsub(" ","_",x)})) + cleanedCF <- data.frame(lapply(fileRows[2:ncol(fileRows)], function(x){ gsub(" ","_",x) }) ) # bind tStamp back to cleaned data complete <- cbind(tStamp,cleanedCF) - # Old code for forcing tStamp into a "YMD_HMS" format -- not needed for URMC EMR data - # complete$tStamp <- as.character(complete$tStamp) - # complete$tStamp <- parse_date_time(complete$tStamp, c("dmy HMS", "dmY HMS", "ymd HMS","dmy HM", "dmY HM", "ymd HM")) - + # complete$tStamp <- as.character(complete$tStamp) + # complete$tStamp <- parse_date_time(complete$tStamp, c("dmy HMS", "dmY HMS", "ymd HMS","dmy HM", "dmY HM", "ymd HM")) + # add weekday and month # complete$weekday <- as.factor(weekdays(as.Date(complete$tStamp))) # complete$month <- as.factor(months(as.Date(complete$tStamp))) @@ -100,47 +115,34 @@ cleanOccBatch <- function(fileRows){ } ######################################################################################### -# This function adds columns for the the thread, as requested +# This function adds columns for the the thread, as requested ( Tested Nov 16 ) # Then it sorts by thread and tStamp and adds thread/sequence numbers to each thread. # # Typical column names from the UMRC file # ALL_CF = c('Action','Workstation','Role','Clinic','Diagnosis_Group') # thread_CF = c('Visit_ID', 'Role', 'Workstation') # event_CF = c('Action','Role','Workstation') -# TN = 'Visit_ID' + # It returns the threaded set of occurrences and also saves it as Rdata. 
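A small illustration of what the threading below adds, assuming newColName() joins
the THREAD_CF columns with '_' (the rows and values are made up):

   # tStamp               Visit_ID  Role  Visit_ID_Role  threadNum  seqNum
   # 2016-03-01 08:01:00  V001      RN    V001_RN        1          1
   # 2016-03-01 08:05:00  V001      RN    V001_RN        1          2
   # 2016-03-01 08:03:00  V001      MD    V001_MD        2          1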
-thread_occurrences <- function(occ,THREAD_CF,EVENT_CF, fname='emr'){ +thread_occurrences <- function(occ, THREAD_CF, fname='emr'){ print(paste('Number of occurrences: ', nrow(occ))) # these will be new column names new_thread_col = newColName(THREAD_CF) - new_event_col = newColName(EVENT_CF) + # new_event_col = newColName(EVENT_CF) print(paste('new_thread_col: ',new_thread_col)) - print(paste('new_event_col: ', new_event_col)) + # print(paste('new_event_col: ', new_event_col)) # Add names for the new columns, if necessary if (!(new_thread_col %in% colnames(occ))) { occ = combineContextFactors(occ,THREAD_CF,new_thread_col) } - if (!(new_event_col %in% colnames(occ))) { occ = combineContextFactors(occ,EVENT_CF,new_event_col) } +# if (!(new_event_col %in% colnames(occ))) { occ = combineContextFactors(occ,EVENT_CF,new_event_col) } # ThreadNet code assumes these columns will be there, so we need to add them - new_occ_VR$threadNum = integer(nrow(new_occ_VR)) - new_occ_VR$seqNum = integer(nrow(new_occ_VR)) + occ$threadNum = integer(nrow(occ)) + occ$seqNum = integer(nrow(occ)) - # Special case for Visit_ID, which is already filled in URMC EMR data - if (THREAD_CF == 'VISIT_ID') { - occ$threadNum = occ$Visit_ID - occ$seqNum = occ$seqn - - # say how many there are... - print(paste('Number of threads (visits) in this POV: ', length(unique( occ$Visit_ID )))) - - # Now sort the data set by the new threadNum and tStamp - occ = occ[order(occ[[new_thread_col]],occ$tStamp),] - - } - else { # Now sort the data set by the new threadNum and tStamp occ = occ[order(occ[[new_thread_col]],occ$tStamp),] @@ -175,23 +177,16 @@ thread_occurrences <- function(occ,THREAD_CF,EVENT_CF, fname='emr'){ } } - } - - ### This is URMC specific: For all POV combinations, add the clinic and clinic_day - - # get the date only from the timestamp - occ$ymd <- format(as.POSIXct(new_occ$tStamp),"%Y-%m-%d") - - # make new columns for clinic + day and events - occ = unite(occ, 'Clinic_ymd', c('Clinic','ymd'),sep='_',remove = 'false') - # Now save the result for later and return it, as well. 
- save(occ, file=paste0(paste(fname,new_thread_col,new_event_col,sep='+'), '.Rdata')) +# save(occ, file=paste0(paste(fname,new_thread_col,new_event_col,sep='+'), '.Rdata')) + save(occ, file=paste0(paste(fname,new_thread_col,sep='+'), '.Rdata')) + print(paste('Saved threaded occurrences: ', nrow(occ))) return(occ) - +} + ############ extra stuff not used ############# # # split occ data frame by threadNum to find earliest time value for that thread @@ -203,360 +198,7 @@ thread_occurrences <- function(occ,THREAD_CF,EVENT_CF, fname='emr'){ # occ= data.frame(do.call(rbind, occ_split)) -} - - - - -################################################################### -################################################################### -ACHR_batch_clinic_days <- function(occ,TN, CFs) { - - # Name for column that has events -- three variations - DV1= newColName(CFs[1]) - DV2= newColName(CFs[1:2]) - DV3= newColName(CFs[1:3]) - - print(DV1) - print(DV2) - print(DV3) - - # name for column that defines the buckets - BU = 'Clinic_ymd' - - # first get the date only -# occ$ymd <- format(as.POSIXct(occ$tStamp),"%Y-%m-%d") - - # make new columns as needed for clinic + day and events -# occ = unite(occ, 'Clinic_ymd', c('Clinic','ymd'),sep='_',remove = 'false') - - - - # pick subsets -- one visit at a time in this version, but could be more - bucket_list <- unique(occ[['Clinic_ymd']]) - - # get the size (number of buckets) - N = length(bucket_list) - print(N) - - # pre-allocate the data.table. Tables are supposed to be faster. - ACHR = data.table(bucket=integer(N), - Clinic_date = character(N), - YMD_date = character(N), - NEvents = integer(N), - NumVisits = numeric(N), - A_NetComplexity=double(N), - AR_NetComplexity=double(N), - ARW_NetComplexity=double(N), - A_Nodes=double(N), - AR_Nodes=double(N), - ARW_Nodes=double(N), - A_Edges=double(N), - AR_Edges=double(N), - ARW_Edges=double(N), - A_CompressRatio = double(N), - AR_CompressRatio = double(N), - ARW_CompressRatio = double(N), - A_Entropy = double(N), - AR_Entropy = double(N), - ARW_Entropy = double(N), - Clinic = character(N), - NumUniqueProcedures = numeric(N), - NumUniqueDiagnosisGroups = numeric(N), - NumPhysicians = numeric(N), - Weekday = character(N), - Month = character(N) - ) - - # Now add columns for the IVs. There will be three for each IV - - # Add the IV columns - for (cf in CFs){ - - ACHR[, paste0(cf,"_count"):= double(N)] - ACHR[, paste0(cf,"_compression"):= double(N)] - ACHR[, paste0(cf,"_entropy"):= double(N)] - - } - - # loop through the buckets. 
Result will be data frame with one row per bucket - for (i in 1:N){ - - b = i # as.integer(bucket_list[i]) - - # print once every 10 buckets - if (b%%10==0) {print(b)} - - # select the threads that go in this bucket - df = occ[occ[[BU]] ==bucket_list[i],] - - # bucket number - ACHR[b,bucket := b] - - # length of the thread (number of rows) - ACHR[b,NEvents := nrow(df)] - - # only do the computations if there are more than two occurrences - if (nrow(df) > 2) { - - # compute the duration of the visit in hours - # ACHR[b,VisitDuration := difftime(max(lubridate::ymd_hms(df$tStamp)), min(lubridate::ymd_hms(df$tStamp)), units='hours') ] - - # compressibility of DV - ACHR[b,A_CompressRatio := compression_index(df,DV1)] - ACHR[b,AR_CompressRatio := compression_index(df,DV2)] - ACHR[b,ARW_CompressRatio := compression_index(df,DV3)] - - # NetComplexity of DV - # First get the network - n = threads_to_network_original(df,TN, DV1) - ACHR[b,A_NetComplexity := estimate_network_complexity( n )] - ACHR[b,A_Nodes := nrow(n$nodeDF) ] - ACHR[b,A_Edges := nrow(n$edgeDF) ] - - n = threads_to_network_original(df,TN, DV2) - ACHR[b,AR_NetComplexity := estimate_network_complexity( n )] - ACHR[b,AR_Nodes := nrow(n$nodeDF) ] - ACHR[b,AR_Edges := nrow(n$edgeDF) ] - - n = threads_to_network_original(df,TN, DV3) - ACHR[b,ARW_NetComplexity := estimate_network_complexity( n )] - ACHR[b,ARW_Nodes := nrow(n$nodeDF) ] - ACHR[b,ARW_Edges := nrow(n$edgeDF) ] - - # get the entropy for AR and ARW - ACHR[b, AR_Entropy := compute_entropy(table(df[[DV2]])[table(df[[DV2]])>0]) ] - ACHR[b, ARW_Entropy := compute_entropy(table(df[[DV3]])[table(df[[DV3]])>0]) ] - - - # compute stuff on each context factor - for (cf in CFs){ - - # Count the unique elements in each cf - ACHR[b, paste0(cf,"_count") := length(unique(df[[cf]])) ] - - # get the compression - ACHR[b, paste0(cf,"_compression") := compression_index(df,cf) ] - - # get the entropy - ACHR[b, paste0(cf,"_entropy") := compute_entropy(table(df[[cf]])[table(df[[cf]])>0]) ] - - } - } # kf nrows > 2 - - # Now copy in the rest of data - # this works because one visit is one bucket - # count the number of different diagnoses --> typical indicators of complexity - - - ACHR[b,'Clinic_date' := df[1,'Clinic_ymd']] - ACHR[b,'YMD_date' := df[1,'ymd']] - ACHR[b,'NumVisits' := length(unique(df[['Visit_ID']]))] - ACHR[b,'Clinic' := df[1,'Clinic']] - ACHR[b,'NumUniqueProcedures' := length(unique(df[['Proc']]))] - ACHR[b,'NumUniqueDiagnosisGroups' := length(unique(df[['Diagnosis_Group']]))] - ACHR[b,'NumPhysicians' := length(unique(df[['Physician']]))] - ACHR[b,'Weekday' := df[1,'Weekday']] - ACHR[b,'Month' := df[1,'Month']] - - - - } # loop thru buckets - - # copy this one for consistency - ACHR[,'A_Entropy' := ACHR[,'Action_entropy'] ] - - # return the table - return(ACHR) -} - -################################################################### -################################################################### - - -# ACHR stands for Antecedents of Complexity in Healthcare Routines. -# This is function is set up to compute process parameters on thousands of patient visits. -ACHR_batch_visits <- function(occ,TN, CFs) { - - DV1= newColName(CFs[1]) - DV2= newColName(CFs[1:2]) - DV3= newColName(CFs[1:3]) - print(DV1) - print(DV2) - print(DV3) - -# pick subsets -- one visit at a time in this version, but could be more -bucket_list <- make_buckets_1(occ, 'Visit_ID') - -# get the size (number of buckets) -N = length(bucket_list) -print(bucket_list) - - -# pre-allocate the data.table. 
Tables are supposed to be faster. -ACHR = data.table(bucket=integer(N), - NEvents = integer(N), - VisitStart= character(N), - VisitStartInt = integer(N), - VisitDuration=double(N), - A_NetComplexity=double(N), - AR_NetComplexity=double(N), - ARW_NetComplexity=double(N), - A_Nodes=double(N), - AR_Nodes=double(N), - ARW_Nodes=double(N), - A_Edges=double(N), - AR_Edges=double(N), - ARW_Edges=double(N), - A_CompressRatio = double(N), - AR_CompressRatio = double(N), - ARW_CompressRatio = double(N), - A_Entropy = double(N), - AR_Entropy = double(N), - ARW_Entropy = double(N), - Visit_ID = character(N), - Subject_ID = character(N), - Clinic = character(N), - Procedure = character(N), - Diagnosis = character(N), # diag in the raw data - Diagnosis_group = character(N), - Physician = character(N), - Weekday = character(N), - Month = character(N) - ) - -# Now add columns for the IVs. There will be three for each IV - -# Add the IV columns -for (cf in CFs){ - - ACHR[, paste0(cf,"_count"):= double(N)] - ACHR[, paste0(cf,"_compression"):= double(N)] - ACHR[, paste0(cf,"_entropy"):= double(N)] -} - -# loop through the buckets. Result will be data frame with one row per bucket -for (i in 1:N){ - - b = i # as.integer(bucket_list[i]) - - # print once every 100 visits - if (b%%100==0) {print(b)} - # select the threads that go in this bucket - df = occ[occ[[TN]] ==bucket_list[i],] - - # bucket number - ACHR[b,bucket := b] - - # length of the thread (number of rows) - ACHR[b,NEvents := nrow(df)] - - # only do the computations if there are more than two occurrences - if (nrow(df) > 2) { - - # compute the duration of the visit in hours - ACHR[b,VisitDuration := difftime(max(lubridate::ymd_hms(df$tStamp)), min(lubridate::ymd_hms(df$tStamp)), units='hours') ] - - # compressibility of DV - ACHR[b,A_CompressRatio := compression_index(df,DV1)] - ACHR[b,AR_CompressRatio := compression_index(df,DV2)] - ACHR[b,ARW_CompressRatio := compression_index(df,DV3)] - - # NetComplexity of DV - # First get the network - # NetComplexity of DV - # First get the network - n = threads_to_network_original(df,TN, DV1) - ACHR[b,A_NetComplexity := estimate_network_complexity( n )] - ACHR[b,A_Nodes := nrow(n$nodeDF) ] - ACHR[b,A_Edges := nrow(n$edgeDF) ] - - n = threads_to_network_original(df,TN, DV2) - ACHR[b,AR_NetComplexity := estimate_network_complexity( n )] - ACHR[b,AR_Nodes := nrow(n$nodeDF) ] - ACHR[b,AR_Edges := nrow(n$edgeDF) ] - - n = threads_to_network_original(df,TN, DV3) - ACHR[b,ARW_NetComplexity := estimate_network_complexity( n )] - ACHR[b,ARW_Nodes := nrow(n$nodeDF) ] - ACHR[b,ARW_Edges := nrow(n$edgeDF) ] - - # get the entropy for AR and ARW - ACHR[b, AR_Entropy := compute_entropy(table(df[[DV2]])[table(df[[DV2]])>0]) ] - ACHR[b, ARW_Entropy := compute_entropy(table(df[[DV3]])[table(df[[DV3]])>0]) ] - - - # compute stuff on each context factor - for (cf in CFs){ - - # Count the unique elements in each cf - ACHR[b, paste0(cf,"_count") := length(unique(df[[cf]])) ] - - # get the compression - ACHR[b, paste0(cf,"_compression") := compression_index(df,cf) ] - - # get the entropy - ACHR[b, paste0(cf,"_entropy") := compute_entropy(table(df[[cf]])[table(df[[cf]])>0]) ] - - } -} # kf nrows > 2 - - # Now copy in the rest of data - # this only works because one visit is one bucket - - # ACHR[b,'VisitStart' := as.POSIXct( df[1,'tStamp']) ] - ACHR[b,'VisitStartInt' := df[1,'tStamp'] ] - - ACHR[b,'VisitStart' := as.character( df[1,'tStamp'] ) ] - - ACHR[b,'Visit_ID' := df[1,'Visit_ID']] - ACHR[b,'Subject_ID' := df[1,'Subject_ID']] - 
ACHR[b,'Clinic' := df[1,'Clinic']] - ACHR[b,'Procedure' := df[1,'Proc']] - ACHR[b,'Diagnosis' := df[1,'Diag']] - ACHR[b,'Diagnosis_group' := df[1,'Diagnosis_Group']] - ACHR[b,'Physician' := df[1,'Physician']] - ACHR[b,'Weekday' := df[1,'Weekday']] - ACHR[b,'Month' := df[1,'Month']] - - - -} # loop thru buckets - -# copy this one for consistency -ACHR[,'A_Entropy' := ACHR[,'Action_entropy'] ] - -# return the table -return(ACHR) -} - - -# Each bucket is a list of thread numbers that can be used to subset the list of occurrences -make_buckets_1 <- function(o, criteria){ - - return( unique(o[[criteria]]) ) - -} - - - - - -# -# make_box_plots <- function(){ -# ggboxplot(ACHR_test[NEvents>100 & Clinic=='DRH'], x = "VisitMonth", y = "NetComplexity", -# color = "VisitDay", -# ylab = "Complexity", xlab = "Month (DRH)") -# } -# - - - - - -get_timeScale <- function(){'hr'} - diff --git a/Data Prep/ACHR_Batch_V2.R b/Data Prep/ACHR_Batch_V2.R index 730f0b5..818ef08 100644 --- a/Data Prep/ACHR_Batch_V2.R +++ b/Data Prep/ACHR_Batch_V2.R @@ -35,7 +35,7 @@ # this function reads the raw data from URMC and creates files for clinic_day and visits -read_ACHR_data <- function(){ +read_ACHR_data_old <- function(){ library(tidyr) library(data.table) @@ -48,7 +48,9 @@ read_ACHR_data <- function(){ # This code is tailored for reading in new data from URMC, October 2018 # read the file into data frame - d <<- fread('auditfinal_10022018.csv') + d <<- fread('auditfinal_10022018.csv') + + # d <<- fread('10test.csv') # Sort and convert to data.table vt <<-data.table(arrange(d,desc(Visit_ID,asc(Timestamps)))) @@ -58,7 +60,7 @@ read_ACHR_data <- function(){ cnx = cn[c(13,1:12,14:15)] vt=setcolorder(vt,cnx) setnames(vt,'Timestamps','tStamp') - setnames(vt,'X','seqn') + # setnames(vt,'X','seqn') # make a dataframe copy just for fun vdf <<- as.data.frame(vt) From d1f3c1e13e67d2db370c2fe120dbea5ccf80e490 Mon Sep 17 00:00:00 2001 From: Brian Pentland Date: Tue, 20 Nov 2018 12:45:14 -0500 Subject: [PATCH 07/31] Converted to sapply for main loop --- .gitignore | 4 ++++ Data Prep/ACHR_Batch_PreProcess.R | 14 +++++++------- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/.gitignore b/.gitignore index 62fdb39..265a87a 100644 --- a/.gitignore +++ b/.gitignore @@ -27,3 +27,7 @@ rsconnect/ >>>>>>> be0331537859f5addf650269855aa0a9d49ca5d5 .Rproj.user emrt+Visit_ID+Role_Action.Rdata +Thrds+threadNum+Action.Rdata +Threads+threadNum+Action.Rdata +Threads+threadNum+Action_Role.Rdata +emrt+Visit_ID+Role_Action_Workstation.Rdata diff --git a/Data Prep/ACHR_Batch_PreProcess.R b/Data Prep/ACHR_Batch_PreProcess.R index ef43e23..ffa0a38 100644 --- a/Data Prep/ACHR_Batch_PreProcess.R +++ b/Data Prep/ACHR_Batch_PreProcess.R @@ -189,13 +189,13 @@ thread_occurrences <- function(occ, THREAD_CF, fname='emr'){ ############ extra stuff not used ############# - # # split occ data frame by threadNum to find earliest time value for that thread - # # then substract that from initiated relativeTime from above - # occ_split = lapply(split(occ, occ$threadNum), - # function(x) {x$relativeTime = difftime(x$relativeTime, min(lubridate::ymd_hms(x$tStamp)), units=timescale ); x}) - # - # # # row bind data frame back together - # occ= data.frame(do.call(rbind, occ_split)) + # split occ data frame by threadNum to find earliest time value for that thread + # then substract that from initiated relativeTime from above + occ_split = lapply(split(occ, occ$threadNum), + function(x) {x$relativeTime = difftime(x$relativeTime, min(lubridate::ymd_hms(x$tStamp)), 
units=timescale ); x})
+
+  # # row bind data frame back together
+  occ= data.frame(do.call(rbind, occ_split))

From 84d677a8cb83de686fe87a4f307dfd8190d9af52 Mon Sep 17 00:00:00 2001
From: Brian Pentland
Date: Thu, 22 Nov 2018 19:34:41 -0500
Subject: [PATCH 08/31] Added comments on work needed to get correct visit-role data
---
 .gitignore                        |  2 ++
 Data Prep/ACHR_Batch_PreProcess.R | 18 +++++++++++-------
 2 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/.gitignore b/.gitignore
index 265a87a..7cc7327 100644
--- a/.gitignore
+++ b/.gitignore
@@ -31,3 +31,5 @@ Thrds+threadNum+Action.Rdata
 Threads+threadNum+Action.Rdata
 Threads+threadNum+Action_Role.Rdata
 emrt+Visit_ID+Role_Action_Workstation.Rdata
+emr+Visit_ID.Rdata
+Thrds+threadNum+Action_Role_Workstation.Rdata
diff --git a/Data Prep/ACHR_Batch_PreProcess.R b/Data Prep/ACHR_Batch_PreProcess.R
index ffa0a38..3168d8b 100644
--- a/Data Prep/ACHR_Batch_PreProcess.R
+++ b/Data Prep/ACHR_Batch_PreProcess.R
@@ -143,10 +143,14 @@ thread_occurrences <- function(occ, THREAD_CF, fname='emr'){
   occ$threadNum = integer(nrow(occ))
   occ$seqNum = integer(nrow(occ))
 
-
+  #_#_#_#_#_#_ I think what we need to do to get the correct visit_role is sort by visit and time stamp
+  #_#_#_#_#_#_ But then SPLIT the visit according to visit_role or visit_workstation or whatever
   # Now sort the data set by the new threadNum and tStamp
   occ = occ[order(occ[[new_thread_col]],occ$tStamp),]
 
+
+  #_#_#_#_#_#_ This will be incorrect if the same role occurs more than once in a single visit.
+  #_#_#_#_#_#_ Need to break it out according to the changes in role within a visit...
   # get the list of unique identifies for the threads. The length of this list is the number of threads
   pov_list = unique(occ[[new_thread_col]])
   print(paste('Number of threads in this POV: ', length(pov_list)))

From 21a5ffd06f4ce71d7558c9d137f624202de89bca Mon Sep 17 00:00:00 2001
From: Brian Pentland
Date: Thu, 22 Nov 2018 19:37:30 -0500
Subject: [PATCH 09/31] Added new functions for aggregating data
---
 Data Prep/.gitignore                  |   1 -
 Data Prep/ACHR_Batch_Aggregate_Data.R | 495 ++++++++++++++++++++++++++
 2 files changed, 495 insertions(+), 1 deletion(-)
 create mode 100644 Data Prep/ACHR_Batch_Aggregate_Data.R

diff --git a/Data Prep/.gitignore b/Data Prep/.gitignore
index 8ccf702..e69de29 100644
--- a/Data Prep/.gitignore
+++ b/Data Prep/.gitignore
@@ -1 +0,0 @@
-ACHR_Batch_Aggregate_Data.R
diff --git a/Data Prep/ACHR_Batch_Aggregate_Data.R b/Data Prep/ACHR_Batch_Aggregate_Data.R
new file mode 100644
index 0000000..a8ff292
--- /dev/null
+++ b/Data Prep/ACHR_Batch_Aggregate_Data.R
@@ -0,0 +1,495 @@
+########################################################################################################## +# THREADNET: Batch processing for larger data sets +# +# ACHR_Batch_Aggregate_Data +# +# (c) 2018 Michigan State University. This software may be used according to the terms provided in the +# GNU General Public License (GPL-3.0) https://opensource.org/licenses/GPL-3.0? +# Brian Pentland +# Absolutely no warranty! +########################################################################################################## + + +# Two ways to aggregate the occurrences +# 1) By thread (usually visit) +# 2) By Clinic_day + + + + +################################################################### +################################################################### +# This is function is set up to aggregate the occurrences within threads -- typically visits +# occ = pre-processed threaded occurrences +# TN = threadNum in most cases +# CFs can be chosen -- they don't have to match the POV + +ACHR_batch_threads <- function(occ,TN, EVENT_CFs, ALL_CFs) { + + library(tidyr) + library(dplyr) + library(ThreadNet) + library(ngram) + library(lubridate) + library(stringr) + + # don't remember why this is here... + new_event_col = newColName(EVENT_CFs) + if (!(new_event_col %in% colnames(occ))) { occ = combineContextFactors(occ,EVENT_CFs,new_event_col) } + + # get the list of buckets + bucket_list <- make_buckets_1(occ, TN) + + # make data frame with results + Thrds = data.frame( t(sapply( bucket_list, + function(b){ + + # select a subset + df = occ[ occ[[TN]] ==b , ] + + # Only run for visits with more than two occurrences + + # get the network + n = threads_to_network_original(df,TN, new_event_col) + + # compute each parameter and put them in a vector + c( + bucket=b, + NEvents = nrow(df), + ThreadStart= as.character( df[1,'tStamp'] ), + ThreadStartInt = df[1,'tStamp'], + ThreadDuration= difftime(max(lubridate::ymd_hms(df$tStamp)), min(lubridate::ymd_hms(df$tStamp)), units='hours' ), + NetComplexity=estimate_network_complexity( n ), + Nodes=nrow(n$nodeDF), + Edges=nrow(n$edgeDF), + CompressRatio = compression_index(df,new_event_col), + Entropy = compute_entropy(table(df[[new_event_col]])[table(df[[new_event_col]])>0]), + NumProcedures = count_procedures(df$Proc[1]), + NumDiagnoses = count_diagnoses(df$Diag[1]), + wait_time = compute_wait_time(df), + # Visit_number = integer(N), + # LOC_CPT = character(N), + Visit_ID = df[1,'Visit_ID'], + Subject_ID = df[1,'Subject_ID'], + Clinic = df[1,'Clinic'], + ymd = df[1,'ymd'], + Clinic_ymd = df[1,'Clinic_ymd'], + Proc = df[1,'Proc'], + Diagnosis = df[1,'Diag'], # diag in the raw data + Diagnosis_group = df[1,'Diagnosis_Group'], + Physician = df[1,'Physician'], + Weekday = df[1,'Weekday'], + Month = df[1,'Month'], + Phase = compute_phase(df$tStamp[1]), + CF_Alignment = compute_alignment(df,TN, EVENT_CFs, ALL_CFs ), + + # need to figure out how to name these + sapply(ALL_CFs, function(cf){ + c( + assign( paste0(cf,"_count"), length(unique(df[[cf]]))), + assign( paste0(cf,"_entropy"), compute_entropy(table(df[[cf]])[table(df[[cf]])>0])) + ) }) + + ) + + } ))) + + save(Thrds, file=paste0(paste('Thrds',TN,new_event_col,sep='+'), '.Rdata')) + + return(Thrds) + +} + +# Each bucket is a list of thread numbers that can be used to subset the list of occurrences +make_buckets_1 <- function(o, criteria){ + + return( unique(o[[criteria]]) ) + +} + +# Need to write all these functions +compute_phase <- function(t){ + + t=as.Date(t) + + if ( t >= '2016-01-01' & t < 
'2016-06-07') return('one') + if ( t >= '2016-06-07' & t < '2016-09-01') return('two') + if ( t >= '2016-09-01' & t < '2017-04-15') return('three') + if ( t >= '2017-04-15' & t < '2017-09-01') return('four') + if ( t >= '2017-09-01' & t < '2017-12-31') return('five') + + # else return NA + return(NA) + +} + +# alignment is a measure of how much the number of nodes is increased by adding context factors +# always base this on the "Action" -- how many more "actions" are there? +# could also do this based on entropy, or write the function to use the values that were already computed +compute_alignment <- function(df,TN, EVENT_CFs, ALL_CFs ){ + + # first get numerator + n= length(unique(df$Action)) + + # Now get the total number with all of the CFs used to define the events + m= length(unique( df[[newColName(EVENT_CFs)]])) + + return(n/m) +} + + +compute_wait_time <- function(df){ + + # find the first occurrence of the chief complain. Seems to correspond closely to wait time + # Make sure it's not in the first 5 occurrences, and make sure to return a value of at least 1 if it never occurs + w= grep('MR_VN_CHIEF_COMPLAINT',df$Action)[1] + + return( difftime( lubridate::ymd_hms(df$tStamp[w]),lubridate::ymd_hms(df$tStamp[1]), units='hours' ) ) + +} + +count_procedures <- function( p ){ + + # get the overall number of items + total_num=length(grep('#@#', p )) + + # adjust for the visit codes that are not actual procedures + v=c('99211','99212','99213','99214','99215','99201','99202','99203','99204','99205') + + # just look at the first occurrence in the df + num_visit_codes = sum(sapply(v,function(x){length(grep( x, p ))})) + + # limit to non-negative + total_num = max(0, total_num - num_visit_codes) + + return( total_num ) +} + +# This one is easy +count_diagnoses <- function(d){ + return(length(grep('#@#', d ))) +} + + +## OLD ### +# ACHR_batch_threads_old <- function(occ,TN, EVENT_CFs, ALL_CFs) { +# +# library(tidyr) +# library(dplyr) +# library(ThreadNet) +# library(ngram) +# library(lubridate) +# library(stringr) +# +# new_event_col = newColName(EVENT_CFs) +# if (!(new_event_col %in% colnames(occ))) { occ = combineContextFactors(occ,EVENT_CFs,new_event_col) } +# +# # pick subsets --here we aggregate one thread at a time +# bucket_list <- make_buckets_1(occ, TN) +# +# # get the size (number of buckets) +# N = length(bucket_list) +# c +# # pre-allocate the result +# ACHR = data.table(bucket=integer(N), +# NEvents = integer(N), +# ThreadStart= character(N), +# ThreadStartInt = integer(N), +# ThreadDuration =double(N), +# NetComplexity=double(N), +# Nodes=double(N), +# Edges=double(N), +# CompressRatio = double(N), +# Entropy = double(N), +# NumProcedures = double(N), +# NumDiagnoses = double(N), +# wait_time = double(N), +# Visit_number = integer(N), +# LOC_CPT = character(N), +# Visit_ID = character(N), +# Subject_ID = character(N), +# Clinic = character(N), +# ymd = character(N), +# Clinic_ymd = character(N), +# Procedure = character(N), +# Diagnosis = character(N), # diag in the raw data +# Diagnosis_group = character(N), +# Physician = character(N), +# Weekday = character(N), +# Month = character(N), +# Phase = character(N), +# CF_Alignment = double(N) +# ) +# +# # Now add columns for the CFs. There will be two for each CF +# for (cf in ALL_CFs){ +# ACHR[, paste0(cf,"_count"):= double(N)] +# ACHR[, paste0(cf,"_entropy"):= double(N)] +# } +# +# # loop through the buckets. 
Result will be data frame with one row per bucket +# for (i in 1:N){ +# +# b = i # as.integer(bucket_list[i]) +# +# # print once every 100 visits +# if (b%%100==0) {print(b)} +# +# # select the threads that go in this bucket +# df = occ[occ[[TN]] ==bucket_list[i],] +# +# # bucket number +# ACHR[b,bucket := b] +# +# # length of the thread (number of rows) +# ACHR[b,NEvents := nrow(df)] +# +# # only do the computations if there are more than two occurrences +# if (nrow(df) > 2) { +# +# # compute the duration of the visit in hours +# ACHR[b,ThreadDuration := difftime(max(lubridate::ymd_hms(df$tStamp)), min(lubridate::ymd_hms(df$tStamp)), units='hours') ] +# +# # compressibility of DV +# ACHR[b,CompressRatio := compression_index(df,new_event_col)] +# # ACHR[b,AR_CompressRatio := compression_index(df,DV2)] +# # ACHR[b,ARW_CompressRatio := compression_index(df,DV3)] +# +# # NetComplexity of DV +# # First get the network +# # NetComplexity of DV +# # First get the network +# n = threads_to_network_original(df,TN, new_event_col) +# ACHR[b,NetComplexity := estimate_network_complexity( n )] +# ACHR[b,Nodes := nrow(n$nodeDF) ] +# ACHR[b,Edges := nrow(n$edgeDF) ] +# +# +# # compute stuff on each context factor +# for (cf in ALL_CFs){ +# +# # Count the unique elements in each cf +# ACHR[b, paste0(cf,"_count") := length(unique(df[[cf]])) ] +# +# # get the entropy +# ACHR[b, paste0(cf,"_entropy") := compute_entropy(table(df[[cf]])[table(df[[cf]])>0]) ] +# +# } +# } # kf nrows > 2 +# +# # Now copy in the rest of data +# # this only works because one visit is one bucket +# +# # ACHR[b,'VisitStart' := as.POSIXct( df[1,'tStamp']) ] +# ACHR[b,'ThreadStartInt' := df[1,'tStamp'] ] +# +# ACHR[b,'ThreadStart' := as.character( df[1,'tStamp'] ) ] +# +# ACHR[b,'Visit_ID' := df[1,'Visit_ID']] +# ACHR[b,'Subject_ID' := df[1,'Subject_ID']] +# ACHR[b,'Clinic' := df[1,'Clinic']] +# ACHR[b,'Procedure' := df[1,'Proc']] +# ACHR[b,'Diagnosis' := df[1,'Diag']] +# ACHR[b,'Diagnosis_group' := df[1,'Diagnosis_Group']] +# ACHR[b,'Physician' := df[1,'Physician']] +# ACHR[b,'Weekday' := df[1,'Weekday']] +# ACHR[b,'Month' := df[1,'Month']] +# +# +# +# } # loop thru buckets +# +# +# save(ACHR, file=paste0(paste('Threads',TN,new_event_col,sep='+'), '.Rdata')) +# +# +# # return the table +# return(ACHR) +# } + + + + + +################################################################### +################################################################### +ACHR_batch_clinic_days <- function(occ,TN, EVENT_CFs, ALL_CFs) { + + # Name for column that has events -- three variations + DV1= newColName(CFs[1]) + DV2= newColName(CFs[1:2]) + DV3= newColName(CFs[1:3]) + + print(DV1) + print(DV2) + print(DV3) + + # name for column that defines the buckets + BU = 'Clinic_ymd' + + # first get the date only + # occ$ymd <- format(as.POSIXct(occ$tStamp),"%Y-%m-%d") + + # make new columns as needed for clinic + day and events + # occ = unite(occ, 'Clinic_ymd', c('Clinic','ymd'),sep='_',remove = 'false') + + + + # pick subsets -- one visit at a time in this version, but could be more + bucket_list <- unique(occ[['Clinic_ymd']]) + + # get the size (number of buckets) + N = length(bucket_list) + print(N) + + # pre-allocate the data.table. Tables are supposed to be faster. 
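+  # (A toy version of this pre-allocation idiom, assuming nothing beyond base
+  #  data.table: data.table(x = integer(3), y = double(3)) builds a typed
+  #  3-row shell up front, which the loop below then fills in place with :=
+  #  instead of growing the table one row at a time.)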
+ ACHR = data.table(bucket=integer(N), + Clinic_date = character(N), + YMD_date = character(N), + NEvents = integer(N), + NumVisits = numeric(N), + A_NetComplexity=double(N), + AR_NetComplexity=double(N), + ARW_NetComplexity=double(N), + A_Nodes=double(N), + AR_Nodes=double(N), + ARW_Nodes=double(N), + A_Edges=double(N), + AR_Edges=double(N), + ARW_Edges=double(N), + A_CompressRatio = double(N), + AR_CompressRatio = double(N), + ARW_CompressRatio = double(N), + A_Entropy = double(N), + AR_Entropy = double(N), + ARW_Entropy = double(N), + Clinic = character(N), + NumUniqueProcedures = numeric(N), + NumUniqueDiagnosisGroups = numeric(N), + NumPhysicians = numeric(N), + Weekday = character(N), + Month = character(N) + ) + + # Now add columns for the IVs. There will be three for each IV + + # Add the IV columns + for (cf in CFs){ + + ACHR[, paste0(cf,"_count"):= double(N)] + ACHR[, paste0(cf,"_compression"):= double(N)] + ACHR[, paste0(cf,"_entropy"):= double(N)] + + } + + # loop through the buckets. Result will be data frame with one row per bucket + for (i in 1:N){ + + b = i # as.integer(bucket_list[i]) + + # print once every 10 buckets + if (b%%10==0) {print(b)} + + # select the threads that go in this bucket + df = occ[occ[[BU]] ==bucket_list[i],] + + # bucket number + ACHR[b,bucket := b] + + # length of the thread (number of rows) + ACHR[b,NEvents := nrow(df)] + + # only do the computations if there are more than two occurrences + if (nrow(df) > 2) { + + # compute the duration of the visit in hours + # ACHR[b,VisitDuration := difftime(max(lubridate::ymd_hms(df$tStamp)), min(lubridate::ymd_hms(df$tStamp)), units='hours') ] + + # compressibility of DV + ACHR[b,A_CompressRatio := compression_index(df,DV1)] + ACHR[b,AR_CompressRatio := compression_index(df,DV2)] + ACHR[b,ARW_CompressRatio := compression_index(df,DV3)] + + # NetComplexity of DV + # First get the network + n = threads_to_network_original(df,TN, DV1) + ACHR[b,A_NetComplexity := estimate_network_complexity( n )] + ACHR[b,A_Nodes := nrow(n$nodeDF) ] + ACHR[b,A_Edges := nrow(n$edgeDF) ] + + n = threads_to_network_original(df,TN, DV2) + ACHR[b,AR_NetComplexity := estimate_network_complexity( n )] + ACHR[b,AR_Nodes := nrow(n$nodeDF) ] + ACHR[b,AR_Edges := nrow(n$edgeDF) ] + + n = threads_to_network_original(df,TN, DV3) + ACHR[b,ARW_NetComplexity := estimate_network_complexity( n )] + ACHR[b,ARW_Nodes := nrow(n$nodeDF) ] + ACHR[b,ARW_Edges := nrow(n$edgeDF) ] + + # get the entropy for AR and ARW + ACHR[b, AR_Entropy := compute_entropy(table(df[[DV2]])[table(df[[DV2]])>0]) ] + ACHR[b, ARW_Entropy := compute_entropy(table(df[[DV3]])[table(df[[DV3]])>0]) ] + + + # compute stuff on each context factor + for (cf in CFs){ + + # Count the unique elements in each cf + ACHR[b, paste0(cf,"_count") := length(unique(df[[cf]])) ] + + # get the compression + ACHR[b, paste0(cf,"_compression") := compression_index(df,cf) ] + + # get the entropy + ACHR[b, paste0(cf,"_entropy") := compute_entropy(table(df[[cf]])[table(df[[cf]])>0]) ] + + } + } # kf nrows > 2 + + # Now copy in the rest of data + # this works because one visit is one bucket + # count the number of different diagnoses --> typical indicators of complexity + + + ACHR[b,'Clinic_date' := df[1,'Clinic_ymd']] + ACHR[b,'YMD_date' := df[1,'ymd']] + ACHR[b,'NumVisits' := length(unique(df[['Visit_ID']]))] + ACHR[b,'Clinic' := df[1,'Clinic']] + ACHR[b,'NumUniqueProcedures' := length(unique(df[['Proc']]))] + ACHR[b,'NumUniqueDiagnosisGroups' := length(unique(df[['Diagnosis_Group']]))] + 
ACHR[b,'NumPhysicians' := length(unique(df[['Physician']]))] + ACHR[b,'Weekday' := df[1,'Weekday']] + ACHR[b,'Month' := df[1,'Month']] + + + + } # loop thru buckets + + # copy this one for consistency + ACHR[,'A_Entropy' := ACHR[,'Action_entropy'] ] + + # return the table + return(ACHR) +} + +################################################################### +################################################################### + + + + +# +# make_box_plots <- function(){ +# ggboxplot(ACHR_test[NEvents>100 & Clinic=='DRH'], x = "VisitMonth", y = "NetComplexity", +# color = "VisitDay", +# ylab = "Complexity", xlab = "Month (DRH)") +# } +# + + + + + +get_timeScale <- function(){'hr'} + From 1db94381010a8a2a928a6c87258a44f727569b5b Mon Sep 17 00:00:00 2001 From: Brian Pentland Date: Fri, 23 Nov 2018 09:43:50 -0500 Subject: [PATCH 10/31] cleaning up the aggregation functions --- Data Prep/ACHR_Batch_Aggregate_Data.R | 86 +++++++++++++++++++-------- 1 file changed, 60 insertions(+), 26 deletions(-) diff --git a/Data Prep/ACHR_Batch_Aggregate_Data.R b/Data Prep/ACHR_Batch_Aggregate_Data.R index a8ff292..cb13f50 100644 --- a/Data Prep/ACHR_Batch_Aggregate_Data.R +++ b/Data Prep/ACHR_Batch_Aggregate_Data.R @@ -24,6 +24,8 @@ # TN = threadNum in most cases # CFs can be chosen -- they don't have to match the POV +# need to bring Role_ID forward when looking at Visit_Role, so we can track residents over time. + ACHR_batch_threads <- function(occ,TN, EVENT_CFs, ALL_CFs) { library(tidyr) @@ -33,9 +35,12 @@ ACHR_batch_threads <- function(occ,TN, EVENT_CFs, ALL_CFs) { library(lubridate) library(stringr) - # don't remember why this is here... + # Add columns for combinations of CFs if needed new_event_col = newColName(EVENT_CFs) if (!(new_event_col %in% colnames(occ))) { occ = combineContextFactors(occ,EVENT_CFs,new_event_col) } + + all_cf_col = newColName(ALL_CFs) + if (!(all_cf_col %in% colnames(occ))) { occ = combineContextFactors(occ,ALL_CFs,all_cf_col) } # get the list of buckets bucket_list <- make_buckets_1(occ, TN) @@ -49,16 +54,31 @@ ACHR_batch_threads <- function(occ,TN, EVENT_CFs, ALL_CFs) { # Only run for visits with more than two occurrences - # get the network - n = threads_to_network_original(df,TN, new_event_col) + # get the network -- only if there are enough rows... 
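+  # (The else branch below stands in a degenerate one-cell network: t(c(0)) is
+  #  a 1x1 matrix, so nrow(n$nodeDF) and nrow(n$edgeDF) still return 1 for
+  #  visits that are too short to form a real network.)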
+ + if (nrow(df)>2) n = threads_to_network_original(df,TN, new_event_col) + else n = list(edgeDF=t(c(0)),nodeDF=t(c(0))) # compute each parameter and put them in a vector c( bucket=b, + Clinic = df[1,'Clinic'], + Physician = as.character(df[1,'Physician']), + ymd = df[1,'ymd'], + Clinic_ymd = df[1,'Clinic_ymd'], + Weekday = df[1,'Weekday'], + Month = df[1,'Month'], + Phase = compute_phase(df$tStamp[1]), + threadNum = df[1,'threadNum'], + Visit_ID = df[1,'Visit_ID'], + Subject_ID = df[1,'Subject_ID'], NEvents = nrow(df), ThreadStart= as.character( df[1,'tStamp'] ), ThreadStartInt = df[1,'tStamp'], ThreadDuration= difftime(max(lubridate::ymd_hms(df$tStamp)), min(lubridate::ymd_hms(df$tStamp)), units='hours' ), + wait_time = compute_wait_time(df), + # Visit_number = integer(N), + # LOC_CPT = character(N), NetComplexity=estimate_network_complexity( n ), Nodes=nrow(n$nodeDF), Edges=nrow(n$edgeDF), @@ -66,34 +86,43 @@ ACHR_batch_threads <- function(occ,TN, EVENT_CFs, ALL_CFs) { Entropy = compute_entropy(table(df[[new_event_col]])[table(df[[new_event_col]])>0]), NumProcedures = count_procedures(df$Proc[1]), NumDiagnoses = count_diagnoses(df$Diag[1]), - wait_time = compute_wait_time(df), - # Visit_number = integer(N), - # LOC_CPT = character(N), - Visit_ID = df[1,'Visit_ID'], - Subject_ID = df[1,'Subject_ID'], - Clinic = df[1,'Clinic'], - ymd = df[1,'ymd'], - Clinic_ymd = df[1,'Clinic_ymd'], - Proc = df[1,'Proc'], - Diagnosis = df[1,'Diag'], # diag in the raw data - Diagnosis_group = df[1,'Diagnosis_Group'], - Physician = df[1,'Physician'], - Weekday = df[1,'Weekday'], - Month = df[1,'Month'], - Phase = compute_phase(df$tStamp[1]), - CF_Alignment = compute_alignment(df,TN, EVENT_CFs, ALL_CFs ), + Proc = as.character(df[1,'Proc']), + Diagnosis = as.character(df[1,'Diag']), + Diagnosis_group = df[1,'Diagnosis_Group'], + CF_Alignment = 1, + # CF_Alignment = compute_alignment(df,TN, EVENT_CFs, ALL_CFs ), + ALL_CF_count = length(unique(df[[all_cf_col]])), + ALL_CF_entropy = compute_entropy(table(df[[all_cf_col]])[table(df[[all_cf_col]])>0]), - # need to figure out how to name these + # name these afterwareds sapply(ALL_CFs, function(cf){ c( - assign( paste0(cf,"_count"), length(unique(df[[cf]]))), - assign( paste0(cf,"_entropy"), compute_entropy(table(df[[cf]])[table(df[[cf]])>0])) + length(unique(df[[cf]])), + compute_entropy(table(df[[cf]])[table(df[[cf]])>0]) + + # assign( paste0(cf,"_count"), length(unique(df[[cf]]))), + # assign( paste0(cf,"_entropy"), compute_entropy(table(df[[cf]])[table(df[[cf]])>0])) ) }) ) } ))) + + # name the last columns -- code has to match above + cn = as.vector(sapply(ALL_CFs, function(cf){ + c( paste0(cf,"_count"), + paste0(cf,"_entropy") ) })) + + # now assign them to the last columns + last_col = ncol(Thrds) + first_col= last_col-length(cn)+1 + setnames(Thrds, c(first_col:last_col), cn) + + # Computer the alignment of the context factors + Thrds$CF_Alignment = as.numeric( Thrds$Action_count) / as.numeric( Thrds$ALL_CF_count ) + + save(Thrds, file=paste0(paste('Thrds',TN,new_event_col,sep='+'), '.Rdata')) return(Thrds) @@ -131,7 +160,7 @@ compute_alignment <- function(df,TN, EVENT_CFs, ALL_CFs ){ # first get numerator n= length(unique(df$Action)) - # Now get the total number with all of the CFs used to define the events + # Now get the total number with all of the CFs m= length(unique( df[[newColName(EVENT_CFs)]])) return(n/m) @@ -150,14 +179,17 @@ compute_wait_time <- function(df){ count_procedures <- function( p ){ + # if no procedures, return zero + if (is.na( p )) 
return( 0 )
+
   # get the overall number of items
-  total_num=length(grep('#@#', p ))
+  total_num = str_count( p, '#@#')
 
   # adjust for the visit codes that are not actual procedures
   v=c('99211','99212','99213','99214','99215','99201','99202','99203','99204','99205')
 
   # just look at the first occurrence in the df
-  num_visit_codes = sum(sapply(v,function(x){length(grep( x, p ))}))
+  num_visit_codes = sum(str_count( p, v ))
 
   # limit to non-negative
   total_num = max(0, total_num - num_visit_codes)
@@ -167,7 +199,9 @@ count_procedures <- function( p ){
 
 # This one is easy
 count_diagnoses <- function(d){
-  return(length(grep('#@#', d )))
+
+  # Sometimes the marker is missing, so set the floor to one. There is always at least one.
+  return( max(1, str_count( d, '#@#' )))
 }
 

From 7dc9bbf2323552edb99efb389bf08976efac0351 Mon Sep 17 00:00:00 2001
From: Brian Pentland
Date: Fri, 23 Nov 2018 11:44:26 -0500
Subject: [PATCH 11/31] Fixed bug in alignment

---
 Data Prep/ACHR_Batch_Aggregate_Data.R | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/Data Prep/ACHR_Batch_Aggregate_Data.R b/Data Prep/ACHR_Batch_Aggregate_Data.R
index cb13f50..fb6a81a 100644
--- a/Data Prep/ACHR_Batch_Aggregate_Data.R
+++ b/Data Prep/ACHR_Batch_Aggregate_Data.R
@@ -34,6 +34,7 @@ ACHR_batch_threads <- function(occ,TN, EVENT_CFs, ALL_CFs) {
   library(ngram)
   library(lubridate)
   library(stringr)
+  library(data.table)
 
   # Add columns for combinations of CFs if needed
   new_event_col = newColName(EVENT_CFs)
@@ -119,8 +120,8 @@ ACHR_batch_threads <- function(occ,TN, EVENT_CFs, ALL_CFs) {
   first_col= last_col-length(cn)+1
   setnames(Thrds, c(first_col:last_col), cn)
 
-  # Computer the alignment of the context factors
-  Thrds$CF_Alignment = as.numeric( Thrds$Action_count) / as.numeric( Thrds$ALL_CF_count )
+  # Compute the alignment of the context factors
+  Thrds$CF_Alignment = as.numeric( as.character(Thrds$Action_count)) / as.numeric( as.character(Thrds$ALL_CF_count ))
 
   save(Thrds, file=paste0(paste('Thrds',TN,new_event_col,sep='+'), '.Rdata'))

From 5d544abe7f16c844b664309bb44364a0311327fd Mon Sep 17 00:00:00 2001
From: Brian Pentland
Date: Sat, 24 Nov 2018 04:01:57 -0500
Subject: [PATCH 12/31] Fixed bug in Visit_role -- and it's faster

---
 Data Prep/ACHR_Batch_Aggregate_Data.R | 20 +++------
 Data Prep/ACHR_Batch_PreProcess.R     | 60 ++++++++-------------------
 2 files changed, 24 insertions(+), 56 deletions(-)

diff --git a/Data Prep/ACHR_Batch_Aggregate_Data.R b/Data Prep/ACHR_Batch_Aggregate_Data.R
index fb6a81a..4b69919 100644
--- a/Data Prep/ACHR_Batch_Aggregate_Data.R
+++ b/Data Prep/ACHR_Batch_Aggregate_Data.R
@@ -90,27 +90,20 @@ ACHR_batch_threads <- function(occ,TN, EVENT_CFs, ALL_CFs) {
       Proc = as.character(df[1,'Proc']),
       Diagnosis = as.character(df[1,'Diag']),
       Diagnosis_group = df[1,'Diagnosis_Group'],
-      CF_Alignment = 1,
-      # CF_Alignment = compute_alignment(df,TN, EVENT_CFs, ALL_CFs ),
+      CF_Alignment = 1, # make placeholder, but compute below
       ALL_CF_count = length(unique(df[[all_cf_col]])),
       ALL_CF_entropy = compute_entropy(table(df[[all_cf_col]])[table(df[[all_cf_col]])>0]),
 
-      # name these afterwareds
+      # name these columns afterwards
       sapply(ALL_CFs, function(cf){
-        c(
-          length(unique(df[[cf]])),
-          compute_entropy(table(df[[cf]])[table(df[[cf]])>0])
-
-          # assign( paste0(cf,"_count"), length(unique(df[[cf]]))),
-          # assign( paste0(cf,"_entropy"), compute_entropy(table(df[[cf]])[table(df[[cf]])>0]))
-        ) })
+        c( length(unique(df[[cf]])),
+           compute_entropy(table(df[[cf]])[table(df[[cf]])>0]) ) })
 
       )
-
-    } )))
+    } )))
-
   # name the last
columns -- code has to match above + # name the last columns -- code has to match above cn = as.vector(sapply(ALL_CFs, function(cf){ c( paste0(cf,"_count"), paste0(cf,"_entropy") ) })) @@ -122,7 +115,6 @@ ACHR_batch_threads <- function(occ,TN, EVENT_CFs, ALL_CFs) { # Compute the alignment of the context factors Thrds$CF_Alignment = as.numeric( as.character(Thrds$Action_count)) / as.numeric( as.character(Thrds$ALL_CF_count )) - save(Thrds, file=paste0(paste('Thrds',TN,new_event_col,sep='+'), '.Rdata')) diff --git a/Data Prep/ACHR_Batch_PreProcess.R b/Data Prep/ACHR_Batch_PreProcess.R index 3168d8b..ded690e 100644 --- a/Data Prep/ACHR_Batch_PreProcess.R +++ b/Data Prep/ACHR_Batch_PreProcess.R @@ -54,7 +54,7 @@ read_ACHR_data <- function(fname){ # read the file into data frame d <- fread( paste0(fname, '.csv') ) - # Sort by visit and timestamp + # Sort by visit and timestamps d <- arrange(d,desc(Visit_ID,asc(Timestamps))) ################################################################################## @@ -122,7 +122,6 @@ cleanOccBatch <- function(fileRows){ # ALL_CF = c('Action','Workstation','Role','Clinic','Diagnosis_Group') # thread_CF = c('Visit_ID', 'Role', 'Workstation') # event_CF = c('Action','Role','Workstation') - # It returns the threaded set of occurrences and also saves it as Rdata. thread_occurrences <- function(occ, THREAD_CF, fname='emr'){ @@ -130,60 +129,36 @@ thread_occurrences <- function(occ, THREAD_CF, fname='emr'){ # these will be new column names new_thread_col = newColName(THREAD_CF) - # new_event_col = newColName(EVENT_CF) + # new_event_col = newColName(EVENT_CF) print(paste('new_thread_col: ',new_thread_col)) - # print(paste('new_event_col: ', new_event_col)) + # print(paste('new_event_col: ', new_event_col)) # Add names for the new columns, if necessary if (!(new_thread_col %in% colnames(occ))) { occ = combineContextFactors(occ,THREAD_CF,new_thread_col) } -# if (!(new_event_col %in% colnames(occ))) { occ = combineContextFactors(occ,EVENT_CF,new_event_col) } + # if (!(new_event_col %in% colnames(occ))) { occ = combineContextFactors(occ,EVENT_CF,new_event_col) } # ThreadNet code assumes these columns will be there, so we need to add them occ$threadNum = integer(nrow(occ)) occ$seqNum = integer(nrow(occ)) - #_#_#_#_#_#_ I think what we need to do to get the correct visit_role is sort by visit and time stamp - #_#_#_#_#_#_ But then SPLIT the visit according to visit_role or visit_workstation or whatever - # Now sort the data set by the new threadNum and tStamp - occ = occ[order(occ[[new_thread_col]],occ$tStamp),] - + # Assume the data are sorted by visit_ID and tStamp -- the natural flow of the visit. + idx_list = which(occ[[new_thread_col]] !=dplyr::lag(occ[[new_thread_col]])) + idx_list = c(idx_list, nrow(occ)) + start_idx = c(1, head(idx_list,-1)) # add 1 to the front and drop the last + end_idx = idx_list - 1 # shift back by one - #_#_#_#_#_#_ This will be incorrect if the same role is occurs more than once in a single visit. - #_#_#_#_#_#_ Need to break it out according to the changes in role within a visit... - # get the list of unique identifies for the threads. 
The length of this list is the number of threads - pov_list = unique(occ[[new_thread_col]]) - print(paste('Number of threads in this POV: ', length(pov_list))) + print(paste('Number of threads in this POV: ', length(idx_list))) - start_row=1 - thrd=1 - for (p in pov_list){ - - # get the length of the thread - tlen = sum(occ[[new_thread_col]]==p) - - # guard against error - if (length(tlen)==0) tlen=0 - if (tlen>0){ - - #compute the index of the end row - end_row = start_row+tlen-1 - - # they all get the same thread number and incrementing seqNum - occ[start_row:end_row, "threadNum"] <- as.matrix(rep(as.integer(thrd),tlen)) - occ[start_row:end_row, "seqNum"] <- seq(tlen) - - # increment the counters for the next thread - start_row = end_row + 1 - thrd=thrd+1 - - if (thrd %% 1000 ==0) print(paste0('Thread count = ',thrd)) - + # sapply(seq(length(idx_list)), function(x) + for (x in seq(length(idx_list))) + { if (x %% 1000 ==0) print(paste0('Thread count = ',x)) + occ[start_idx[x]:end_idx[x],'threadNum'] = x + occ[start_idx[x]:end_idx[x],'seqNum'] = c(1:(end_idx[x]-start_idx[x]+1)) } - } - # Now save the result for later and return it, as well. -# save(occ, file=paste0(paste(fname,new_thread_col,new_event_col,sep='+'), '.Rdata')) + # Now save the result for later and return it, as well. + # save(occ, file=paste0(paste(fname,new_thread_col,new_event_col,sep='+'), '.Rdata')) save(occ, file=paste0(paste(fname,new_thread_col,sep='+'), '.Rdata')) print(paste('Saved threaded occurrences: ', nrow(occ))) @@ -192,6 +167,7 @@ thread_occurrences <- function(occ, THREAD_CF, fname='emr'){ } + ############ extra stuff not used ############# # split occ data frame by threadNum to find earliest time value for that thread # # then substract that from initiated relativeTime from above From 809bd472aaee612dd63c19f10d5bdac4923a3225 Mon Sep 17 00:00:00 2001 From: Brian Pentland Date: Sun, 25 Nov 2018 12:56:23 -0500 Subject: [PATCH 13/31] cleaning up the trajectory code... 
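
A minimal sketch of the distance step these trajectory functions are built
on, with toy 2-gram frequency vectors (philentropy::distance() works on the
rows of a matrix, and 'cosine' is the method used in the code):

    library(philentropy)
    day1 <- c(5, 3, 0, 2)   # 2-gram counts for the reference day
    day2 <- c(4, 4, 1, 0)   # counts for the same 2-grams on a later day
    distance(rbind(day1, day2), method = 'cosine')
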
--- Data Prep/ACHR_Batch_Aggregate_Data.R | 519 +++++++++++++++--------- Data Prep/ACHR_Batch_PreProcess.R | 2 + Data Prep/ACHR_Clinic_Trajectories_v3.R | 102 ++--- Tables and Figures/Clinic_day_plots.R | 7 +- 4 files changed, 386 insertions(+), 244 deletions(-) diff --git a/Data Prep/ACHR_Batch_Aggregate_Data.R b/Data Prep/ACHR_Batch_Aggregate_Data.R index 4b69919..bad8ae5 100644 --- a/Data Prep/ACHR_Batch_Aggregate_Data.R +++ b/Data Prep/ACHR_Batch_Aggregate_Data.R @@ -10,12 +10,23 @@ ########################################################################################################## -# Two ways to aggregate the occurrences -# 1) By thread (usually visit) +# Two functions to aggregate the occurrences +# 1) By thread (usually visit, but could be visit_role) # 2) By Clinic_day +############################################################################################# +# Functions to make buckets +# Each bucket is a list of thread numbers that can be used to subset the list of occurrences +# Need be careful how you call thus because it can aggregate threads in unexpected ways +# For example, if you want Visit_ID_Role, use criteria = threadNum on the appropriately threaded input +make_buckets_1 <- function(o, criteria){ + + return( unique(o[[criteria]]) ) + +} + ################################################################### ################################################################### @@ -46,6 +57,10 @@ ACHR_batch_threads <- function(occ,TN, EVENT_CFs, ALL_CFs) { # get the list of buckets bucket_list <- make_buckets_1(occ, TN) + # print the number of buckets + print(paste0('Number of buckets=', length(bucket_list) )) + + # make data frame with results Thrds = data.frame( t(sapply( bucket_list, function(b){ @@ -73,6 +88,7 @@ ACHR_batch_threads <- function(occ,TN, EVENT_CFs, ALL_CFs) { threadNum = df[1,'threadNum'], Visit_ID = df[1,'Visit_ID'], Subject_ID = df[1,'Subject_ID'], + Role_ID = as.character( df[1,'Role_ID'] ), NEvents = nrow(df), ThreadStart= as.character( df[1,'tStamp'] ), ThreadStartInt = df[1,'tStamp'], @@ -96,8 +112,10 @@ ACHR_batch_threads <- function(occ,TN, EVENT_CFs, ALL_CFs) { # name these columns afterwards sapply(ALL_CFs, function(cf){ - c( length(unique(df[[cf]])), - compute_entropy(table(df[[cf]])[table(df[[cf]])>0]) ) }) + c( length(unique(df[[cf]])) + # , + # compute_entropy(table(df[[cf]])[table(df[[cf]])>0]) + ) }) ) } ))) @@ -105,8 +123,10 @@ ACHR_batch_threads <- function(occ,TN, EVENT_CFs, ALL_CFs) { # name the last columns -- code has to match above cn = as.vector(sapply(ALL_CFs, function(cf){ - c( paste0(cf,"_count"), - paste0(cf,"_entropy") ) })) + c( paste0(cf,"_count") + # , + # paste0(cf,"_entropy") + ) })) # now assign them to the last columns last_col = ncol(Thrds) @@ -122,12 +142,11 @@ ACHR_batch_threads <- function(occ,TN, EVENT_CFs, ALL_CFs) { } -# Each bucket is a list of thread numbers that can be used to subset the list of occurrences -make_buckets_1 <- function(o, criteria){ - - return( unique(o[[criteria]]) ) - -} + + +get_timeScale <- function(){'hr'} + + # Need to write all these functions compute_phase <- function(t){ @@ -148,18 +167,19 @@ compute_phase <- function(t){ # alignment is a measure of how much the number of nodes is increased by adding context factors # always base this on the "Action" -- how many more "actions" are there? 
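 # (worked example: 10 distinct Actions spread over 25 distinct
 #  Action_Role_Workstation combinations gives 10/25 = 0.4; a ratio near 1
 #  means the extra context factors add almost no new variety)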
# could also do this based on entropy, or write the function to use the values that were already computed -compute_alignment <- function(df,TN, EVENT_CFs, ALL_CFs ){ - - # first get numerator - n= length(unique(df$Action)) - - # Now get the total number with all of the CFs - m= length(unique( df[[newColName(EVENT_CFs)]])) - - return(n/m) -} +# compute_alignment <- function(df,TN, EVENT_CFs, ALL_CFs ){ +# +# # first get numerator +# n= length(unique(df$Action)) +# +# # Now get the total number with all of the CFs +# m= length(unique( df[[newColName(EVENT_CFs)]])) +# +# return(n/m) +# } +# NEEDS WORK!! Far too simplistic. compute_wait_time <- function(df){ # find the first occurrence of the chief complain. Seems to correspond closely to wait time @@ -170,6 +190,7 @@ compute_wait_time <- function(df){ } +# THIS ONE IS GOOD. count_procedures <- function( p ){ # if no procedures, return zero @@ -198,6 +219,129 @@ count_diagnoses <- function(d){ } + + + + +################################################################### +################################################################### +# This is function is set up to aggregate the occurrences among collections of visit -- typically clinic_days +# occ = pre-processed threaded occurrences +# TN = threadNum in most cases +# CFs can be chosen -- they don't have to match the POV + +# need to bring Role_ID forward when looking at Visit_Role, so we can track residents over time. + +ACHR_batch_clinic_days <- function(occ,TN='Clinic_ymd', EVENT_CFs, ALL_CFs) { + + library(tidyr) + library(dplyr) + library(ThreadNet) + library(ngram) + library(lubridate) + library(stringr) + library(data.table) + + # make a list of unique buckets. Typical example + bucket_list <- unique(occ[[TN]]) + + # print the number of buckets + print(paste0('Number of buckets=', length(bucket_list) )) + + # Add columns for combinations of CFs if needed + new_event_col = newColName(EVENT_CFs) + if (!(new_event_col %in% colnames(occ))) { occ = combineContextFactors(occ,EVENT_CFs,new_event_col) } + + all_cf_col = newColName(ALL_CFs) + if (!(all_cf_col %in% colnames(occ))) { occ = combineContextFactors(occ,ALL_CFs,all_cf_col) } + + + # make data frame with results + cds = data.frame( t(sapply( bucket_list, + function(b){ + + # select a subset of occurrences for the bucket + df = occ[ occ[[TN]] ==b , ] + + + # get the network -- only if there are enough rows... 
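+  # (For intuition on the entropy terms below, computed on a toy frequency
+  #  table: with counts c(5,3,2), p <- c(5,3,2)/10 and -sum(p * log2(p)) is
+  #  about 1.49 bits; compute_entropy() is assumed to implement this Shannon
+  #  form.)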
+ if (nrow(df)>2) n = threads_to_network_original(df,TN, new_event_col) + else n = list(edgeDF=t(c(0)),nodeDF=t(c(0))) + + # compute each parameter and put them in a vector + c( + bucket=b, + Clinic = df[1,'Clinic'], + Physician = as.character(df[1,'Physician']), + ymd = df[1,'ymd'], + Clinic_ymd = df[1,'Clinic_ymd'], + Weekday = df[1,'Weekday'], + Month = df[1,'Month'], + Phase = compute_phase(df$tStamp[1]), + threadNum = df[1,'threadNum'], + Visit_ID = df[1,'Visit_ID'], + Subject_ID = df[1,'Subject_ID'], + NEvents = nrow(df), + bucketStart= as.character( df[1,'tStamp'] ), + bucketStartInt = df[1,'tStamp'], + bucketDuration= difftime(max(lubridate::ymd_hms(df$tStamp)), min(lubridate::ymd_hms(df$tStamp)), units='hours' ), + # wait_time = compute_wait_time(df), + # Visit_number = integer(N), + # LOC_CPT = character(N), + # NumUniqueProcedures = length(unique(df[['Proc']]) + NumVisits = length(unique(df[['Visit_ID']])), + NumUniqueDiagnosisGroups = length(unique(df[['Diagnosis_Group']])), + NumPhysicians = length(unique(df[['Physician']])), + TotalStaff = length(unique(df[['Role_ID']])), + NetComplexity=estimate_network_complexity( n ), + Nodes=nrow(n$nodeDF), + Edges=nrow(n$edgeDF), + CompressRatio = compression_index(df,new_event_col), + Entropy = compute_entropy(table(df[[new_event_col]])[table(df[[new_event_col]])>0]), + # NumProcedures = count_procedures(df$Proc[1]), + # NumDiagnoses = count_diagnoses(df$Diag[1]), + # Diagnosis_group = df[1,'Diagnosis_Group'], + CF_Alignment = 1, # make placeholder, but compute below + ALL_CF_count = length(unique(df[[all_cf_col]])), + ALL_CF_entropy = compute_entropy(table(df[[all_cf_col]])[table(df[[all_cf_col]])>0]), + + # name these columns afterwards + sapply(ALL_CFs, function(cf){ + c( length(unique(df[[cf]])) + # , + # compute_entropy(table(df[[cf]])[table(df[[cf]])>0]) + ) }) + + ) + } ))) + + + # name the last columns -- code has to match above + cn = as.vector(sapply(ALL_CFs, function(cf){ + c( paste0(cf,"_count") + # , + # paste0(cf,"_entropy") + ) })) + + # now assign them to the last columns + last_col = ncol(cds) + first_col= last_col-length(cn)+1 + setnames(cds, c(first_col:last_col), cn) + + # Compute the alignment of the context factors + cds$CF_Alignment = as.numeric( as.character(cds$Action_count)) / as.numeric( as.character(cds$ALL_CF_count )) + + save(cds, file=paste0(paste('cds',TN,new_event_col,sep='+'), '.Rdata')) + + return(cds) + +} + + +################################################################### +################################################################### +## OLD ### +## OLD ### ## OLD ### # ACHR_batch_threads_old <- function(occ,TN, EVENT_CFs, ALL_CFs) { # @@ -335,176 +479,167 @@ count_diagnoses <- function(d){ # } - - - ################################################################### ################################################################### -ACHR_batch_clinic_days <- function(occ,TN, EVENT_CFs, ALL_CFs) { - - # Name for column that has events -- three variations - DV1= newColName(CFs[1]) - DV2= newColName(CFs[1:2]) - DV3= newColName(CFs[1:3]) - - print(DV1) - print(DV2) - print(DV3) - - # name for column that defines the buckets - BU = 'Clinic_ymd' - - # first get the date only - # occ$ymd <- format(as.POSIXct(occ$tStamp),"%Y-%m-%d") - - # make new columns as needed for clinic + day and events - # occ = unite(occ, 'Clinic_ymd', c('Clinic','ymd'),sep='_',remove = 'false') - - - - # pick subsets -- one visit at a time in this version, but could be more - bucket_list <- unique(occ[['Clinic_ymd']]) - 
- # get the size (number of buckets) - N = length(bucket_list) - print(N) - - # pre-allocate the data.table. Tables are supposed to be faster. - ACHR = data.table(bucket=integer(N), - Clinic_date = character(N), - YMD_date = character(N), - NEvents = integer(N), - NumVisits = numeric(N), - A_NetComplexity=double(N), - AR_NetComplexity=double(N), - ARW_NetComplexity=double(N), - A_Nodes=double(N), - AR_Nodes=double(N), - ARW_Nodes=double(N), - A_Edges=double(N), - AR_Edges=double(N), - ARW_Edges=double(N), - A_CompressRatio = double(N), - AR_CompressRatio = double(N), - ARW_CompressRatio = double(N), - A_Entropy = double(N), - AR_Entropy = double(N), - ARW_Entropy = double(N), - Clinic = character(N), - NumUniqueProcedures = numeric(N), - NumUniqueDiagnosisGroups = numeric(N), - NumPhysicians = numeric(N), - Weekday = character(N), - Month = character(N) - ) - - # Now add columns for the IVs. There will be three for each IV - - # Add the IV columns - for (cf in CFs){ - - ACHR[, paste0(cf,"_count"):= double(N)] - ACHR[, paste0(cf,"_compression"):= double(N)] - ACHR[, paste0(cf,"_entropy"):= double(N)] - - } - - # loop through the buckets. Result will be data frame with one row per bucket - for (i in 1:N){ - - b = i # as.integer(bucket_list[i]) - - # print once every 10 buckets - if (b%%10==0) {print(b)} - - # select the threads that go in this bucket - df = occ[occ[[BU]] ==bucket_list[i],] - - # bucket number - ACHR[b,bucket := b] - - # length of the thread (number of rows) - ACHR[b,NEvents := nrow(df)] - - # only do the computations if there are more than two occurrences - if (nrow(df) > 2) { - - # compute the duration of the visit in hours - # ACHR[b,VisitDuration := difftime(max(lubridate::ymd_hms(df$tStamp)), min(lubridate::ymd_hms(df$tStamp)), units='hours') ] - - # compressibility of DV - ACHR[b,A_CompressRatio := compression_index(df,DV1)] - ACHR[b,AR_CompressRatio := compression_index(df,DV2)] - ACHR[b,ARW_CompressRatio := compression_index(df,DV3)] - - # NetComplexity of DV - # First get the network - n = threads_to_network_original(df,TN, DV1) - ACHR[b,A_NetComplexity := estimate_network_complexity( n )] - ACHR[b,A_Nodes := nrow(n$nodeDF) ] - ACHR[b,A_Edges := nrow(n$edgeDF) ] - - n = threads_to_network_original(df,TN, DV2) - ACHR[b,AR_NetComplexity := estimate_network_complexity( n )] - ACHR[b,AR_Nodes := nrow(n$nodeDF) ] - ACHR[b,AR_Edges := nrow(n$edgeDF) ] - - n = threads_to_network_original(df,TN, DV3) - ACHR[b,ARW_NetComplexity := estimate_network_complexity( n )] - ACHR[b,ARW_Nodes := nrow(n$nodeDF) ] - ACHR[b,ARW_Edges := nrow(n$edgeDF) ] - - # get the entropy for AR and ARW - ACHR[b, AR_Entropy := compute_entropy(table(df[[DV2]])[table(df[[DV2]])>0]) ] - ACHR[b, ARW_Entropy := compute_entropy(table(df[[DV3]])[table(df[[DV3]])>0]) ] - - - # compute stuff on each context factor - for (cf in CFs){ - - # Count the unique elements in each cf - ACHR[b, paste0(cf,"_count") := length(unique(df[[cf]])) ] - - # get the compression - ACHR[b, paste0(cf,"_compression") := compression_index(df,cf) ] - - # get the entropy - ACHR[b, paste0(cf,"_entropy") := compute_entropy(table(df[[cf]])[table(df[[cf]])>0]) ] - - } - } # kf nrows > 2 - - # Now copy in the rest of data - # this works because one visit is one bucket - # count the number of different diagnoses --> typical indicators of complexity - - - ACHR[b,'Clinic_date' := df[1,'Clinic_ymd']] - ACHR[b,'YMD_date' := df[1,'ymd']] - ACHR[b,'NumVisits' := length(unique(df[['Visit_ID']]))] - ACHR[b,'Clinic' := df[1,'Clinic']] - 
ACHR[b,'NumUniqueProcedures' := length(unique(df[['Proc']]))] - ACHR[b,'NumUniqueDiagnosisGroups' := length(unique(df[['Diagnosis_Group']]))] - ACHR[b,'NumPhysicians' := length(unique(df[['Physician']]))] - ACHR[b,'Weekday' := df[1,'Weekday']] - ACHR[b,'Month' := df[1,'Month']] - - - - } # loop thru buckets - - # copy this one for consistency - ACHR[,'A_Entropy' := ACHR[,'Action_entropy'] ] - - # return the table - return(ACHR) -} - -################################################################### -################################################################### - - - +# ACHR_batch_clinic_days <- function(occ,TN, EVENT_CFs, ALL_CFs) { +# +# # Name for column that has events -- three variations +# DV1= newColName(CFs[1]) +# DV2= newColName(CFs[1:2]) +# DV3= newColName(CFs[1:3]) +# +# print(DV1) +# print(DV2) +# print(DV3) +# +# # name for column that defines the buckets +# BU = 'Clinic_ymd' +# +# # first get the date only +# # occ$ymd <- format(as.POSIXct(occ$tStamp),"%Y-%m-%d") +# +# # make new columns as needed for clinic + day and events +# # occ = unite(occ, 'Clinic_ymd', c('Clinic','ymd'),sep='_',remove = 'false') +# +# +# +# # pick subsets -- one visit at a time in this version, but could be more +# bucket_list <- unique(occ[['Clinic_ymd']]) +# +# # get the size (number of buckets) +# N = length(bucket_list) +# print(N) +# +# # pre-allocate the data.table. Tables are supposed to be faster. +# ACHR = data.table(bucket=integer(N), +# Clinic_date = character(N), +# YMD_date = character(N), +# NEvents = integer(N), +# NumVisits = numeric(N), +# A_NetComplexity=double(N), +# AR_NetComplexity=double(N), +# ARW_NetComplexity=double(N), +# A_Nodes=double(N), +# AR_Nodes=double(N), +# ARW_Nodes=double(N), +# A_Edges=double(N), +# AR_Edges=double(N), +# ARW_Edges=double(N), +# A_CompressRatio = double(N), +# AR_CompressRatio = double(N), +# ARW_CompressRatio = double(N), +# A_Entropy = double(N), +# AR_Entropy = double(N), +# ARW_Entropy = double(N), +# Clinic = character(N), +# NumUniqueProcedures = numeric(N), +# NumUniqueDiagnosisGroups = numeric(N), +# NumPhysicians = numeric(N), +# Weekday = character(N), +# Month = character(N) +# ) +# +# # Now add columns for the IVs. There will be three for each IV +# +# # Add the IV columns +# for (cf in CFs){ +# +# ACHR[, paste0(cf,"_count"):= double(N)] +# ACHR[, paste0(cf,"_compression"):= double(N)] +# ACHR[, paste0(cf,"_entropy"):= double(N)] +# +# } +# +# # loop through the buckets. 
Result will be data frame with one row per bucket +# for (i in 1:N){ +# +# b = i # as.integer(bucket_list[i]) +# +# # print once every 10 buckets +# if (b%%10==0) {print(b)} +# +# # select the threads that go in this bucket +# df = occ[occ[[BU]] ==bucket_list[i],] +# +# # bucket number +# ACHR[b,bucket := b] +# +# # length of the thread (number of rows) +# ACHR[b,NEvents := nrow(df)] +# +# # only do the computations if there are more than two occurrences +# if (nrow(df) > 2) { +# +# # compute the duration of the visit in hours +# # ACHR[b,VisitDuration := difftime(max(lubridate::ymd_hms(df$tStamp)), min(lubridate::ymd_hms(df$tStamp)), units='hours') ] +# +# # compressibility of DV +# ACHR[b,A_CompressRatio := compression_index(df,DV1)] +# ACHR[b,AR_CompressRatio := compression_index(df,DV2)] +# ACHR[b,ARW_CompressRatio := compression_index(df,DV3)] +# +# # NetComplexity of DV +# # First get the network +# n = threads_to_network_original(df,TN, DV1) +# ACHR[b,A_NetComplexity := estimate_network_complexity( n )] +# ACHR[b,A_Nodes := nrow(n$nodeDF) ] +# ACHR[b,A_Edges := nrow(n$edgeDF) ] +# +# n = threads_to_network_original(df,TN, DV2) +# ACHR[b,AR_NetComplexity := estimate_network_complexity( n )] +# ACHR[b,AR_Nodes := nrow(n$nodeDF) ] +# ACHR[b,AR_Edges := nrow(n$edgeDF) ] +# +# n = threads_to_network_original(df,TN, DV3) +# ACHR[b,ARW_NetComplexity := estimate_network_complexity( n )] +# ACHR[b,ARW_Nodes := nrow(n$nodeDF) ] +# ACHR[b,ARW_Edges := nrow(n$edgeDF) ] +# +# # get the entropy for AR and ARW +# ACHR[b, AR_Entropy := compute_entropy(table(df[[DV2]])[table(df[[DV2]])>0]) ] +# ACHR[b, ARW_Entropy := compute_entropy(table(df[[DV3]])[table(df[[DV3]])>0]) ] +# +# +# # compute stuff on each context factor +# for (cf in CFs){ +# +# # Count the unique elements in each cf +# ACHR[b, paste0(cf,"_count") := length(unique(df[[cf]])) ] +# +# # get the compression +# ACHR[b, paste0(cf,"_compression") := compression_index(df,cf) ] +# +# # get the entropy +# ACHR[b, paste0(cf,"_entropy") := compute_entropy(table(df[[cf]])[table(df[[cf]])>0]) ] +# +# } +# } # kf nrows > 2 +# +# # Now copy in the rest of data +# # this works because one visit is one bucket +# # count the number of different diagnoses --> typical indicators of complexity +# +# +# ACHR[b,'Clinic_date' := df[1,'Clinic_ymd']] +# ACHR[b,'YMD_date' := df[1,'ymd']] +# ACHR[b,'NumVisits' := length(unique(df[['Visit_ID']]))] +# ACHR[b,'Clinic' := df[1,'Clinic']] +# ACHR[b,'NumUniqueProcedures' := length(unique(df[['Proc']]))] +# ACHR[b,'NumUniqueDiagnosisGroups' := length(unique(df[['Diagnosis_Group']]))] +# ACHR[b,'NumPhysicians' := length(unique(df[['Physician']]))] +# ACHR[b,'Weekday' := df[1,'Weekday']] +# ACHR[b,'Month' := df[1,'Month']] +# +# +# +# } # loop thru buckets +# +# # copy this one for consistency +# ACHR[,'A_Entropy' := ACHR[,'Action_entropy'] ] +# +# # return the table +# return(ACHR) +# } # # make_box_plots <- function(){ @@ -516,7 +651,3 @@ ACHR_batch_clinic_days <- function(occ,TN, EVENT_CFs, ALL_CFs) { - - -get_timeScale <- function(){'hr'} - diff --git a/Data Prep/ACHR_Batch_PreProcess.R b/Data Prep/ACHR_Batch_PreProcess.R index ded690e..a639574 100644 --- a/Data Prep/ACHR_Batch_PreProcess.R +++ b/Data Prep/ACHR_Batch_PreProcess.R @@ -145,6 +145,8 @@ thread_occurrences <- function(occ, THREAD_CF, fname='emr'){ # Assume the data are sorted by visit_ID and tStamp -- the natural flow of the visit. 
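   # (Toy check of the lag() trick below: for v <- c('a','a','b','b','c'),
   #  dplyr::lag(v) is c(NA,'a','a','b','b'), so which(v != dplyr::lag(v))
   #  returns c(3, 5) -- exactly the rows where a new thread begins.)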
idx_list = which(occ[[new_thread_col]] !=dplyr::lag(occ[[new_thread_col]])) idx_list = c(idx_list, nrow(occ)) + + # creates two lists of indices so they match up correctly. start_idx = c(1, head(idx_list,-1)) # add 1 to the front and drop the last end_idx = idx_list - 1 # shift back by one diff --git a/Data Prep/ACHR_Clinic_Trajectories_v3.R b/Data Prep/ACHR_Clinic_Trajectories_v3.R index 48525c5..5553868 100644 --- a/Data Prep/ACHR_Clinic_Trajectories_v3.R +++ b/Data Prep/ACHR_Clinic_Trajectories_v3.R @@ -22,15 +22,6 @@ library(philentropy) library(stringr) library(zoo) -# need function to count the number of procedures and number of diagnoses in the data. -# some visits have dozens of them -# need to pass in data frame that just has the column with the procedures or diagnoses -count_procs <- function(o){ - print(nrow(o)) - -unlist(lapply(1:nrow(o), function(i){ str_count(o[i], pattern='#@#') })) - -} # Use this to put the trajectory data with the clinic_day data merge_clinic_days_and_trajectories <- function(){ @@ -58,20 +49,23 @@ get_bucket <- function(o, b ){ # @param e data frame for POV -# @param blist is the bucket list. Each bucket is a "window" +# @param bucketCF is the column used to define the bucket list. Each bucket is a "window" # @param cf is the column that defines events (e.g. 'Action_Role') -# @param filter list contains the threshold levels for filtering out edges with low frequency: c(0, 0.01, 0.02, 0.03, 0.04, 0.05, 0.1) -graph_trajectory_filtered <- function(e, cf, reference_day=1, filter_list,save_file_name) { +# @param filter list contains the threshold level for filtering out edges with low frequency. +# +# need to make sure this works with roles over time... +# +graph_trajectory_filtered <- function(e, bucketCF, cf, n_gram_size=2, reference_day=1, filter_list=0,save_file_name) { # get total size of possible matrix Max_Order = length(unique(e[[cf]]))^2 - # make data frame for results + # make data frame for results - bid is the bucket_ID vt=data.frame( ngrams=character(), freq=integer(), bid=integer() ) # here is the list of buckets - blist = unique(e[['Clinic_ymd']]) + blist = unique(e[[bucketCF]]) # take the underscore out so we can use it to strsplit below... and sort it blist=gsub('HH_POB','HHPOB',blist) @@ -91,8 +85,10 @@ graph_trajectory_filtered <- function(e, cf, reference_day=1, filter_list,save_ # print(paste('b =',b)) # get text vector for the whole data set. Bucket needs at least 2 threads + # n_gram_size = 1 for 1-grams, n_gram_size = 2 for pairs. 
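+  # (count_ngrams() is ThreadNet's own helper; for intuition, the base ngram
+  #  package does the same kind of counting on a space-separated string:
+  #    library(ngram)
+  #    get.phrasetable(ngram("check_in vitals exam check_out", n = 2))
+  #  lists each consecutive pair of actions with its frequency.)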
th = get_bucket(e, b) - if (nrow(th)>2) { ngdf = count_ngrams(th, 'threadNum', cf, 2)[1:2] } + if (nrow(th)>2) { ngdf = count_ngrams(th, 'threadNum', cf, n_gram_size)[1:2] } + nodes = length(unique(th[[cf]])) @@ -139,43 +135,53 @@ graph_trajectory_filtered <- function(e, cf, reference_day=1, filter_list,save_ } - #Make a matrix to filter by where all the entries are 0 < f < 1 - fm = windowFreqMatrix/max(windowFreqMatrix) + # #Make a matrix to filter by where all the entries are 0 < f < 1 + # fm = windowFreqMatrix/max(windowFreqMatrix) + # + # # if the number if below the threshold, set it to zero + # windowFreqMatrix[fm<=f] = 0 + + df = data.frame( t(sapply( 1:(nWindows-1), + function(i){ + c( + bckt = blist[i], + Clinic = unlist(strsplit(blist[i],'_'))[1], + ymd = unlist(strsplit(blist[i],'_'))[2], + pct_retained = sum((windowFreqMatrix[i,]>0)+(windowFreqMatrix[i+1,]>0)==2)/sum(windowFreqMatrix[i,]>0), + pct_possible = sum(windowFreqMatrix[i,] > 0)/Max_Order, + complexity = estimate_task_complexity_index( nodes ,sum(windowFreqMatrix[i,] > 0) ), + Dist_from_reference = distance(rbind(windowFreqMatrix[i,],windowFreqMatrix[reference_day,]), method='cosine' ), + Dist_from_next = distance(rbind(windowFreqMatrix[i,],windowFreqMatrix[i+1,]), method='cosine' ) + ) + }))) - # Now filter out the edges - for (f in filter_list) { + # correlate each row with the first one stick it in a dataframe + # df =data.frame(window=1:(nWindows-1), + # Clinic_ymd = blist[1:(nWindows-1)], + # Clinic= unlist(lapply(1:(nWindows-1), + # function(i){ unlist(strsplit(blist[i],'_'))[1] })), + # ymd=unlist(lapply(1:(nWindows-1), + # function(i){ unlist(strsplit(blist[i],'_'))[2] })), + # pct_retained = unlist(lapply(1:(nWindows-1), + # function(i){sum((windowFreqMatrix[i,]>0)+(windowFreqMatrix[i+1,]>0)==2)/ + # sum(windowFreqMatrix[i,]>0) + # })) , + # pct_possible = unlist(lapply(1:(nWindows-1), + # function(i){sum(windowFreqMatrix[i,] > 0)/Max_Order })) , + # complexity = unlist(lapply(1:(nWindows-1), + # function(i){ estimate_task_complexity_index( nodes ,sum(windowFreqMatrix[i,] > 0) ) })) , + # Dist_from_reference= unlist(lapply(1:(nWindows-1), + # function(i){distance(rbind(windowFreqMatrix[i,],windowFreqMatrix[reference_day,]), + # method='cosine' ) })), + # Dist_from_next= unlist(lapply(1:(nWindows-1), + # function(i){distance(rbind(windowFreqMatrix[i,],windowFreqMatrix[i+1,]), + # method='cosine' ) })) + # ) + + save(df, file=paste0(save_file_name,'.rData') ) - # if the number if below the threshold, set it to zero - windowFreqMatrix[fm<=f] = 0 - # correlate each row with the first one stick it in a dataframe - df =data.frame(window=1:(nWindows-1), - Clinic_ymd = blist[1:(nWindows-1)], - Clinic= unlist(lapply(1:(nWindows-1), - function(i){ unlist(strsplit(blist[i],'_'))[1] })), - ymd=unlist(lapply(1:(nWindows-1), - function(i){ unlist(strsplit(blist[i],'_'))[2] })), - pct_retained = unlist(lapply(1:(nWindows-1), - function(i){sum((windowFreqMatrix[i,]>0)+(windowFreqMatrix[i+1,]>0)==2)/ - sum(windowFreqMatrix[i,]>0) - })) , - pct_possible = unlist(lapply(1:(nWindows-1), - function(i){sum(windowFreqMatrix[i,] > 0)/Max_Order })) , - complexity = unlist(lapply(1:(nWindows-1), - function(i){ estimate_task_complexity_index( nodes ,sum(windowFreqMatrix[i,] > 0) ) })) , - Dist_from_reference= unlist(lapply(1:(nWindows-1), - function(i){distance(rbind(windowFreqMatrix[i,],windowFreqMatrix[reference_day,]), - method='cosine' ) })), - Dist_from_next= unlist(lapply(1:(nWindows-1), - 
function(i){distance(rbind(windowFreqMatrix[i,],windowFreqMatrix[i+1,]),
-                                            method='cosine' ) }))
-  )
-
-  save(df, file=paste0(save_file_name,f,'.rData') )
-
-  } # f in filter list
-
   return(df)
 
 
 # get the ngram data and labels
diff --git a/Tables and Figures/Clinic_day_plots.R b/Tables and Figures/Clinic_day_plots.R
index e0fb1ac..a6e05a5 100644
--- a/Tables and Figures/Clinic_day_plots.R
+++ b/Tables and Figures/Clinic_day_plots.R
@@ -34,8 +34,11 @@ plots_for_papers <- function(){
   plot(cds$Clinic.x,cds$A_NetComplexity)
 
   # Clinic change over time, with and without smoothing
-  ggplot(data = cds, aes(x = ymd, y = corr_with_first, group=Clinic)) + geom_line(aes(color=Clinic))
-  ggplot(data = cds, aes(x = ymd, y = rollmean(corr_with_first,5,na.pad=TRUE), group=Clinic)) + geom_line(aes(color=Clinic))
+  ggplot(data = cds, aes(x = ymd, y = Dist_from_reference, group=Clinic)) + geom_line(aes(color=Clinic))
+  ggplot(data = cds, aes(x = ymd, y = rollmean(Dist_from_reference,5,na.pad=TRUE), group=Clinic)) + geom_line(aes(color=Clinic))
+
+  # Role change over time
+  ggplot(data = cds, aes(x = ymd, y = Dist_from_reference, group=Role)) + geom_line(aes(color=Role))
 
 }
 
From 71c25050dba090b8963ed097572c30c642528585 Mon Sep 17 00:00:00 2001
From: Brian Pentland
Date: Sun, 25 Nov 2018 13:26:34 -0500
Subject: [PATCH 14/31] making some small changes and comments

---
 Data Prep/ACHR_Clinic_Trajectories_v3.R | 33 ++++++-------------------
 1 file changed, 8 insertions(+), 25 deletions(-)

diff --git a/Data Prep/ACHR_Clinic_Trajectories_v3.R b/Data Prep/ACHR_Clinic_Trajectories_v3.R
index 5553868..e203b73 100644
--- a/Data Prep/ACHR_Clinic_Trajectories_v3.R
+++ b/Data Prep/ACHR_Clinic_Trajectories_v3.R
@@ -8,6 +8,11 @@
 
 # these functions will take clinic data and produce trajectories (distance of the graph from a reference graph)
 
+# need to rethink how this selects occurrences in buckets.
+# Should buckets always be by clinic_day?? This allows us to track what happens in each clinic over time.
+
+# We want to track roles over time in each clinic. So it's Clinic_day and also visit_role. We have threads by visit_role
+
 # start with auditfinal_10022018.csv --> use ACHR_Batch_v2 to make new_occ.
 # The code here starts with new_occ (~ 7.72m occurrences that we break down into 1857 clinic_days and 57835 visits)
@@ -84,7 +89,8 @@ graph_trajectory_filtered <- function(e, bucketCF, cf, n_gram_size=2, reference
   bcount= bcount +1
   # print(paste('b =',b))
 
-  # get text vector for the whole data set. Bucket needs at least 2 threads
+  # THIS CHECK NEEDS TO COVER WHOLE CHUNK OF CODE...
+  # get text vector for the whole data set. Bucket needs at least 2 threads
   # n_gram_size = 1 for 1-grams, n_gram_size = 2 for pairs.
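   # (One way to make the check cover the whole chunk, per the note above --
   #  a sketch only, where everything that consumes ngdf moves inside the branch:
   #    th = get_bucket(e, b)
   #    if (nrow(th) > 2) {
   #      ngdf  = count_ngrams(th, 'threadNum', cf, n_gram_size)[1:2]
   #      nodes = length(unique(th[[cf]]))
   #    }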
th = get_bucket(e, b) if (nrow(th)>2) { ngdf = count_ngrams(th, 'threadNum', cf, n_gram_size)[1:2] } @@ -155,33 +161,10 @@ graph_trajectory_filtered <- function(e, bucketCF, cf, n_gram_size=2, reference ) }))) - # correlate each row with the first one stick it in a dataframe - # df =data.frame(window=1:(nWindows-1), - # Clinic_ymd = blist[1:(nWindows-1)], - # Clinic= unlist(lapply(1:(nWindows-1), - # function(i){ unlist(strsplit(blist[i],'_'))[1] })), - # ymd=unlist(lapply(1:(nWindows-1), - # function(i){ unlist(strsplit(blist[i],'_'))[2] })), - # pct_retained = unlist(lapply(1:(nWindows-1), - # function(i){sum((windowFreqMatrix[i,]>0)+(windowFreqMatrix[i+1,]>0)==2)/ - # sum(windowFreqMatrix[i,]>0) - # })) , - # pct_possible = unlist(lapply(1:(nWindows-1), - # function(i){sum(windowFreqMatrix[i,] > 0)/Max_Order })) , - # complexity = unlist(lapply(1:(nWindows-1), - # function(i){ estimate_task_complexity_index( nodes ,sum(windowFreqMatrix[i,] > 0) ) })) , - # Dist_from_reference= unlist(lapply(1:(nWindows-1), - # function(i){distance(rbind(windowFreqMatrix[i,],windowFreqMatrix[reference_day,]), - # method='cosine' ) })), - # Dist_from_next= unlist(lapply(1:(nWindows-1), - # function(i){distance(rbind(windowFreqMatrix[i,],windowFreqMatrix[i+1,]), - # method='cosine' ) })) - # ) + ##### maybe use merge to get other variables included? Or do that outside the function... save(df, file=paste0(save_file_name,'.rData') ) - - return(df) # get the ngram data and labels From 250423c29658d4ac44cf62ac479011cef536f502 Mon Sep 17 00:00:00 2001 From: Brian Pentland Date: Sun, 25 Nov 2018 13:27:02 -0500 Subject: [PATCH 15/31] updated ignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 7cc7327..998975a 100644 --- a/.gitignore +++ b/.gitignore @@ -33,3 +33,4 @@ Threads+threadNum+Action_Role.Rdata emrt+Visit_ID+Role_Action_Workstation.Rdata emr+Visit_ID.Rdata Thrds+threadNum+Action_Role_Workstation.Rdata +Nov 25 am workspace.RData From be2c552838c0771409e4726b07839b006a58303a Mon Sep 17 00:00:00 2001 From: Brian Pentland Date: Sat, 1 Dec 2018 00:51:28 -0500 Subject: [PATCH 16/31] lots of little changes... not done yet. 
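
One of the little changes is switching the bucket subsets to keyed
data.table lookups. A toy sketch of that idiom (setkeyv() indexes the table
once, and get(TN) lets the thread column name live in a variable):

    library(data.table)
    occ <- data.table(Visit_ID = c('a','a','b'), Action = c('x','y','z'))
    TN  <- 'Visit_ID'
    setkeyv(occ, TN)       # sort and index by the thread column once
    occ[get(TN) == 'a']    # same rows as occ[Visit_ID == 'a'], found fast
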
---
 Data Prep/ACHR_Batch_Aggregate_Data.R   | 190 +++++++++++++++---------
 Data Prep/ACHR_Batch_PreProcess.R       |  61 ++++----
 Data Prep/ACHR_Clinic_Trajectories_v3.R |  66 ++++----
 Data Prep/Comparison_functions.R        |  84 -----------
 Stats/precedence_functions.R            |  38 +++++
 5 files changed, 221 insertions(+), 218 deletions(-)
 create mode 100644 Stats/precedence_functions.R

diff --git a/Data Prep/ACHR_Batch_Aggregate_Data.R b/Data Prep/ACHR_Batch_Aggregate_Data.R
index bad8ae5..cfc7ea5 100644
--- a/Data Prep/ACHR_Batch_Aggregate_Data.R
+++ b/Data Prep/ACHR_Batch_Aggregate_Data.R
@@ -15,29 +15,19 @@
 # 2) By Clinic_day

-
-#############################################################################################
-# Functions to make buckets
-# Each bucket is a list of thread numbers that can be used to subset the list of occurrences
-# Need to be careful how you call this because it can aggregate threads in unexpected ways
-# For example, if you want Visit_ID_Role, use criteria = threadNum on the appropriately threaded input
-make_buckets_1 <- function(o, criteria){
-
- return( unique(o[[criteria]]) )
-
-}
-
-
 ###################################################################
 ###################################################################
 # This function is set up to aggregate the occurrences within threads -- typically visits
 # occ = pre-processed threaded occurrences
 # TN = threadNum in most cases
-# CFs can be chosen -- they don't have to match the POV
+# CFs can be chosen
+
+# event_CFs define changes within threads
+# ALL_CFs are used to compute the CF_alignment
+
 # need to bring Role_ID forward when looking at Visit_Role, so we can track residents over time.

-ACHR_batch_threads <- function(occ,TN, EVENT_CFs, ALL_CFs) {
+ACHR_batch_threads <- function(occ,THREAD_CFs, EVENT_CFs, ALL_CFs) {

 library(tidyr)
 library(dplyr)
@@ -48,28 +38,37 @@ ACHR_batch_threads <- function(occ,TN, EVENT_CFs, ALL_CFs) {
 library(data.table)

 # Add columns for combinations of CFs if needed
+ thread_col = newColName(THREAD_CFs)
+ TN = thread_col
+ if (!(thread_col %in% colnames(occ))) { occ = combineContextFactors(occ,THREAD_CFs,thread_col) }
+
+
 new_event_col = newColName(EVENT_CFs)
 if (!(new_event_col %in% colnames(occ))) { occ = combineContextFactors(occ,EVENT_CFs,new_event_col) }

 all_cf_col = newColName(ALL_CFs)
 if (!(all_cf_col %in% colnames(occ))) { occ = combineContextFactors(occ,ALL_CFs,all_cf_col) }

+ # set key on the data.table
+ setkeyv(occ, TN)
+
 # get the list of buckets
- bucket_list <- make_buckets_1(occ, TN)
+ bucket_list <- unique(occ[[TN]])

 # print the number of buckets
 print(paste0('Number of buckets=', length(bucket_list) ))

 # make data frame with results
- Thrds = data.frame( t(sapply( bucket_list,
+ Thrds = data.table( t(sapply( bucket_list,
 function(b){

 # select a subset
- df = occ[ occ[[TN]] ==b , ]
+ df= occ[get(TN)==b]
+
+ # make sure it is sorted by timestamp
+ df=df[order(df$tStamp),]

- # Only run for visits with more than two occurrences
- # get the network -- only if there are enough rows...
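
# Sketch of the setkeyv()/get() pattern above, on a toy table: setkeyv() sorts and
# indexes the table by the thread column, so each per-bucket subset is a keyed lookup
# rather than a full scan, and get(TN) resolves the column name held in the variable TN.
library(data.table)
occ_toy <- data.table(threadNum = c(1, 1, 2, 2, 2), Action = c('a', 'b', 'a', 'c', 'b'))
setkeyv(occ_toy, 'threadNum')
TN <- 'threadNum'
occ_toy[get(TN) == 2]   # the three rows of thread 2
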
if (nrow(df)>2) n = threads_to_network_original(df,TN, new_event_col) @@ -78,24 +77,25 @@ ACHR_batch_threads <- function(occ,TN, EVENT_CFs, ALL_CFs) { # compute each parameter and put them in a vector c( bucket=b, - Clinic = df[1,'Clinic'], - Physician = as.character(df[1,'Physician']), - ymd = df[1,'ymd'], - Clinic_ymd = df[1,'Clinic_ymd'], - Weekday = df[1,'Weekday'], - Month = df[1,'Month'], + Clinic = as.character(df$Clinic[1]), + Physician = as.character(df$Physician[1]), + ymd = as.character(df$ymd[1]), + Clinic_ymd = as.character(df$Clinic_ymd[1]), + Weekday = as.character(df$Weekday[1]), + Month = as.character(df$Month[1]), Phase = compute_phase(df$tStamp[1]), - threadNum = df[1,'threadNum'], - Visit_ID = df[1,'Visit_ID'], - Subject_ID = df[1,'Subject_ID'], - Role_ID = as.character( df[1,'Role_ID'] ), + threadNum = as.numeric(df$threadNum[1]), + Visit_ID = as.character(df$Visit_ID[1]), + Subject_ID = as.character( df$Subject_ID[1] ), + Role_ID = as.character( df$Role_ID[1] ), NEvents = nrow(df), - ThreadStart= as.character( df[1,'tStamp'] ), - ThreadStartInt = df[1,'tStamp'], - ThreadDuration= difftime(max(lubridate::ymd_hms(df$tStamp)), min(lubridate::ymd_hms(df$tStamp)), units='hours' ), - wait_time = compute_wait_time(df), - # Visit_number = integer(N), - # LOC_CPT = character(N), + ThreadStart= as.character(df$tStamp[1] ), + ThreadDuration= compute_thread_duration(df), + VisitDuration= compute_visit_duration(df), + wait_time1 = compute_wait_time1(df), + wait_time2 = compute_wait_time2(df), + Visit_number = as.numeric(df[1,'Visitnum_Total']), + LOC_CPT = as.character(df[1,'LOS_CPT']), NetComplexity=estimate_network_complexity( n ), Nodes=nrow(n$nodeDF), Edges=nrow(n$edgeDF), @@ -105,7 +105,7 @@ ACHR_batch_threads <- function(occ,TN, EVENT_CFs, ALL_CFs) { NumDiagnoses = count_diagnoses(df$Diag[1]), Proc = as.character(df[1,'Proc']), Diagnosis = as.character(df[1,'Diag']), - Diagnosis_group = df[1,'Diagnosis_Group'], + Diagnosis_group = as.character(df[1,'Diagnosis_Group']), CF_Alignment = 1, # make placeholder, but compute below ALL_CF_count = length(unique(df[[all_cf_col]])), ALL_CF_entropy = compute_entropy(table(df[[all_cf_col]])[table(df[[all_cf_col]])>0]), @@ -136,7 +136,10 @@ ACHR_batch_threads <- function(occ,TN, EVENT_CFs, ALL_CFs) { # Compute the alignment of the context factors Thrds$CF_Alignment = as.numeric( as.character(Thrds$Action_count)) / as.numeric( as.character(Thrds$ALL_CF_count )) - save(Thrds, file=paste0(paste('Thrds',TN,new_event_col,sep='+'), '.Rdata')) + save_file_name = paste0(paste('Thrds',TN,new_event_col,sep='+'), '.Rdata') + save(Thrds, file=save_file_name) + + print(paste('Saved ', nrow(Thrds), " records in ",save_file_name)) return(Thrds) @@ -162,34 +165,88 @@ compute_phase <- function(t){ # else return NA return(NA) +} +compute_thread_duration <- function(df){ + + return(difftime(lubridate::ymd_hms(df$tStamp[nrow(df)]), lubridate::ymd_hms(df$tStamp[1]), units='hours' )) + } -# alignment is a measure of how much the number of nodes is increased by adding context factors -# always base this on the "Action" -- how many more "actions" are there? 
-# could also do this based on entropy, or write the function to use the values that were already computed
-# compute_alignment <- function(df,TN, EVENT_CFs, ALL_CFs ){
-#
-# # first get numerator
-# n= length(unique(df$Action))
-#
-# # Now get the total number with all of the CFs
-# m= length(unique( df[[newColName(EVENT_CFs)]]))
-#
-# return(n/m)
-# }
-
+compute_visit_duration <- function(df){
+
+ # get the checkin time. Note that they sometimes look at patient data in advance of the visit, so you
+ # cannot use the first occurrences to mark the start of the thread
+ w1= grep('CHECKIN_TIME',df$Action)[1]
+ w2= grep('AVS_PRINT_TIME',df$Action)[1]
+
+ return(difftime(lubridate::ymd_hms(df$tStamp[w2]), lubridate::ymd_hms(df$tStamp[w1]), units='hours' ))
+
+}

-# NEEDS WORK!! Far too simplistic.
-compute_wait_time <- function(df){
+# First wait time... from checkin until they get their vitals taken
+compute_wait_time1 <- function(df){

- # find the first occurrence of the chief complaint. Seems to correspond closely to wait time
- # Make sure it's not in the first 5 occurrences, and make sure to return a value of at least 1 if it never occurs
- w= grep('MR_VN_CHIEF_COMPLAINT',df$Action)[1]
+ # find the first occurrence of MR_VN_VITALS. Corresponds closely to wait time
+ # w= grep('MR_VN_VITALS',df$Action)[1]
+ w= grep('VITALS',df$Action)[1]

 return( difftime( lubridate::ymd_hms(df$tStamp[w]),lubridate::ymd_hms(df$tStamp[1]), units='hours' ) )

 }

+# Second wait time... from vitals until they see a doctor or other medical person
+compute_wait_time2 <- function(df){
+
+ # find the first occurrence of MR_VN_VITALS. Get the time t1
+ # w1= grep('MR_VN_VITALS',df$Action)[1]
+ w1= grep('VITALS',df$Action)[1]
+
+ if (is.na(w1))
+ return(NA)
+ else
+ {
+ t1 = lubridate::ymd_hms(df$tStamp[w1])
+
+
+ # get the workstation and role used for the vitals at t1
+ work1 = df$Workstation[w1]
+ role1 = df$Role[w1]
+
+ # get set of actions at the same workstation or different workstations that happen later in the visit
+ # Truncate the visit up to w1
+ restdf = df[w1:nrow(df),]
+
+ # then get the set of actions at the same workstation or different workstations (in that visit)
+ sameWS = restdf[restdf$Workstation == work1,]
+ differentWS = restdf[restdf$Workstation != work1,]
+
+ # get list of other roles
+ otherRoles = paste(setdiff(unique(restdf$Role),c('Technician','Technologist','Staff','Unknown')),
+ collapse = '|')
+
+ # Now find the marker for when they see a physician
+ # procedure is different in BRKPT and HHPOB clinics
+ if (df$Clinic[1] %in% c('BRKPT','HHPOB') )
+ { # LPN brings a different workstation on wheels at these clinics
+ firstLPN = grep('LPN',differentWS$Role)[1]
+ t2 = lubridate::ymd_hms(differentWS$tStamp[firstLPN])
+
+ # Other roles use the same workstation
+ firstDoc = grep('Physician|Resident|Registered', sameWS$Role)[1]
+ t2 = lubridate::ymd_hms(sameWS$tStamp[firstDoc])
+ }
+ else
+ { # a different role, using the same workstation
+ w2 = max(w1, grep(otherRoles, sameWS$Role) )[1]
+ t2 = lubridate::ymd_hms(sameWS$tStamp[w2])
+ }
+
+ # return the difference in time from t1 to t2
+ return( difftime( t2, t1, units='hours' ) )
+ }
+}
+
+
 # THIS ONE IS GOOD.
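
# Toy walk-through of the wait-time logic above (rows are hypothetical; real Action
# codes come from the EMR audit log): the second wait runs from the vitals event to
# the first subsequent event by a different role.
library(lubridate)
df_toy <- data.frame(
  tStamp = c('2017-01-05 08:00:00', '2017-01-05 08:20:00', '2017-01-05 08:45:00'),
  Action = c('CHECKIN_TIME', 'MR_VN_VITALS', 'AC_VISIT_NAVIGATOR'),
  Role   = c('Staff', 'LPN', 'Physician'), stringsAsFactors = FALSE)
w1 <- grep('VITALS', df_toy$Action)[1]      # index of the vitals event
t1 <- ymd_hms(df_toy$tStamp[w1])
t2 <- ymd_hms(df_toy$tStamp[3])             # first action by a role other than the LPN
difftime(t2, t1, units = 'hours')           # 0.4167 hours, i.e. a 25-minute wait
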
 count_procedures <- function( p ){

@@ -228,10 +285,7 @@ count_diagnoses <- function(d){
 # This function is set up to aggregate the occurrences among collections of visits -- typically clinic_days
 # occ = pre-processed threaded occurrences
 # TN = threadNum in most cases
-# CFs can be chosen -- they don't have to match the POV
-
-# need to bring Role_ID forward when looking at Visit_Role, so we can track residents over time.
-
+# CFs can be chosen

 ACHR_batch_clinic_days <- function(occ,TN='Clinic_ymd', EVENT_CFs, ALL_CFs) {

 library(tidyr)
@@ -242,7 +296,7 @@ ACHR_batch_clinic_days <- function(occ,TN='Clinic_ymd', EVENT_CFs, ALL_CFs) {
 library(stringr)
 library(data.table)

- # make a list of unique buckets. Typical example
+ # make a list of unique buckets.
 bucket_list <- unique(occ[[TN]])

 # print the number of buckets
@@ -257,7 +311,7 @@ ACHR_batch_clinic_days <- function(occ,TN='Clinic_ymd', EVENT_CFs, ALL_CFs) {

 # make data frame with results
- cds = data.frame( t(sapply( bucket_list,
+ cds = data.table( t(sapply( bucket_list,
 function(b){

 # select a subset of occurrences for the bucket
@@ -641,13 +695,3 @@ ACHR_batch_clinic_days <- function(occ,TN='Clinic_ymd', EVENT_CFs, ALL_CFs) {
 # return(ACHR)
 # }

-#
-# make_box_plots <- function(){
-# ggboxplot(ACHR_test[NEvents>100 & Clinic=='DRH'], x = "VisitMonth", y = "NetComplexity",
-# color = "VisitDay",
-# ylab = "Complexity", xlab = "Month (DRH)")
-# }
-#
-
-
-
diff --git a/Data Prep/ACHR_Batch_PreProcess.R b/Data Prep/ACHR_Batch_PreProcess.R
index a639574..27db4af 100644
--- a/Data Prep/ACHR_Batch_PreProcess.R
+++ b/Data Prep/ACHR_Batch_PreProcess.R
@@ -13,19 +13,28 @@
 # 2) Threading the data by adding ThreadNum and SeqNum to the threads #
 ##########################################################################################################

-
+library(tidyr)
+library(data.table)
+library(dplyr)
+library(ThreadNet)
+library(ngram)
+library(lubridate)

 redo_ACHR_data_from_scratch <- function(fname){

+ fname = 'audit_111818'
+
 # Pick your point of view, or multiple POV...
- THREAD_CF = c('VISIT_ID')
+ THREAD_CF = c('Visit_ID')
 EVENT_CF = c('Action','Role','Workstation')
+ ALL_CF = c('Action','Role','Workstation')

 # first read the data
 o = read_ACHR_data( fname )

- # Thread occurrences for each POV
- occ = thread_occurrences( o, THREAD_CF, EVENT_CF ,fname )
+ # thread_occurrences adds threadNum and seqNum to each thread, as defined by the thread_CF.
+ # Threads are always WITHIN VISITS in this code -- so they can be whole visits or chunks of visits (e.g., visit_ID_Role)
+ occ = thread_occurrences( o, THREAD_CF ,fname )

 # Now aggregate by clinic-day for each POV
 clinic_days = ACHR_batch_clinic_days(occ, THREAD_CF, EVENT_CF, fname)
@@ -33,6 +42,9 @@ redo_ACHR_data_from_scratch <- function(fname){
 # Aggregate by visit for each POV
 visits = ACHR_batch_visits(occ, THREAD_CF, EVENT_CF, fname)

+ # compute trajectories to see difference from a reference graph
+ traj = graph_trajectory( occ, 'Clinic_ymd', EVENT_CF, 2, 1, 0, 'Clinic_trajectory')
+
 }

 # this function reads the raw data from URMC
 #
 # Tested Nov 16.
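
# Hedged illustration of the POV idea: combineContextFactors() (a ThreadNet function)
# builds the composite event codes used above. Its output is presumably the
# underscore-joined form shown below, since combined columns are referenced elsewhere
# in this repo by names like Action_Role_Workstation.
ev <- data.frame(Action = c('MR_VN_VITALS', 'AC_VISIT_NAVIGATOR'),
                 Role = c('LPN', 'Physician'),
                 Workstation = c('WS01', 'WS02'))
paste(ev$Action, ev$Role, ev$Workstation, sep = '_')
# "MR_VN_VITALS_LPN_WS01" "AC_VISIT_NAVIGATOR_Physician_WS02"
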
read_ACHR_data <- function(fname){ - library(tidyr) - library(data.table) - library(dplyr) - library(ThreadNet) - library(ngram) - library(lubridate) - library(anytime) + # This code is tailored for reading in new data from URMC, October 2018 # Assumes there is a column called 'Timestamps" in column 13 and "Visit_ID" - # read the file into data frame + # read the file into data.table d <- fread( paste0(fname, '.csv') ) - # Sort by visit and timestamps + # Sort by visit and timestamps (will do this later with setkey) d <- arrange(d,desc(Visit_ID,asc(Timestamps))) - - ################################################################################## + # This stuff is URMC specific # # move and rename the columns, as needed @@ -68,9 +73,13 @@ read_ACHR_data <- function(fname){ setnames(d,'Timestamps','tStamp') setnames(d,'V1','seqn') - # This converts numbers to char and replaces spaces with underscore - d = cleanOccBatch(d) + # Fix the name of the Highland clinic + d$Clinic = gsub('HH POB','HHPOB',d$Clinic) + + # This converts numbers to char and replaces spaces with underscore + d = cleanOccBatch(d) + ### This is URMC specific: Add the clinic and clinic_day and fix the code for the highland clinic # get the date only from the timestamp @@ -79,11 +88,11 @@ read_ACHR_data <- function(fname){ # make new columns for clinic + day and events d = unite(d, 'Clinic_ymd', c('Clinic','ymd'),sep='_',remove = 'false') - # Fix the name of the Highland clinic - d$Clinic = gsub('HH_POB','HHPOB',d$Clinic) # Save the result - save(d, file=paste0(fname, '.rData')) + save_file_name = paste0(fname, '.rData') + save(d, file=save_file_name) + print(paste("Saved ",nrow(d), " occurrences in ",save_file_name)) return(d) @@ -101,7 +110,7 @@ cleanOccBatch <- function(fileRows){ cleanedCF <- data.frame(lapply(fileRows[2:ncol(fileRows)], function(x){ gsub(" ","_",x) }) ) # bind tStamp back to cleaned data - complete <- cbind(tStamp,cleanedCF) + complete <- as.data.table(cbind(tStamp,cleanedCF)) # Old code for forcing tStamp into a "YMD_HMS" format -- not needed for URMC EMR data # complete$tStamp <- as.character(complete$tStamp) @@ -155,16 +164,18 @@ thread_occurrences <- function(occ, THREAD_CF, fname='emr'){ # sapply(seq(length(idx_list)), function(x) for (x in seq(length(idx_list))) { if (x %% 1000 ==0) print(paste0('Thread count = ',x)) - occ[start_idx[x]:end_idx[x],'threadNum'] = x - occ[start_idx[x]:end_idx[x],'seqNum'] = c(1:(end_idx[x]-start_idx[x]+1)) + occ[start_idx[x]:end_idx[x],'threadNum' := x ] + occ[start_idx[x]:end_idx[x],'seqNum':= c(1:(end_idx[x]-start_idx[x]+1)) ] } # Now save the result for later and return it, as well. 
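
# Small sketch of what cleanOccBatch()'s gsub does, and why the clinic rename above
# now matches 'HH POB' with a blank: the rename runs before the space-to-underscore
# cleaning, so at that point the raw value still contains a space.
x <- c('HH POB', 'Registered Nurse')
gsub(' ', '_', x)   # "HH_POB" "Registered_Nurse" -- every value becomes a single token
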
 # save(occ, file=paste0(paste(fname,new_thread_col,new_event_col,sep='+'), '.Rdata'))
- save(occ, file=paste0(paste(fname,new_thread_col,sep='+'), '.Rdata'))
- print(paste('Saved threaded occurrences: ', nrow(occ)))
+ save_file_name = paste0(paste0(paste(fname,new_thread_col,sep='+'), '.rData'))
+ save(occ, file=save_file_name)
+ print(paste('Saved threaded occurrences: ', nrow(occ), " in ",save_file_name))
+
 return(occ)

 }
diff --git a/Data Prep/ACHR_Clinic_Trajectories_v3.R b/Data Prep/ACHR_Clinic_Trajectories_v3.R
index e203b73..5093f1e 100644
--- a/Data Prep/ACHR_Clinic_Trajectories_v3.R
+++ b/Data Prep/ACHR_Clinic_Trajectories_v3.R
@@ -31,7 +31,7 @@ library(zoo)

 # Use this to put the trajectory data with the clinic_day data
 merge_clinic_days_and_trajectories <- function(){
- clinic_days$Clinic_date = gsub('HH_POB','HHPOB',clinic_days$Clinic_date)
+ # clinic_days$Clinic_date = gsub('HH_POB','HHPOB',clinic_days$Clinic_date)
 cds=merge(x=test0,y=clinic_days,by.x='Clinic_ymd',by.y ='Clinic_date' ,all=TRUE)
 }
@@ -54,34 +54,28 @@ get_bucket <- function(o, b ){

 # @param e data frame for POV
-# @param bucketCF is the column used to define the bucket list. Each bucket is a "window"
+# @param bucket_CFs is the column used to define the bucket list, or the combination of columns needed to create the bucket. Each bucket is a "window"
 # @param cf is the column that defines events (e.g. 'Action_Role')
 # @param filter list contains the threshold level for filtering out edges with low frequency.
 #
-# need to make sure this works with roles over time...
+# MAKE SURE YOU ARE READING IN A SET OF OCCURRENCES THAT HAS THE CORRECT THREADS...
 #
-graph_trajectory_filtered <- function(e, bucketCF, cf, n_gram_size=2, reference_day=1, filter_list=0,save_file_name) {
+graph_trajectory <- function(e, bucket_CFs, cf, n_gram_size=2, reference_day=1, filter_threshold=0,keep_ngram_vectors=FALSE, save_file_name='deleteme') {

- # get total size of possible matrix
- Max_Order = length(unique(e[[cf]]))^2
-
-
- # make data frame for results - bid is the bucket_ID
- vt=data.frame( ngrams=character(), freq=integer(), bid=integer() )
-
- # here is the list of buckets
- blist = unique(e[[bucketCF]])
-
- # take the underscore out so we can use it to strsplit below... and sort it
- blist=gsub('HH_POB','HHPOB',blist)
- blist = sort(blist)
+ # add column for bucket_CF if needed
+ bucket_col = newColName(bucket_CFs)
+ if (!(bucket_col %in% colnames(e))) { e = combineContextFactors(e,bucket_CFs,bucket_col) }

+ # here is the sorted list of buckets
+ blist = sort(unique(e[[bucket_col]]))

+ # how many buckets?
 nb = length(blist)
- print(blist)
- print(paste('nb =',nb))
+ # print(blist)
+ print(paste('Number of buckets =',nb))

- complexity_idx=list(nb)
+ # make data frame for results
+ vt=data.frame( ngrams=character(), freq=integer(), bid=integer(),bname=character() )

 bcount=0
 # scan through the data
@@ -92,8 +86,8 @@ graph_trajectory_filtered <- function(e, bucketCF, cf, n_gram_size=2, reference

 # THIS CHECK NEEDS TO COVER WHOLE CHUNK OF CODE...
 # get text vector for the whole data set. Bucket needs at least 2 threads
 # n_gram_size = 1 for 1-grams, n_gram_size = 2 for pairs.
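
# Hedged picture of the 2-grams that count_ngrams() tallies per bucket (count_ngrams
# itself comes from ThreadNet; this just shows the bigram vocabulary for one toy thread):
thread <- c('CHECKIN', 'VITALS', 'EXAM', 'CHECKOUT')
paste(head(thread, -1), tail(thread, -1))
# "CHECKIN VITALS" "VITALS EXAM" "EXAM CHECKOUT" -- three transitions from four events
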
- th = get_bucket(e, b) - if (nrow(th)>2) { ngdf = count_ngrams(th, 'threadNum', cf, n_gram_size)[1:2] } + th = e[ e[[bucket_col]] ==b , ] + if (nrow(th)>3) { ngdf = count_ngrams(th, 'threadNum', cf, n_gram_size)[1:2] } nodes = length(unique(th[[cf]])) @@ -103,13 +97,12 @@ graph_trajectory_filtered <- function(e, bucketCF, cf, n_gram_size=2, reference # add the bucket number and name ngdf$bid = bcount - # ngdf$bname = b + ngdf$bname = b # append the columns to the end # vt is the whole set of all ngrams in all the windows vt=rbind(vt,ngdf) } - print(head(complexity_idx)) # convert to factor @@ -145,7 +138,7 @@ graph_trajectory_filtered <- function(e, bucketCF, cf, n_gram_size=2, reference # fm = windowFreqMatrix/max(windowFreqMatrix) # # # if the number if below the threshold, set it to zero - # windowFreqMatrix[fm<=f] = 0 + # windowFreqMatrix[fm<=filter_threshold] = 0 df = data.frame( t(sapply( 1:(nWindows-1), function(i){ @@ -154,25 +147,26 @@ graph_trajectory_filtered <- function(e, bucketCF, cf, n_gram_size=2, reference Clinic = unlist(strsplit(blist[i],'_'))[1], ymd = unlist(strsplit(blist[i],'_'))[2], pct_retained = sum((windowFreqMatrix[i,]>0)+(windowFreqMatrix[i+1,]>0)==2)/sum(windowFreqMatrix[i,]>0), - pct_possible = sum(windowFreqMatrix[i,] > 0)/Max_Order, complexity = estimate_task_complexity_index( nodes ,sum(windowFreqMatrix[i,] > 0) ), Dist_from_reference = distance(rbind(windowFreqMatrix[i,],windowFreqMatrix[reference_day,]), method='cosine' ), Dist_from_next = distance(rbind(windowFreqMatrix[i,],windowFreqMatrix[i+1,]), method='cosine' ) ) }))) - ##### maybe use merge to get other variables included? Or do that outside the function... - + + # get the ngram data and labels + if (keep_ngram_vectors) { + b_df=as.data.frame(windowFreqMatrix[1:(nWindows-1),]) + colnames(b_df)=vt_unique$ngrams + + # stick the ngram frequencies on the end for good measure + df = cbind(df,b_df) + } + + # save the result and return it, too save(df, file=paste0(save_file_name,'.rData') ) - return(df) - - # get the ngram data and labels - # b_df=as.data.frame(windowFreqMatrix[1:(nWindows-1),]) - # colnames(b_df)=vt_unique$ngrams - # - # # stick the ngram frequencies on the end for good measure - # return(cbind(df,b_df)) + return(df) } diff --git a/Data Prep/Comparison_functions.R b/Data Prep/Comparison_functions.R index ba6c6d0..7d94256 100644 --- a/Data Prep/Comparison_functions.R +++ b/Data Prep/Comparison_functions.R @@ -9,91 +9,7 @@ library(dplyr) -get_date_subsets <-function(c) { -b1 <<- new_occ_VR %>% filter(Clinic==eval(c) & as.Date(tStamp) > '2016-01-01' & as.Date(tStamp) < '2016-06-06') -b2 <<- new_occ_VR %>% filter(Clinic==eval(c) & as.Date(tStamp) > '2016-06-8' & as.Date(tStamp) < '2016-09-01') -b3 <<- new_occ_VR %>% filter(Clinic==eval(c) & as.Date(tStamp) > '2016-09-02' & as.Date(tStamp) < '2017-04-15') -b4 <<- new_occ_VR %>% filter(Clinic==eval(c) & as.Date(tStamp) > '2017-04-15' & as.Date(tStamp) < '2017-09-01') -b5 <<- new_occ_VR %>% filter(Clinic==eval(c) & as.Date(tStamp) > '2017-09-02' & as.Date(tStamp) < '2017-12-31') -rc1 <<- new_occ_VR %>% filter(Clinic=='REDCK' & as.Date(tStamp) > '2016-01-01' & as.Date(tStamp) < '2016-06-06') -rc2 <<- new_occ_VR %>% filter(Clinic=='REDCK' & as.Date(tStamp) > '2016-06-8' & as.Date(tStamp) < '2016-09-01') -rc3 <<- new_occ_VR %>% filter(Clinic=='REDCK' & as.Date(tStamp) > '2016-09-02' & as.Date(tStamp) < '2017-04-15') -rc4 <<- new_occ_VR %>% filter(Clinic=='REDCK' & as.Date(tStamp) > '2017-04-15' & as.Date(tStamp) < '2017-09-01') -rc5 <<- 
new_occ_VR %>% filter(Clinic=='REDCK' & as.Date(tStamp) > '2017-09-02' & as.Date(tStamp) < '2017-12-31') - -smh1 <<- new_occ_VR %>% filter(Clinic=='SMH' & as.Date(tStamp) > '2016-01-01' & as.Date(tStamp) < '2016-06-06') -smh2 <<- new_occ_VR %>% filter(Clinic=='SMH' & as.Date(tStamp) > '2016-06-8' & as.Date(tStamp) < '2016-09-01') -smh3 <<- new_occ_VR %>% filter(Clinic=='SMH' & as.Date(tStamp) > '2016-09-02' & as.Date(tStamp) < '2017-04-15') -smh4 <<- new_occ_VR %>% filter(Clinic=='SMH' & as.Date(tStamp) > '2017-04-15' & as.Date(tStamp) < '2017-09-01') -smh5 <<- new_occ_VR %>% filter(Clinic=='SMH' & as.Date(tStamp) > '2017-09-02' & as.Date(tStamp) < '2017-12-31') - -hh1 <<- new_occ_VR %>% filter(Clinic=='HHPOB' & as.Date(tStamp) > '2016-01-01' & as.Date(tStamp) < '2016-06-06') -hh2 <<- new_occ_VR %>% filter(Clinic=='HHPOB' & as.Date(tStamp) > '2016-06-8' & as.Date(tStamp) < '2016-09-01') -hh3 <<- new_occ_VR %>% filter(Clinic=='HHPOB' & as.Date(tStamp) > '2016-09-02' & as.Date(tStamp) < '2017-04-15') -hh4 <<- new_occ_VR %>% filter(Clinic=='HHPOB' & as.Date(tStamp) > '2017-04-15' & as.Date(tStamp) < '2017-09-01') -hh5 <<- new_occ_VR %>% filter(Clinic=='HHPOB' & as.Date(tStamp) > '2017-09-02' & as.Date(tStamp) < '2017-12-31') - -} - -# c is for the clinic -# m is the metric -compare_periods <-function(c,m){ - - # b1 <- new_occ_VR %>% filter(Clinic==c & as.Date(tStamp) > '2016-01-01' & as.Date(tStamp) < '2016-06-06') - # b2 <- new_occ_VR %>% filter(Clinic==c & as.Date(tStamp) > '2016-06-8' & as.Date(tStamp) < '2016-09-01') - # b3 <- new_occ_VR %>% filter(Clinic==c & as.Date(tStamp) > '2016-09-02' & as.Date(tStamp) < '2017-04-15') - # b4 <- new_occ_VR %>% filter(Clinic==c & as.Date(tStamp) > '2017-04-15' & as.Date(tStamp) < '2017-09-01') - # b5 <- new_occ_VR %>% filter(Clinic==c & as.Date(tStamp) > '2017-09-02' & as.Date(tStamp) < '2017-12-31') - - - a1=routine_metric(b1,'Action',m ) - a2=routine_metric(b2,'Action',m ) - a3=routine_metric(b3,'Action',m ) - a4=routine_metric(b4,'Action',m ) - a5=routine_metric(b5,'Action',m ) - - r1=routine_metric(b1,'Role',m ) - r2=routine_metric(b2,'Role',m ) - r3=routine_metric(b3,'Role',m ) - r4=routine_metric(b4,'Role',m ) - r5=routine_metric(b5,'Role',m ) - - ws1=routine_metric(b1,'Workstation',m ) - ws2=routine_metric(b2,'Workstation',m ) - ws3=routine_metric(b3,'Workstation',m ) - ws4=routine_metric(b4,'Workstation',m ) - ws5=routine_metric(b5,'Workstation',m ) - - rc_a1=routine_metric(rc1,'Action',m ) - rc_a2=routine_metric(rc2,'Action',m ) - rc_a3=routine_metric(rc3,'Action',m ) - rc_a4=routine_metric(rc4,'Action',m ) - rc_a5=routine_metric(rc5,'Action',m ) - - smh_a1=routine_metric(smh1,'Action',m ) - smh_a2=routine_metric(smh2,'Action',m ) - smh_a3=routine_metric(smh3,'Action',m ) - smh_a4=routine_metric(smh4,'Action',m ) - smh_a5=routine_metric(smh5,'Action',m ) - - hh_a1=routine_metric(hh1,'Action',m ) - hh_a2=routine_metric(hh2,'Action',m ) - hh_a3=routine_metric(hh3,'Action',m ) - hh_a4=routine_metric(hh4,'Action',m ) - hh_a5=routine_metric(hh5,'Action',m ) - - lex=data.frame(c(a1,a2,a3,a4,a5), - c(hh_a1,hh_a2,hh_a3,hh_a4,hh_a5), - c(rc_a1,rc_a2,rc_a3,rc_a4,rc_a5), - c(smh_a1,smh_a2,smh_a3,smh_a4,smh_a5) - ) - - ws1=ws1[ws1>0]/nrow(b1) - ws2=ws2[ws2>0]/nrow(b2) - ws3=ws3[ws3>0]/nrow(b3) - -} # o is the list of occurrences diff --git a/Stats/precedence_functions.R b/Stats/precedence_functions.R new file mode 100644 index 0000000..888851e --- /dev/null +++ b/Stats/precedence_functions.R @@ -0,0 +1,38 @@ + +# need to test if this is working correctly 
+# It should be counting the number of times that each element in the lexicon precedes each other element.
+
+# make some data
+S=lapply(1:10,function(x) {c('a','b','a','c',sample(letters[5:6],1+round(runif(1)*10),replace=TRUE)) })
+
+# take list of sequences, S
+precedes <- function(S){
+
+ # Get lexicon
+ lex = sort(unique(unlist(S)))
+
+ # for each element in the lexicon...
+ result = matrix(data = sapply(lex, function(a){
+
+ # Find its position(s) in each sequence. pos is a list of vectors
+ pos=sapply(S, function(s) {grep(a,s)})
+
+ # now for the last position in each string, find the lexical elements that precede it
+ # get the substrings
+ subS = mapply(function(p,s){if (max(p)>1) s[1:max(p)]},pos, S)
+
+ # Now find all of the other lexical elements that precede a in each subS
+ colSums(sapply(lex,function(x){str_detect(subS,x)})) }
+ ) ,
+
+ ncol=length(lex))
+
+ rownames(result) = lex
+ colnames(result) = lex
+
+ # normalize by the number of sequences, so "always" = 1
+ result = result/length(S)
+
+ return(result)
+
+}
\ No newline at end of file

From fd42ea4ac6690c296961d8900f9f197e40931f2c Mon Sep 17 00:00:00 2001
From: Brian Pentland
Date: Sat, 1 Dec 2018 08:02:45 -0500
Subject: [PATCH 17/31] Added comments on next version...

---
 Data Prep/ACHR_Batch_Aggregate_Data.R | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/Data Prep/ACHR_Batch_Aggregate_Data.R b/Data Prep/ACHR_Batch_Aggregate_Data.R
index cfc7ea5..464b2b1 100644
--- a/Data Prep/ACHR_Batch_Aggregate_Data.R
+++ b/Data Prep/ACHR_Batch_Aggregate_Data.R
@@ -14,6 +14,11 @@
 # 1) By thread (usually visit, but could be visit_role)
 # 2) By Clinic_day

+# Rewrite for speed:
+# a) go back to for loop, but just for the stuff that is computed based on the df
+# b) grab first row of each thread seqNum ==1 from the table
+# c) merge them back together
+# d) Probably need to do different versions for visit vs. visit_role because a lot of the parameters make no sense...

From 8f256b05b60d2d27ac664b9427b4addf68a1a4a0 Mon Sep 17 00:00:00 2001
From: Brian Pentland
Date: Sat, 8 Dec 2018 14:37:26 -0500
Subject: [PATCH 18/31] Finished updating code for aggregating. Added code for Team Documenter analysis...

---
 Data Prep/ACHR_Batch_Aggregate_Data.R   | 808 +++++++++++++++++-------
 Data Prep/ACHR_Clinic_Trajectories_v3.R | 154 ++++-
 Tables and Figures/Clinic_day_plots.R   | 110 ++++
 3 files changed, 845 insertions(+), 227 deletions(-)

diff --git a/Data Prep/ACHR_Batch_Aggregate_Data.R b/Data Prep/ACHR_Batch_Aggregate_Data.R
index 464b2b1..4bbd456 100644
--- a/Data Prep/ACHR_Batch_Aggregate_Data.R
+++ b/Data Prep/ACHR_Batch_Aggregate_Data.R
@@ -30,9 +30,8 @@
 # ALL_CFs are used to compute the CF_alignment

-# need to bring Role_ID forward when looking at Visit_Role, so we can track residents over time.
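
# A hand-checkable test sketch for the 'need to test' note above. precedes() relies on
# stringr::str_detect, so stringr must be loaded first. The expected values below come
# from the stated definition, not from running the code.
library(stringr)
S_test <- list(c('a','b'), c('b','a'), c('a','b'))
P <- precedes(S_test)
# By the definition: 'a' precedes the last 'b' in 2 of the 3 sequences, and 'b'
# precedes the last 'a' in 1 of 3, so P['a','b'] should be 2/3 and P['b','a'] 1/3;
# other values would confirm the suspicion that the function still needs work.
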
-ACHR_batch_threads <- function(occ,THREAD_CFs, EVENT_CFs, ALL_CFs) { +ACHR_batch_threads <- function(occ, EVENT_CFs, ALL_CFs) { library(tidyr) library(dplyr) @@ -42,121 +41,325 @@ ACHR_batch_threads <- function(occ,THREAD_CFs, EVENT_CFs, ALL_CFs) { library(stringr) library(data.table) + + # get the first occurrence from each thread -- this will be used for the stuff that never changes, like clinic + firstOcc=occ[occ$seqNum==1,] + + # always used threadNum for the TN + TN = 'threadNum' + # Add columns for combinations of CFs if needed - thread_col = newColName(THREAD_CFs) - TN = thread_col - if (!(thread_col %in% colnames(occ))) { occ = combineContextFactors(occ,THREAD_CFs,thread_col) } - + # don't need to add new column for the thread_CFs. That one has to be here. + # thread_col = newColName(THREAD_CFs) + # TN = thread_col + # if (!(thread_col %in% colnames(occ))) { occ = combineContextFactors(occ,THREAD_CFs,thread_col) } new_event_col = newColName(EVENT_CFs) if (!(new_event_col %in% colnames(occ))) { occ = combineContextFactors(occ,EVENT_CFs,new_event_col) } - + all_cf_col = newColName(ALL_CFs) if (!(all_cf_col %in% colnames(occ))) { occ = combineContextFactors(occ,ALL_CFs,all_cf_col) } - # set key on the data.table + # set key on the data.table for the threadNum + occ=as.data.table(occ) setkeyv(occ, TN) # get the list of buckets - bucket_list <- unique(occ[[TN]]) + bucket_list <- sort( unique(occ[[TN]]) ) - # print the number of buckets - print(paste0('Number of buckets=', length(bucket_list) )) + # get the number of buckets + N = length(bucket_list) + print(paste0('Number of buckets=', N )) + # allocate the data table with N rows, then for speed, we use := to update it. + ACHR = data.table(threadNum = integer(N), + Phase = character(N), + NEvents = integer(N), + ThreadDuration =double(N), + VisitDuration= double(N), + wait_time1 = double(N), + wait_time2 = double(N), + NetComplexity=double(N), + Nodes=double(N), + Edges=double(N), + CompressRatio = double(N), + Entropy = double(N), + NumProcedures = double(N), + NumDiagnoses = double(N), + ALL_CF_count = integer(N), + ALL_CF_entropy = double(N), + CF_Alignment = double(N) ) + + # Now add columns for the CF counts. + for (cf in ALL_CFs){ + ACHR[, paste0(cf,"_count"):= double(N)] + # ACHR[, paste0(cf,"_compression"):= double(N)] + # ACHR[, paste0(cf,"_entropy"):= double(N)] + } - # make data frame with results - Thrds = data.table( t(sapply( bucket_list, - function(b){ - - # select a subset - df= occ[get(TN)==b] - - # make sure it is sorted by timestamp - df=df[order(df$tStamp),] - - # get the network -- only if there are enough rows... 
- - if (nrow(df)>2) n = threads_to_network_original(df,TN, new_event_col) - else n = list(edgeDF=t(c(0)),nodeDF=t(c(0))) - - # compute each parameter and put them in a vector - c( - bucket=b, - Clinic = as.character(df$Clinic[1]), - Physician = as.character(df$Physician[1]), - ymd = as.character(df$ymd[1]), - Clinic_ymd = as.character(df$Clinic_ymd[1]), - Weekday = as.character(df$Weekday[1]), - Month = as.character(df$Month[1]), - Phase = compute_phase(df$tStamp[1]), - threadNum = as.numeric(df$threadNum[1]), - Visit_ID = as.character(df$Visit_ID[1]), - Subject_ID = as.character( df$Subject_ID[1] ), - Role_ID = as.character( df$Role_ID[1] ), - NEvents = nrow(df), - ThreadStart= as.character(df$tStamp[1] ), - ThreadDuration= compute_thread_duration(df), - VisitDuration= compute_visit_duration(df), - wait_time1 = compute_wait_time1(df), - wait_time2 = compute_wait_time2(df), - Visit_number = as.numeric(df[1,'Visitnum_Total']), - LOC_CPT = as.character(df[1,'LOS_CPT']), - NetComplexity=estimate_network_complexity( n ), - Nodes=nrow(n$nodeDF), - Edges=nrow(n$edgeDF), - CompressRatio = compression_index(df,new_event_col), - Entropy = compute_entropy(table(df[[new_event_col]])[table(df[[new_event_col]])>0]), - NumProcedures = count_procedures(df$Proc[1]), - NumDiagnoses = count_diagnoses(df$Diag[1]), - Proc = as.character(df[1,'Proc']), - Diagnosis = as.character(df[1,'Diag']), - Diagnosis_group = as.character(df[1,'Diagnosis_Group']), - CF_Alignment = 1, # make placeholder, but compute below - ALL_CF_count = length(unique(df[[all_cf_col]])), - ALL_CF_entropy = compute_entropy(table(df[[all_cf_col]])[table(df[[all_cf_col]])>0]), - - # name these columns afterwards - sapply(ALL_CFs, function(cf){ - c( length(unique(df[[cf]])) - # , - # compute_entropy(table(df[[cf]])[table(df[[cf]])>0]) - ) }) - - ) - } ))) - - - # name the last columns -- code has to match above - cn = as.vector(sapply(ALL_CFs, function(cf){ - c( paste0(cf,"_count") - # , - # paste0(cf,"_entropy") - ) })) - - # now assign them to the last columns - last_col = ncol(Thrds) - first_col= last_col-length(cn)+1 - setnames(Thrds, c(first_col:last_col), cn) - - # Compute the alignment of the context factors - Thrds$CF_Alignment = as.numeric( as.character(Thrds$Action_count)) / as.numeric( as.character(Thrds$ALL_CF_count )) - - save_file_name = paste0(paste('Thrds',TN,new_event_col,sep='+'), '.Rdata') - save(Thrds, file=save_file_name) - - print(paste('Saved ', nrow(Thrds), " records in ",save_file_name)) - - return(Thrds) + # update data table with results - only compute the ones that require looking at the whole thread + for (b in 1:N){ + + if (b %% 1000 ==0) print(paste0('Thread count = ',b)) + + # select a subset + df= occ[get(TN)==bucket_list[b]] + + # make sure it is sorted by timestamp + df=df[order(df$tStamp),] + + # get the network -- only if there are enough rows... 
+ if (nrow(df)>2) n = threads_to_network_original(df,TN, new_event_col) + else n = list(edgeDF=t(c(0)),nodeDF=t(c(0))) + + # compute each parameter and update the table + ACHR[b,threadNum := bucket_list[b] ] + ACHR[b,Phase := compute_phase(df$tStamp[1])] + ACHR[b,NEvents := nrow(df) ] + ACHR[b,ThreadDuration:= compute_thread_duration(df) ] + ACHR[b,VisitDuration:= compute_visit_duration(df) ] + ACHR[b, wait_time1 := compute_wait_time1(df) ] + ACHR[b, wait_time2 := compute_wait_time2(df) ] + ACHR[b, NetComplexity:=estimate_network_complexity( n ) ] + ACHR[b, Nodes:=nrow(n$nodeDF) ] + ACHR[b,Edges:=nrow(n$edgeDF) ] + ACHR[b,CompressRatio := compression_index(df,new_event_col) ] + ACHR[b,Entropy := compute_entropy(table(df[[new_event_col]])[table(df[[new_event_col]])>0]) ] + ACHR[b,NumProcedures := count_procedures(df$Proc[1]) ] + ACHR[b,NumDiagnoses := count_diagnoses(df$Diag[1]) ] + ACHR[b,ALL_CF_count := length(unique(df[[all_cf_col]])) ] + ACHR[b,ALL_CF_entropy := compute_entropy(table(df[[all_cf_col]])[table(df[[all_cf_col]])>0]) ] + + # Count the unique elements in each cf + for (cf in ALL_CFs){ ACHR[b, paste0(cf,"_count") := length(unique(df[[cf]])) ] } + } + + # Compute the alignment of the context factors + ACHR$CF_Alignment = as.numeric( as.character(ACHR$Action_count)) / as.numeric( as.character(ACHR$ALL_CF_count )) + + # Merge the results with the first row from each thread + print('Merging results...') + Thrds=merge(x=ACHR, y=firstOcc, by.x='threadNum', by.y ='threadNum' ,all=TRUE) + save_file_name = paste0(paste('Thrds',TN,new_event_col,sep='+'), '.Rdata') + save(Thrds, file=save_file_name) + + print(paste('Saved ', nrow(Thrds), " records in ",save_file_name)) + + return(Thrds) } +################################################################################# +# streamlined version for threads that are PART OF A VISIT. So you can merge with visits +# to get the visit context. Just compute the minimum for speed. +ACHR_batch_visit_role_threads <- function(occ, EVENT_CFs, ALL_CFs, visits) { + + library(tidyr) + library(dplyr) + library(ThreadNet) + library(ngram) + library(lubridate) + library(stringr) + library(data.table) + + + # get the first occurrence from each thread -- this will be used for the stuff that never changes, like clinic + # firstOcc=occ[occ$seqNum==1,] + + # always used threadNum for the TN + TN = 'threadNum' + + # Add columns for combinations of CFs if needed + # don't need to add new column for the thread_CFs. That one has to be here. + # thread_col = newColName(THREAD_CFs) + # TN = thread_col + # if (!(thread_col %in% colnames(occ))) { occ = combineContextFactors(occ,THREAD_CFs,thread_col) } + + new_event_col = newColName(EVENT_CFs) + if (!(new_event_col %in% colnames(occ))) { occ = combineContextFactors(occ,EVENT_CFs,new_event_col) } + + all_cf_col = newColName(ALL_CFs) + if (!(all_cf_col %in% colnames(occ))) { occ = combineContextFactors(occ,ALL_CFs,all_cf_col) } + + # set key on the data.table for the threadNum + occ=as.data.table(occ) + setkeyv(occ, TN) + + # get the list of buckets + bucket_list <- sort( unique(occ[[TN]]) ) + + # get the number of buckets + N = length(bucket_list) + print(paste0('Number of buckets=', N )) + + # allocate the data table with N rows, then for speed, we use := to update it. 
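
# The allocate-then-assign-by-reference pattern used in these batch functions, in
# miniature. This matches the 'Rewrite for speed' plan above: pre-size the table once,
# then fill rows with :=, instead of building the whole result inside t(sapply(...)).
library(data.table)
N_toy <- 3
res <- data.table(id = integer(N_toy), val = double(N_toy))
for (b in 1:N_toy) {
  res[b, id := b]          # := writes the row in place; no copy of res is made
  res[b, val := b * 1.5]
}
res   # id 1:3, val 1.5 3.0 4.5
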
+ ACHR = data.table(Visit_ID = character(N), + Role_VR = character(N), + Role_ID_VR = character(N), + threadNumVR = integer(N), + NEventsVR = integer(N), + threadStartVR = character(N), + ThreadDurationVR =double(N), + NetComplexityVR=double(N), + NodesVR=double(N), + EdgesVR=double(N), + CompressRatioVR = double(N), + EntropyVR = double(N), + ALL_CF_countVR = integer(N), + ALL_CF_entropyVR = double(N) ) + + # Now add columns for the CF counts. + for (cf in ALL_CFs){ + ACHR[, paste0(cf,"_countVR"):= double(N)] + # ACHR[, paste0(cf,"_compression"):= double(N)] + # ACHR[, paste0(cf,"_entropy"):= double(N)] + } + + # update data table with results - only compute the ones that require looking at the whole thread + for (b in 1:N){ + + if (b %% 1000 ==0) print(paste0('Thread count = ',b)) + + # select a subset + df= occ[get(TN)==bucket_list[b]] + + # get the network -- only if there are enough rows... + if (nrow(df)>2) n = threads_to_network_original(df,TN, new_event_col) + else n = list(edgeDF=t(c(0)),nodeDF=t(c(0))) + + # compute each parameter and update the table + ACHR[b,Visit_ID := as.character(df$Visit_ID[1]) ] + ACHR[b,Role_VR:= as.character(df$Role[1]) ] + ACHR[b,Role_ID_VR:= as.character(df$Role_ID[1]) ] + ACHR[b,threadNumVR := bucket_list[b] ] + ACHR[b,NEventsVR := nrow(df) ] + ACHR[b,threadStartVR := as.character(min(lubridate::ymd_hms(df$tStamp))) ] + ACHR[b,ThreadDurationVR:= difftime(max(lubridate::ymd_hms(df$tStamp)), min(lubridate::ymd_hms(df$tStamp)), units='hours' ) ] + ACHR[b, NetComplexityVR:=estimate_network_complexity( n ) ] + ACHR[b, NodesVR:=nrow(n$nodeDF) ] + ACHR[b,EdgesVR:=nrow(n$edgeDF) ] + ACHR[b,CompressRatioVR := compression_index(df,new_event_col) ] + ACHR[b,EntropyVR := compute_entropy(table(df[[new_event_col]])[table(df[[new_event_col]])>0]) ] + ACHR[b,ALL_CF_countVR := length(unique(df[[all_cf_col]])) ] + ACHR[b,ALL_CF_entropyVR := compute_entropy(table(df[[all_cf_col]])[table(df[[all_cf_col]])>0]) ] + + # Count the unique elements in each cf + for (cf in ALL_CFs){ ACHR[b, paste0(cf,"_countVR") := length(unique(df[[cf]])) ] } + } + + # Compute the alignment of the context factors + # ACHR$CF_AlignmentVR = as.numeric( as.character(ACHR$Action_countVR)) / as.numeric( as.character(ACHR$ALL_CF_countVR )) + + # Merge the results with the first row from each thread + print('Merging results...') + VRThrds=merge(x=ACHR, y=visits, by.x='Visit_ID', by.y ='Visit_ID' ,all=TRUE) + + save_file_name = paste0(paste('VRThrds',TN,new_event_col,sep='+'), '.Rdata') + save(VRThrds, file=save_file_name) + + print(paste('Saved ', nrow(VRThrds), " records in ",save_file_name)) + + return(VRThrds) +} - -get_timeScale <- function(){'hr'} - - - -# Need to write all these functions +################################################################################# +# streamlined version for adding or recomputing columns that describe a visit. +# So you can merge with visits to get rest of the columns. Just compute the minimum for speed. +# Can re-write this as needed... +ACHR_batch_visit_add_columns <- function(occ, EVENT_CFs, visits) { + + library(tidyr) + library(dplyr) + library(ThreadNet) + library(ngram) + library(lubridate) + library(stringr) + library(data.table) + + + # get the first occurrence from each thread -- this will be used for the stuff that never changes, like clinic + # firstOcc=occ[occ$seqNum==1,] + + # always used threadNum for the TN + TN = 'threadNum' + + # Add columns for combinations of CFs if needed + # don't need to add new column for the thread_CFs. 
That one has to be here. + # thread_col = newColName(THREAD_CFs) + # TN = thread_col + # if (!(thread_col %in% colnames(occ))) { occ = combineContextFactors(occ,THREAD_CFs,thread_col) } + + new_event_col = newColName(EVENT_CFs) + if (!(new_event_col %in% colnames(occ))) { occ = combineContextFactors(occ,EVENT_CFs,new_event_col) } + + + # set key on the data.table for the threadNum + occ=as.data.table(occ) + setkeyv(occ, TN) + + # get the list of buckets + bucket_list <- sort( unique(occ[[TN]]) ) + + # get the number of buckets + N = length(bucket_list) + print(paste0('Number of buckets=', N )) + + # allocate the data table with N rows, then for speed, we use := to update it. + ACHR = data.table(Visit_ID = character(N), + new_wait_time1 = double(N), + new_wait_time2 = double(N) ) + + # Role_VR = character(N), + # Role_ID_VR = character(N), + # threadNumVR = integer(N), + # NEventsVR = integer(N), + # threadStartVR = character(N), + # ThreadDurationVR =double(N), + # NetComplexityVR=double(N), + # NodesVR=double(N), + # EdgesVR=double(N), + # CompressRatioVR = double(N), + # EntropyVR = double(N), + # ALL_CF_countVR = integer(N), + # ALL_CF_entropyVR = double(N) ) + + + # update data table with results - only compute the ones that require looking at the whole thread + for (b in 1:N){ + + if (b %% 1000 ==0) print(paste0('Thread count = ',b)) + + # select a subset + df= occ[get(TN)==bucket_list[b]] + + # get the network -- only if there are enough rows... + # if (nrow(df)>2) n = threads_to_network_original(df,TN, new_event_col) + # else n = list(edgeDF=t(c(0)),nodeDF=t(c(0))) + # + # compute each parameter and update the table + ACHR[b,Visit_ID := as.character(df$Visit_ID[1]) ] + ACHR[b, new_wait_time1 := compute_wait_time1(df) ] + ACHR[b, new_wait_time2 := compute_wait_time2(df) ] + } + + + # Compute the alignment of the context factors + # ACHR$CF_AlignmentVR = as.numeric( as.character(ACHR$Action_countVR)) / as.numeric( as.character(ACHR$ALL_CF_countVR )) + + # Merge the results with the first row from each thread + print('Merging results...') + # VRThrds=merge(x=ACHR, y=visits, by.x='Visit_ID', by.y ='Visit_ID' ,all=TRUE) + VRThrds=merge(x=ACHR, y=visits, by.x='Visit_ID', by.y ='Visit_ID') + + save_file_name = paste0(paste('VRThrds',TN,new_event_col,sep='+'), '.Rdata') + save(VRThrds, file=save_file_name) + + print(paste('Saved ', nrow(VRThrds), " records in ",save_file_name)) + + return(VRThrds) +} +################################################################################## +# Helper functions compute_phase <- function(t){ t=as.Date(t) @@ -193,64 +396,105 @@ compute_wait_time1 <- function(df){ # find the first occurrence of MR_VN_VITALS. Corresponds closely to wait time # w= grep('MR_VN_VITALS',df$Action)[1] - w= grep('VITALS',df$Action)[1] - return( difftime( lubridate::ymd_hms(df$tStamp[w]),lubridate::ymd_hms(df$tStamp[1]), units='hours' ) ) + w1= grep('CHECKIN_TIME',df$Action)[1] + w2= grep('VITALS',df$Action)[1] + + return( max(0, as.numeric(difftime( lubridate::ymd_hms(df$tStamp[w2]),lubridate::ymd_hms(df$tStamp[w1]), units='hours' ) ))) } # Second wait time... from vitals until they see a doctor or other medical person +# NEW VERSION -- look for "AC_VISIT_NAVIGATOR" by a different role than "VITALS" compute_wait_time2 <- function(df){ # find the first occurrence of MR_VN_VITALS. 
Get the time t1 - # w1= grep('MR_VN_VITALS',df$Action)[1] + # w1= grep('MR_VN_VITALS',df$Action)[1] w1= grep('VITALS',df$Action)[1] if (is.na(w1)) return(NA) else { - t1 = lubridate::ymd_hms(df$tStamp[w1]) - - - # get the workstation and role used for the vitals at t1 - work1 = df$Workstation[w1] - role1 = df$Role[w1] - - # get set of actions at the same workstation or different workstations that happen later in the visit - # Truncate the visit up to w1 - restdf = df[w1:nrow(df),] - - # then get the set of action at the same workstation or different workstations (in that visit) - sameWS = restdf[restdf$Workstation == work1,] - differentWS = restdf[restdf$Workstation != work1,] - - # get list of other roles - otherRoles = paste(setdiff(unique(restdf$Role),c('Technician','Technologist','Staff','Unknown')), - collapse = '|') - - # Now find the marker for when they see a physician - # procedure is differenrt in BRKPT and HHPOB clinics - if (df$Clinic[1] %in% c('BRKPT','HHPOB') ) - { # LPN brings a different workstation on wheels at these clinics - firstLPN = grep('LPN',differentWS$Role)[1] - t2 = lubridate::ymd_hms(differentWS$tStamp[firstLPN]) + t1 = lubridate::ymd_hms(df$tStamp[w1]) - # Other roles use the same workstation - firstDoc = grep('Physician|Resident|Registered', sameWS$Role)[1] - t2 = lubridate::ymd_hms(sameWS$tStamp[firstDoc]) - } - else - { # a different role, using the same workstation - w2 = max(w1, grep(otherRoles, sameWS$Role) )[1] - t2 = lubridate::ymd_hms(sameWS$tStamp[w2]) - } - # return the difference in time from t1 to t2 - return( difftime( t2, t1, units='hours' ) ) + # get the workstation and role used for the vitals at t1 + # work1 = df$Workstation[w1] + role1 = df$Role[w1] + + # get set of actions at the same workstation or different workstations that happen later in the visit + # Truncate the visit up to w1 + restdf = df[(w1+1):nrow(df),] + + # get the actions performed by other roles in the rest of the visit + differentRole = restdf[restdf$Role != role1,] + + # Now look for first instance of AC_VISIT_NAVIGATOR that is by one of the other roles + # first_AC_NAV = grep('AC_VISIT_NAVIGATOR',differentRole$Action)[1] + # t2 = lubridate::ymd_hms(differentRole$tStamp[first_AC_NAV]) + + # Try first instance of different role. period. + t2 = lubridate::ymd_hms(differentRole$tStamp[1]) + + + # return the difference in time from t1 to t2 + return( max(0, as.numeric(difftime( t2, t1, units='hours' ) ))) } } +# Second wait time... from vitals until they see a doctor or other medical person +# compute_wait_time2 <- function(df){ +# +# # find the first occurrence of MR_VN_VITALS. 
Get the time t1 +# # w1= grep('MR_VN_VITALS',df$Action)[1] +# w1= grep('VITALS',df$Action)[1] +# +# if (is.na(w1)) +# return(NA) +# else +# { +# t1 = lubridate::ymd_hms(df$tStamp[w1]) +# +# +# # get the workstation and role used for the vitals at t1 +# work1 = df$Workstation[w1] +# role1 = df$Role[w1] +# +# # get set of actions at the same workstation or different workstations that happen later in the visit +# # Truncate the visit up to w1 +# restdf = df[w1:nrow(df),] +# +# # then get the set of action at the same workstation or different workstations (in that visit) +# sameWS = restdf[restdf$Workstation == work1,] +# differentWS = restdf[restdf$Workstation != work1,] +# +# # get list of other roles +# otherRoles = paste(setdiff(unique(restdf$Role),c('Technician','Technologist','Staff','Unknown')), +# collapse = '|') +# +# # Now find the marker for when they see a physician +# # procedure is differenrt in BRKPT and HHPOB clinics +# if (df$Clinic[1] %in% c('BRKPT','HHPOB') ) +# { # LPN brings a different workstation on wheels at these clinics +# firstLPN = grep('LPN',differentWS$Role)[1] +# t2 = lubridate::ymd_hms(differentWS$tStamp[firstLPN]) +# +# # Other roles use the same workstation +# firstDoc = grep('Physician|Resident|Registered', sameWS$Role)[1] +# t2 = lubridate::ymd_hms(sameWS$tStamp[firstDoc]) +# } +# else +# { # a different role, using the same workstation +# w2 = max(w1, grep(otherRoles, sameWS$Role) )[1] +# t2 = lubridate::ymd_hms(sameWS$tStamp[w2]) +# } +# +# # return the difference in time from t1 to t2 +# return( difftime( t2, t1, units='hours' ) ) +# } +# } + # THIS ONE IS GOOD. count_procedures <- function( p ){ @@ -280,7 +524,19 @@ count_diagnoses <- function(d){ return( max(1, str_count( d, '#@#' ))) } +count_daily_procedures <- function( df ) { + + # get the string of procedures for that day + allproc = unlist(df[df$seqNum==1, 'Proc']) + + if (length(allproc)>0) + return( sum(sapply(allproc, function(p) {count_procedures(p)} )) ) + else return(0) + + +} +# get_timeScale <- function(){'hr'} @@ -289,7 +545,6 @@ count_diagnoses <- function(d){ ################################################################### # This is function is set up to aggregate the occurrences among collections of visit -- typically clinic_days # occ = pre-processed threaded occurrences -# TN = threadNum in most cases # CFs can be chosen ACHR_batch_clinic_days <- function(occ,TN='Clinic_ymd', EVENT_CFs, ALL_CFs) { @@ -303,9 +558,10 @@ ACHR_batch_clinic_days <- function(occ,TN='Clinic_ymd', EVENT_CFs, ALL_CFs) { # make a list of unique buckets. bucket_list <- unique(occ[[TN]]) + N = length(bucket_list) # print the number of buckets - print(paste0('Number of buckets=', length(bucket_list) )) + print(paste0('Number of buckets=', N )) # Add columns for combinations of CFs if needed new_event_col = newColName(EVENT_CFs) @@ -314,78 +570,82 @@ ACHR_batch_clinic_days <- function(occ,TN='Clinic_ymd', EVENT_CFs, ALL_CFs) { all_cf_col = newColName(ALL_CFs) if (!(all_cf_col %in% colnames(occ))) { occ = combineContextFactors(occ,ALL_CFs,all_cf_col) } + # this will speed up retrieving the subsets + setkeyv(occ, TN) + + # allocate the data table with N rows, then for speed, we use := to update it. 
+ cds = data.table(Clinic_ymd = character(N), + Clinic = character(N), + ymd = character(N), + Phase = character(N), + NEvents = integer(N), + ClinicDayStart = character(N), + ClinicDayDuration =double(N), + NumVisits = integer(N), + NumUniqueDiagnosisGroups = integer(N), + NumPhysicians = integer(N), + TotalStaff = integer(N), + NetComplexity=double(N), + Nodes=double(N), + Edges=double(N), + CompressRatio = double(N), + Entropy = double(N), + ALL_CF_count = integer(N), + ALL_CF_entropy = double(N), + CF_Alignment = double(N)) + + # Now add columns for the CF counts. + for (cf in ALL_CFs){ + cds[, paste0(cf,"_countVR"):= double(N)] + # cds[, paste0(cf,"_compression"):= double(N)] + # cds[, paste0(cf,"_entropy"):= double(N)] + } + # make data frame with results - cds = data.table( t(sapply( bucket_list, - function(b){ - - # select a subset of occurrences for the bucket - df = occ[ occ[[TN]] ==b , ] - - - # get the network -- only if there are enough rows... - if (nrow(df)>2) n = threads_to_network_original(df,TN, new_event_col) - else n = list(edgeDF=t(c(0)),nodeDF=t(c(0))) - - # compute each parameter and put them in a vector - c( - bucket=b, - Clinic = df[1,'Clinic'], - Physician = as.character(df[1,'Physician']), - ymd = df[1,'ymd'], - Clinic_ymd = df[1,'Clinic_ymd'], - Weekday = df[1,'Weekday'], - Month = df[1,'Month'], - Phase = compute_phase(df$tStamp[1]), - threadNum = df[1,'threadNum'], - Visit_ID = df[1,'Visit_ID'], - Subject_ID = df[1,'Subject_ID'], - NEvents = nrow(df), - bucketStart= as.character( df[1,'tStamp'] ), - bucketStartInt = df[1,'tStamp'], - bucketDuration= difftime(max(lubridate::ymd_hms(df$tStamp)), min(lubridate::ymd_hms(df$tStamp)), units='hours' ), - # wait_time = compute_wait_time(df), - # Visit_number = integer(N), - # LOC_CPT = character(N), - # NumUniqueProcedures = length(unique(df[['Proc']]) - NumVisits = length(unique(df[['Visit_ID']])), - NumUniqueDiagnosisGroups = length(unique(df[['Diagnosis_Group']])), - NumPhysicians = length(unique(df[['Physician']])), - TotalStaff = length(unique(df[['Role_ID']])), - NetComplexity=estimate_network_complexity( n ), - Nodes=nrow(n$nodeDF), - Edges=nrow(n$edgeDF), - CompressRatio = compression_index(df,new_event_col), - Entropy = compute_entropy(table(df[[new_event_col]])[table(df[[new_event_col]])>0]), - # NumProcedures = count_procedures(df$Proc[1]), - # NumDiagnoses = count_diagnoses(df$Diag[1]), - # Diagnosis_group = df[1,'Diagnosis_Group'], - CF_Alignment = 1, # make placeholder, but compute below - ALL_CF_count = length(unique(df[[all_cf_col]])), - ALL_CF_entropy = compute_entropy(table(df[[all_cf_col]])[table(df[[all_cf_col]])>0]), - - # name these columns afterwards - sapply(ALL_CFs, function(cf){ - c( length(unique(df[[cf]])) - # , - # compute_entropy(table(df[[cf]])[table(df[[cf]])>0]) - ) }) - - ) - } ))) - - - # name the last columns -- code has to match above - cn = as.vector(sapply(ALL_CFs, function(cf){ - c( paste0(cf,"_count") - # , - # paste0(cf,"_entropy") - ) })) - - # now assign them to the last columns - last_col = ncol(cds) - first_col= last_col-length(cn)+1 - setnames(cds, c(first_col:last_col), cn) + for (b in 1:N){ + + + if (b %% 100 ==0) print(paste0('Thread count = ',b)) + + + # select a subset of occurrences for the bucket + df = occ[ occ[[TN]] == bucket_list[b] ] + + + # get the network -- only if there are enough rows... 
+ if (nrow(df)>2) n = threads_to_network_original(df,TN, new_event_col)
+ else n = list(edgeDF=t(c(0)),nodeDF=t(c(0)))
+
+ # compute each parameter and put them in a vector
+
+ cds[b,Clinic_ymd := bucket_list[b] ]
+ cds[b,Clinic := as.character( df$Clinic[1] ) ]
+ cds[b,ymd := as.character( df$ymd[1] ) ]
+ cds[b,Phase := compute_phase(df$tStamp[1]) ]
+ cds[b,threadNum := as.numeric(df[1,'threadNum']) ]
+ cds[b,NEvents := nrow(df) ]
+ cds[b,ClinicDayStart := as.character(lubridate::ymd_hms(df$tStamp[1])) ]
+ cds[b,ClinicDayDuration := difftime(max(lubridate::ymd_hms(df$tStamp)), min(lubridate::ymd_hms(df$tStamp)), units='hours' ) ]
+ cds[b,NumVisits := length(unique( df$Visit_ID )) ]
+ cds[b,NumUniqueDiagnosisGroups := length(unique( df$Diagnosis_Group )) ]
+ cds[b,NumPhysicians := length(unique( df$Physician )) ]
+ cds[b,TotalStaff := length(unique( df$Role_ID )) ]
+ cds[b, NetComplexity := estimate_network_complexity( n ) ]
+ cds[b,Nodes := nrow(n$nodeDF) ]
+ cds[b,Edges := nrow(n$edgeDF) ]
+ cds[b,CompressRatio := compression_index(df,new_event_col) ]
+ cds[b,Entropy := compute_entropy(table(df[[new_event_col]])[table(df[[new_event_col]])>0]) ]
+ cds[b,NumProceduresPerDay := count_daily_procedures(df) ]
+ cds[b,CF_Alignment := 1 ] # make placeholder, but compute below
+ cds[b,ALL_CF_count := length(unique(df[[all_cf_col]])) ]
+ cds[b,ALL_CF_entropy := compute_entropy(table(df[[all_cf_col]])[table(df[[all_cf_col]])>0]) ]
+
+ # Count the unique elements in each cf
+ for (cf in ALL_CFs){ cds[b, paste0(cf,"_count") := length(unique(df[[cf]])) ] }
+
+
+}

 # Compute the alignment of the context factors
 cds$CF_Alignment = as.numeric( as.character(cds$Action_count)) / as.numeric( as.character(cds$ALL_CF_count ))
@@ -466,7 +726,7 @@ ACHR_batch_clinic_days <- function(occ,TN='Clinic_ymd', EVENT_CFs, ALL_CFs) {
 # if (b%%100==0) {print(b)}
 #
 # # select the threads that go in this bucket
-# df = occ[occ[[TN]] ==bucket_list[i],]
+# df = occ[occ[[TN]] ==bucket_list[i] ]
 #
 # # bucket number
 # ACHR[b,bucket := b]
@@ -618,7 +878,7 @@ ACHR_batch_clinic_days <- function(occ,TN='Clinic_ymd', EVENT_CFs, ALL_CFs) {
 # if (b%%10==0) {print(b)}
 #
 # # select the threads that go in this bucket
-# df = occ[occ[[BU]] ==bucket_list[i],]
+# df = occ[occ[[BU]] ==bucket_list[i] ]
 #
 # # bucket number
 # ACHR[b,bucket := b]
@@ -700,3 +960,125 @@ ACHR_batch_clinic_days <- function(occ,TN='Clinic_ymd', EVENT_CFs, ALL_CFs) {
 # return(ACHR)
 # }
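
# Toy reading of the CF_Alignment ratio computed above: with 40 distinct Actions but
# 100 distinct Action_Role_Workstation combinations in a clinic-day, the alignment is
# 40/100 = 0.4; a value of 1.0 would mean Role and Workstation add no distinctions
# beyond the Actions themselves. (Counts here are made up for illustration.)
Action_count <- 40
ALL_CF_count <- 100
Action_count / ALL_CF_count   # 0.4
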
+ +# ACHR_batch_threads_no_merge_old <- function(occ,THREAD_CFs, EVENT_CFs, ALL_CFs) { +# +# library(tidyr) +# library(dplyr) +# library(ThreadNet) +# library(ngram) +# library(lubridate) +# library(stringr) +# library(data.table) +# +# # Add columns for combinations of CFs if needed +# thread_col = newColName(THREAD_CFs) +# TN = thread_col +# if (!(thread_col %in% colnames(occ))) { occ = combineContextFactors(occ,THREAD_CFs,thread_col) } +# +# +# new_event_col = newColName(EVENT_CFs) +# if (!(new_event_col %in% colnames(occ))) { occ = combineContextFactors(occ,EVENT_CFs,new_event_col) } +# +# all_cf_col = newColName(ALL_CFs) +# if (!(all_cf_col %in% colnames(occ))) { occ = combineContextFactors(occ,ALL_CFs,all_cf_col) } +# +# # set key on the data.table +# setkeyv(occ, TN) +# +# # get the list of buckets +# bucket_list <- unique(occ[[TN]]) +# +# # print the number of buckets +# print(paste0('Number of buckets=', length(bucket_list) )) +# +# +# # make data frame with results +# Thrds = data.table( t(sapply( bucket_list, +# function(b){ +# +# # select a subset +# df= occ[get(TN)==b] +# +# # make sure it is sorted by timestamp +# df=df[order(df$tStamp),] +# +# # get the network -- only if there are enough rows... +# +# if (nrow(df)>2) n = threads_to_network_original(df,TN, new_event_col) +# else n = list(edgeDF=t(c(0)),nodeDF=t(c(0))) +# +# # compute each parameter and put them in a vector +# c( +# bucket=b, +# Clinic = as.character(df$Clinic[1]), +# Physician = as.character(df$Physician[1]), +# ymd = as.character(df$ymd[1]), +# Clinic_ymd = as.character(df$Clinic_ymd[1]), +# Weekday = as.character(df$Weekday[1]), +# Month = as.character(df$Month[1]), +# Phase = compute_phase(df$tStamp[1]), +# threadNum = as.numeric(df$threadNum[1]), +# Visit_ID = as.character(df$Visit_ID[1]), +# Subject_ID = as.character( df$Subject_ID[1] ), +# Role_ID = as.character( df$Role_ID[1] ), +# NEvents = nrow(df), +# ThreadStart= as.character(df$tStamp[1] ), +# ThreadDuration= compute_thread_duration(df), +# VisitDuration= compute_visit_duration(df), +# wait_time1 = compute_wait_time1(df), +# wait_time2 = compute_wait_time2(df), +# Visit_number = as.numeric(df[1,'Visitnum_Total']), +# LOC_CPT = as.character(df[1,'LOS_CPT']), +# NetComplexity=estimate_network_complexity( n ), +# Nodes=nrow(n$nodeDF), +# Edges=nrow(n$edgeDF), +# CompressRatio = compression_index(df,new_event_col), +# Entropy = compute_entropy(table(df[[new_event_col]])[table(df[[new_event_col]])>0]), +# NumProcedures = count_procedures(df$Proc[1]), +# NumDiagnoses = count_diagnoses(df$Diag[1]), +# Proc = as.character(df[1,'Proc']), +# Diagnosis = as.character(df[1,'Diag']), +# Diagnosis_group = as.character(df[1,'Diagnosis_Group']), +# CF_Alignment = 1, # make placeholder, but compute below +# ALL_CF_count = length(unique(df[[all_cf_col]])), +# ALL_CF_entropy = compute_entropy(table(df[[all_cf_col]])[table(df[[all_cf_col]])>0]), +# +# # name these columns afterwards +# sapply(ALL_CFs, function(cf){ +# c( length(unique(df[[cf]])) +# # , +# # compute_entropy(table(df[[cf]])[table(df[[cf]])>0]) +# ) }) +# +# ) +# } ))) +# +# +# # name the last columns -- code has to match above +# cn = as.vector(sapply(ALL_CFs, function(cf){ +# c( paste0(cf,"_count") +# # , +# # paste0(cf,"_entropy") +# ) })) +# +# # now assign them to the last columns +# last_col = ncol(Thrds) +# first_col= last_col-length(cn)+1 +# setnames(Thrds, c(first_col:last_col), cn) +# +# # Compute the alignment of the context factors +# Thrds$CF_Alignment = as.numeric( 
as.character(Thrds$Action_count)) / as.numeric( as.character(Thrds$ALL_CF_count ))
+#
+#   save_file_name = paste0(paste('Thrds',TN,new_event_col,sep='+'), '.Rdata')
+#   save(Thrds, file=save_file_name)
+#
+#   print(paste('Saved ', nrow(Thrds), " records in ",save_file_name))
+#
+#   return(Thrds)
+#
+# }
+#
diff --git a/Data Prep/ACHR_Clinic_Trajectories_v3.R b/Data Prep/ACHR_Clinic_Trajectories_v3.R
index 5093f1e..2328452 100644
--- a/Data Prep/ACHR_Clinic_Trajectories_v3.R
+++ b/Data Prep/ACHR_Clinic_Trajectories_v3.R
@@ -87,6 +87,7 @@ graph_trajectory <- function(e, bucket_CFs, cf, n_gram_size=2, reference_day=1,
     # get text vector for the whole data set.  Bucket needs at least 2 threads
     # n_gram_size = 1 for 1-grams, n_gram_size = 2 for pairs.
     th = e[ e[[bucket_col]] ==b , ]
+
     if (nrow(th)>3) { ngdf = count_ngrams(th, 'threadNum', cf, n_gram_size)[1:2] }
@@ -139,20 +140,28 @@ graph_trajectory <- function(e, bucket_CFs, cf, n_gram_size=2, reference_day=1,
 #
 #   # if the number if below the threshold, set it to zero
 #   windowFreqMatrix[fm<=filter_threshold] = 0
-
-  df = data.frame( t(sapply( 1:(nWindows-1),
-                             function(i){
-                               c(
-                                 bckt = blist[i],
-                                 Clinic = unlist(strsplit(blist[i],'_'))[1],
-                                 ymd = unlist(strsplit(blist[i],'_'))[2],
-                                 pct_retained = sum((windowFreqMatrix[i,]>0)+(windowFreqMatrix[i+1,]>0)==2)/sum(windowFreqMatrix[i,]>0),
-                                 complexity = estimate_task_complexity_index( nodes ,sum(windowFreqMatrix[i,] > 0) ),
-                                 Dist_from_reference = distance(rbind(windowFreqMatrix[i,],windowFreqMatrix[reference_day,]), method='cosine' ),
-                                 Dist_from_next = distance(rbind(windowFreqMatrix[i,],windowFreqMatrix[i+1,]), method='cosine' )
-                               )
-                             })))
+  df = data.table(
+    Clinic_ymd = character(nWindows),
+    Clinic = character(nWindows),
+    ymd = character(nWindows),
+    pct_retained = double(nWindows),
+    complexity = double(nWindows),
+    Dist_from_reference = double(nWindows),
+    Dist_from_next = double(nWindows)
+  )
+
+  for (i in 1:(nWindows-1) ) {
+
+    df[i, Clinic_ymd := blist[i] ]
+    df[i, Clinic := unlist(strsplit(blist[i],'_'))[1] ]
+    df[i, ymd := unlist(strsplit(blist[i],'_'))[2] ]
+    df[i, pct_retained := sum((windowFreqMatrix[i,]>0)+(windowFreqMatrix[i+1,]>0)==2)/sum(windowFreqMatrix[i,]>0) ]
+    df[i, complexity := estimate_task_complexity_index( nodes ,sum(windowFreqMatrix[i,] > 0) ) ]
+    df[i, Dist_from_reference := distance(rbind(windowFreqMatrix[i,],windowFreqMatrix[reference_day,]), method='cosine' ) ]
+    df[i, Dist_from_next := distance(rbind(windowFreqMatrix[i,],windowFreqMatrix[i+1,]), method='cosine' ) ]
+  }
+
   # get the ngram data and labels
   if (keep_ngram_vectors) {
@@ -170,5 +179,122 @@ graph_trajectory <- function(e, bucket_CFs, cf, n_gram_size=2, reference_day=1,
 }
-
+graph_trajectory_week <- function(e, bucket_CFs, cf, n_gram_size=2, reference_day=1, filter_threshold=0,keep_ngram_vectors=FALSE, save_file_name='deleteme') {
+
+  # add column for bucket_CF if needed
+  bucket_col = newColName(bucket_CFs)
+  if (!(bucket_col %in% colnames(e))) { e = combineContextFactors(e,bucket_CFs,bucket_col) }
+
+  # here is the sorted list of buckets
+  blist = sort(unique(e[[bucket_col]]))
+
+  # how many buckets?
+  nb = length(blist)
+  # print(blist)
+  print(paste('Number of buckets =',nb))
+
+  # make data frame for results
+  vt=data.frame( ngrams=character(), freq=integer(), bid=integer(),bname=character() )
+
+  bcount=0
+  # scan through the data
+  for (b in blist){
+    bcount= bcount +1
+    # print(paste('b =',b))
+
+    # THIS CHECK NEEDS TO COVER WHOLE CHUNK OF CODE...
+    # get text vector for the whole data set.
Bucket needs at least 2 threads + # n_gram_size = 1 for 1-grams, n_gram_size = 2 for pairs. + th = e[ e[[bucket_col]] ==b , ] + + if (nrow(th)>3) { ngdf = count_ngrams(th, 'threadNum', cf, n_gram_size)[1:2] } + + + nodes = length(unique(th[[cf]])) + + + # print(paste('nrow ngdf =',nrow(ngdf))) + + # add the bucket number and name + ngdf$bid = bcount + ngdf$bname = b + + # append the columns to the end + # vt is the whole set of all ngrams in all the windows + vt=rbind(vt,ngdf) + } + + + # convert to factor + vt$ngrams = factor(vt$ngrams) + + # nWindows = length(unique(vt$bid)) + nWindows = nb + #print(paste('nWindows =',nWindows)) + + # get the set of unique ngrams for the whole data set + vt_unique = data.frame(ngrams=unique(vt$ngrams)) + #print(vt_unique) + + # put the results here + windowFreqMatrix = matrix(0,nrow=nWindows, ncol=nrow(vt_unique)) + + for (i in 1:nWindows){ + + # get the merged list + vtmerge = merge(x=vt_unique, y=vt[vt$bid==i,], by='ngrams', all.x = TRUE) + + # use the bid.y to get the whole vector, but replace the NA with zeros + bb=vtmerge[vtmerge$bid==i,'freq'] + bb[is.na(bb)] <- 0 + + windowFreqMatrix[i,]=bb + + print( paste(i, 'num non-zero=', sum(windowFreqMatrix[i,] > 0))) + + } + + # #Make a matrix to filter by where all the entries are 0 < f < 1 + # fm = windowFreqMatrix/max(windowFreqMatrix) + # + # # if the number if below the threshold, set it to zero + # windowFreqMatrix[fm<=filter_threshold] = 0 + + df = data.table( + Clinic_week = character(nWindows), + Clinic = character(nWindows), + ymd = character(nWindows), + pct_retained = double(nWindows), + complexity = double(nWindows), + Dist_from_reference = double(nWindows), + Dist_from_next = double(nWindows) + ) + + for (i in 1:(nWindows-1) ) { + df[i, Clinic_week := blist[i] ] + # df[i, Clinic_ymd := blist[i] ] + # df[i, Clinic := unlist(strsplit(blist[i],'_'))[1] ] + # df[i, ymd := unlist(strsplit(blist[i],'_'))[2] ] + df[i, pct_retained := sum((windowFreqMatrix[i,]>0)+(windowFreqMatrix[i+1,]>0)==2)/sum(windowFreqMatrix[i,]>0) ] + df[i, complexity := estimate_task_complexity_index( nodes ,sum(windowFreqMatrix[i,] > 0) ) ] + df[i, Dist_from_reference := distance(rbind(windowFreqMatrix[i,],windowFreqMatrix[reference_day,]), method='cosine' ) ] + df[i, Dist_from_next := distance(rbind(windowFreqMatrix[i,],windowFreqMatrix[i+1,]), method='cosine' ) ] + } + + + # get the ngram data and labels + if (keep_ngram_vectors) { + b_df=as.data.frame(windowFreqMatrix[1:(nWindows-1),]) + colnames(b_df)=vt_unique$ngrams + + # stick the ngram frequencies on the end for good measure + df = cbind(df,b_df) + } + + # save the result and return it, too + save(df, file=paste0(save_file_name,'.rData') ) + + return(df) + +} \ No newline at end of file diff --git a/Tables and Figures/Clinic_day_plots.R b/Tables and Figures/Clinic_day_plots.R index a6e05a5..611f271 100644 --- a/Tables and Figures/Clinic_day_plots.R +++ b/Tables and Figures/Clinic_day_plots.R @@ -37,11 +37,121 @@ plots_for_papers <- function(){ ggplot(data = cds, aes(x = ymd, y = Dist_from_reference, group=Clinic)) + geom_line(aes(color=Clinic)) ggplot(data = cds, aes(x = ymd, y = rollmean(Dist_from_reference,5,na.pad=TRUE), group=Clinic)) + geom_line(aes(color=Clinic)) + # Dec 6th version + ggplot(data = cdt, aes(x = ymd.x, y = Dist_from_reference, group=Clinic.x)) + + geom_line(aes(color=Clinic.x)) + theme(axis.text.x=element_blank(), axis.text.y=element_blank()) + + ggplot(data = cdt, aes(x = ymd.x, y = rollmean( Dist_from_reference, 5,na.pad=TRUE), 
group=Clinic.x)) +
+    geom_line(aes(color=Clinic.x)) + theme(axis.text.x=element_blank(), axis.text.y=element_blank())
+
+  ggplot(data = cdt, aes(x = ymd.x, y = rollmean( Dist_from_reference, 5,na.pad=TRUE), group=Clinic.x)) +
+    geom_line(aes(color=Clinic.x)) + theme(axis.text.x=element_blank())
+
+
   # Role change over time
   ggplot(data = cds, aes(x = ymd, y = Dist_from_reference, group=Role)) + geom_line(aes(color=Role))
 
+  # get the wait times and visit duration for each clinic
+  # convert to minutes.
+  a= visits %>%
+    group_by(Clinic) %>%
+    summarize(n = n(),
+              NEvents = mean(NEvents, na.rm = TRUE),
+              wait1 = mean(wait1*60, na.rm = TRUE),
+              sdwait1 = sd(wait1*60, na.rm = TRUE),
+              wait2 = mean(wait2*60, na.rm = TRUE),
+              sdwait2 = sd(wait2*60, na.rm = TRUE),
+              Duration = mean(VisitDuration, na.rm = TRUE)
+    )
+
+  # The sd's above come out NA because summarize() lets later expressions see earlier results:
+  # wait1 has already been replaced by its (length 1) mean by the time sdwait1 is computed,
+  # and sd() of a single value is NA. So compute the sd's separately here.
+  visits %>%
+    group_by(Clinic) %>%
+    summarize(n = n(),
+              sdwait1 = sd(wait1*60, na.rm = TRUE),
+              sdwait2 = sd(wait2*60, na.rm = TRUE)
+    )
+
+  # get the percentage of actions by role... based on occurrences
+  Clinic_role_pct = otr %>%
+    group_by(Clinic, Role) %>%
+    summarize(n=n()) %>%
+    mutate(RolePct = n/sum(n))
+
+  # get LOS by clinic based on visits
+  Clinic_LOS_pct = visits %>%
+    group_by(Clinic, LOS_CPT) %>%
+    summarize(n=n()) %>%
+    mutate(LOSPct = n/sum(n)) %<%
+    spread(Clinic, Role)
+
+  # Make contingency table Clinic x LOS
+  C_LOS = visits %>%
+    group_by(Clinic, LOS_CPT) %>%
+    summarize(n=n()) %>%
+    spread( LOS_CPT,n)
+
+  # Convert NA to zero and then convert to table. Carefully remove rows/cols that make no sense
+  C_LOS= as.data.frame(C_LOS)
+  C_LOS[is.na(C_LOS)] =0
+  C_LOS = matrix(unlist(C_LOS), nrow=6, ncol=20)[1:5,2:19]
+  chisq.test(C_LOS)
+
+
+# Make contingency table Clinic x Role
+C_R = otr %>%
+  group_by(Clinic, Role) %>%
+  summarize(n=n()) %>%
+  spread( Role, n)
+
+# Convert NA to zero
+C_R = as.data.frame(C_R)
+C_R[is.na(C_R)] =0
+C_R = matrix(unlist(C_R), nrow=5, ncol=10)[1:5,2:10]
+chisq.test(C_R)
+
+  Clinic_staffing = otr %>%
+    group_by(Clinic,Role) %>%
+    summarize(total_staff=n_distinct(Role_ID)) %>%
+    spread( Role, total_staff)
+
+
+  # Get the handoffs per visit. Use the distinct chunks in the Visit-Role threads (VRThrds)
+  Role_Handoffs = VRThrds %>%
+    group_by(Clinic,Visit_ID) %>%
+    summarize(ho = n_distinct(threadNumVR))
+
+  Role_Handoffs_by_clinic = Role_Handoffs %>%
+    group_by(Clinic) %>%
+    summarize(avg_ho = mean(ho),
+              median_ho = median(ho),
+              max_ho = max(ho),
+              sd_ho = sd(ho))
+
+  # get the average staffing levels per day
+  Clinic_daily_staffing = otr %>%
+    group_by(Clinic,ymd,Role) %>%
+    summarize(total_staff=n_distinct(Role_ID))
+
+  Clinic_AVG_daily_staffing = Clinic_daily_staffing %>%
+    group_by(Clinic,Role) %>%
+    summarize(AVG_staff=mean(total_staff)) %>%
+    spread( Role, AVG_staff)
+
+}
+
+
+remove_outliers <- function(x, na.rm = TRUE, ...) {
+  qnt <- quantile(x, probs=c(.25, .75), na.rm = na.rm, ...)
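+  # Tukey's rule: anything more than 1.5*IQR outside the quartiles is treated as an outlier and set to NA below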
+ H <- 1.5 * IQR(x, na.rm = na.rm) + y <- x + y[x < (qnt[1] - H)] <- NA + y[x > (qnt[2] + H)] <- NA + y } + # # make_box_plots <- function(){ # ggboxplot(ACHR_test[NEvents>100 & Clinic=='DRH'], x = "VisitMonth", y = "NetComplexity", From 169b388510549984d09c99e41473c466ca91aa9b Mon Sep 17 00:00:00 2001 From: Brian Pentland Date: Thu, 13 Dec 2018 05:35:26 -0500 Subject: [PATCH 19/31] Added functions to fix roles and compute LOS data --- Data Prep/ACHR_Batch_Aggregate_Data.R | 28 ++++++++++++++++++--- Data Prep/ACHR_Batch_PreProcess.R | 31 ++++++++++++++++++++++++ Tables and Figures/Clinic_day_plots.R | 35 +++++++++++++++++++++++++-- 3 files changed, 88 insertions(+), 6 deletions(-) diff --git a/Data Prep/ACHR_Batch_Aggregate_Data.R b/Data Prep/ACHR_Batch_Aggregate_Data.R index 4bbd456..6e10653 100644 --- a/Data Prep/ACHR_Batch_Aggregate_Data.R +++ b/Data Prep/ACHR_Batch_Aggregate_Data.R @@ -79,6 +79,7 @@ ACHR_batch_threads <- function(occ, EVENT_CFs, ALL_CFs) { VisitDuration= double(N), wait_time1 = double(N), wait_time2 = double(N), + LOS =double(N), NetComplexity=double(N), Nodes=double(N), Edges=double(N), @@ -135,7 +136,10 @@ ACHR_batch_threads <- function(occ, EVENT_CFs, ALL_CFs) { } # Compute the alignment of the context factors - ACHR$CF_Alignment = as.numeric( as.character(ACHR$Action_count)) / as.numeric( as.character(ACHR$ALL_CF_count )) + ACHR$CF_Alignment := as.numeric( as.character(ACHR$Action_count)) / as.numeric( as.character(ACHR$ALL_CF_count )) + + # convert level of service to 1-5 integer + ACHR$LOS := convert_LOS( occ$LOS_CPT ) # Merge the results with the first row from each thread print('Merging results...') @@ -200,6 +204,7 @@ ACHR_batch_visit_role_threads <- function(occ, EVENT_CFs, ALL_CFs, visits) { NEventsVR = integer(N), threadStartVR = character(N), ThreadDurationVR =double(N), + LOS =double(N), NetComplexityVR=double(N), NodesVR=double(N), EdgesVR=double(N), @@ -250,6 +255,9 @@ ACHR_batch_visit_role_threads <- function(occ, EVENT_CFs, ALL_CFs, visits) { # Compute the alignment of the context factors # ACHR$CF_AlignmentVR = as.numeric( as.character(ACHR$Action_countVR)) / as.numeric( as.character(ACHR$ALL_CF_countVR )) + # convert level of service to 1-5 integer + ACHR$LOS := convert_LOS( occ$LOS_CPT ) + # Merge the results with the first row from each thread print('Merging results...') VRThrds=merge(x=ACHR, y=visits, by.x='Visit_ID', by.y ='Visit_ID' ,all=TRUE) @@ -536,10 +544,22 @@ count_daily_procedures <- function( df ) { } -# get_timeScale <- function(){'hr'} - - +# convert the LOS_CPT codes into a five point numeric scale +# operate on whole column to create new column +convert_LOS <- function(los_cpt){ + + # get the 5th character + s = sapply(los_cpt, function(cpt) { substring(as.character(cpt),5,5) } ) + + los = as.numeric(s) + + # convert 9 and NA to 1 + los[los==9]=1 + los[is.na(los)]=1 +return(los) + +} ################################################################### ################################################################### diff --git a/Data Prep/ACHR_Batch_PreProcess.R b/Data Prep/ACHR_Batch_PreProcess.R index 27db4af..05ac916 100644 --- a/Data Prep/ACHR_Batch_PreProcess.R +++ b/Data Prep/ACHR_Batch_PreProcess.R @@ -32,6 +32,9 @@ redo_ACHR_data_from_scratch <- function(fname){ # first read the data o = read_ACHR_data( fname ) + # fix the role IDs -- needs file called RoleChangeTable.csv in the local directory + o = fix_derm_role_ID(o) + # Thread occurrences adds threadNum and seqNum to each thread, as defined by the the thread_CF. 
# Threads are always WITHIN VISITS in this code -- so they can be whole visits or chunks of visits (e.g., visit_ID_Role)
   occ = thread_occurrences( o, THREAD_CF ,fname )
@@ -123,6 +126,34 @@ cleanOccBatch <- function(fileRows){
   return(complete)
 }
+#######################################################################################
+# In the derm data, the role "technician" was used for people who were not technicians
+# Pass in the table of occurrences, fix it, and return it
+# read in the list of changes
+fix_derm_role_ID <- function(o){
+
+  # read in the new role ID
+  rc = read.csv('RoleChangeTable.csv')
+
+
+  # occurrences are a data.table so use :=
+  for (r in 1:nrow(rc)) {
+
+    old_rid = rc$OLD_Role_ID[r]
+    new_rid = rc$NEW_Role_ID[r]
+    new_role = rc$NEW_Role[r]
+
+    o[o$Role_ID==old_rid, Role_ID := new_rid]
+    o[o$Role_ID==old_rid, Role := new_role]
+
+  }
+
+  return(o)
+}
+
+
+
+
 #########################################################################################
 # This function adds columns for the thread, as requested ( Tested Nov 16 )
 # Then it sorts by thread and tStamp and adds thread/sequence numbers to each thread.
diff --git a/Tables and Figures/Clinic_day_plots.R b/Tables and Figures/Clinic_day_plots.R
index 611f271..d49a4ef 100644
--- a/Tables and Figures/Clinic_day_plots.R
+++ b/Tables and Figures/Clinic_day_plots.R
@@ -44,8 +44,8 @@ plots_for_papers <- function(){
   ggplot(data = cdt, aes(x = ymd.x, y = rollmean( Dist_from_reference, 5,na.pad=TRUE), group=Clinic.x)) +
     geom_line(aes(color=Clinic.x)) + theme(axis.text.x=element_blank(), axis.text.y=element_blank())
 
-  ggplot(data = cdt, aes(x = ymd.x, y = rollmean( Dist_from_reference, 5,na.pad=TRUE), group=Clinic.x)) +
-    geom_line(aes(color=Clinic.x)) + theme(axis.text.x=element_blank())
+  ggplot(data = cdt_435[,1:6], aes(x = ymd, y = rollmean( Dist_from_reference, 10,na.pad=TRUE), group=Clinic)) +
+    geom_line(aes(color=Clinic)) + theme(axis.text.x=element_blank())
@@ -139,6 +139,37 @@ chisq.test(C_R)
     summarize(AVG_staff=mean(total_staff)) %>%
     spread( Role, AVG_staff)
 
+  # let's look at the actions by clinic_role, using the visit_role threads
+actions_by_role = VRThrds %>%
+  group_by(Clinic,Role_VR) %>%
+  summarize(avgActions = mean(Action_countVR)) %>%
+  spread( Role_VR, avgActions)
+
+# Look at the longest thread duration by clinic and phase
+visits %>%
+  group_by(Clinic,Phase) %>%
+  summarize(w=max(ThreadDuration,na.rm = TRUE) ) %>%
+  spread( Phase, w)
+
+
+# Get LOS by clinic-day
+visits %>%
+  group_by(Clinic_ymd) %>%
+  summarize(los=mean(LOS,na.rm = TRUE) )
+
+
+# See if diagnoses differ by time period
+Diag_by_phase = visits %>%
+  group_by(Phase,Diagnosis_Group) %>%
+  summarize(n=n() ) %>%
+  spread( Diagnosis_Group, n)
+
+# See if actions differ by time period
+# cds_435_1 is a dataframe with one row for each clinic_day and ~300 columns for the actions
+Action_by_phase = cds_435_1 %>%
+  group_by(Phase) %>%
+  summarize_at(8:307,median )
+
 }
From 505f09ced5226f6082b22841ea0eee9ef6117a34 Mon Sep 17 00:00:00 2001
From: Brian Pentland
Date: Thu, 13 Dec 2018 06:12:50 -0500
Subject: [PATCH 20/31] fixed function for updating Role and Role_ID

---
 Data Prep/ACHR_Batch_PreProcess.R | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/Data Prep/ACHR_Batch_PreProcess.R b/Data Prep/ACHR_Batch_PreProcess.R
index 05ac916..af432be 100644
--- a/Data Prep/ACHR_Batch_PreProcess.R
+++ b/Data Prep/ACHR_Batch_PreProcess.R
@@ -22,6 +22,8 @@ redo_ACHR_data_from_scratch <- function(fname){
 
+  setwd("~/Documents/@NSF Healthcare Routines/Deidentified_Derm_data_Nov_2018")
+  fname = 'audit_111818'
 
   # Pick your point of view, or multiple POV...
@@ -37,7 +39,10 @@ redo_ACHR_data_from_scratch <- function(fname){
 
   # Thread occurrences adds threadNum and seqNum to each thread, as defined by the thread_CF.
   # Threads are always WITHIN VISITS in this code -- so they can be whole visits or chunks of visits (e.g., visit_ID_Role)
-  occ = thread_occurrences( o, THREAD_CF ,fname )
+  # make two versions -- one for whole visits and one for visit_ID_Role
+  ot = thread_occurrences( o, 'Visit_ID' ,fname )
+  otr = thread_occurrences( o, c('Visit_ID','Role') ,fname )
+
 
   # Now aggregate by clinic-day for each POV
   clinic_days = ACHR_batch_clinic_days(occ, THREAD_CF, EVENT_CF, fname)
@@ -133,7 +138,7 @@ cleanOccBatch <- function(fileRows){
 fix_derm_role_ID <- function(o){
 
   # read in the new role ID
-  rc = read.csv('RoleChangeTable.csv')
+  rc = read.csv('RoleChangeTable.csv', stringsAsFactors = FALSE)
 
 
   # occurrences are a data.table so use :=
@@ -143,8 +148,14 @@ fix_derm_role_ID <- function(o){
     new_rid = rc$NEW_Role_ID[r]
     new_role = rc$NEW_Role[r]
 
-    o[o$Role_ID==old_rid, Role_ID := new_rid]
-    o[o$Role_ID==old_rid, Role := new_role]
+    # pick out rows with old Role_ID
+    # old_rid_vector = o$Role_ID==old_rid
+    #
+    # o[old_rid_vector, Role_ID := new_rid]
+    # o[old_rid_vector, Role := new_role]
+
+    # crazy data table syntax... do it all in one shot
+    o[o$Role_ID==old_rid, ':=' (Role = new_role, Role_ID = new_rid ) ]
 
   }
 
From d5ea7c8987283d5bc7929f0c7ca33fec6914dbd2 Mon Sep 17 00:00:00 2001
From: Brian Pentland
Date: Fri, 14 Dec 2018 07:24:02 -0500
Subject: [PATCH 21/31] Fixed bugs with LOS assignment

---
 Data Prep/ACHR_Batch_Aggregate_Data.R | 15 +++++----------
 Data Prep/ACHR_Batch_PreProcess.R     |  6 +++---
 2 files changed, 8 insertions(+), 13 deletions(-)

diff --git a/Data Prep/ACHR_Batch_Aggregate_Data.R b/Data Prep/ACHR_Batch_Aggregate_Data.R
index 6e10653..bdd2d57 100644
--- a/Data Prep/ACHR_Batch_Aggregate_Data.R
+++ b/Data Prep/ACHR_Batch_Aggregate_Data.R
@@ -136,10 +136,10 @@ ACHR_batch_threads <- function(occ, EVENT_CFs, ALL_CFs) {
   }
 
   # Compute the alignment of the context factors
-  ACHR$CF_Alignment := as.numeric( as.character(ACHR$Action_count)) / as.numeric( as.character(ACHR$ALL_CF_count ))
+  ACHR$CF_Alignment = as.numeric( as.character(ACHR$Action_count)) / as.numeric( as.character(ACHR$ALL_CF_count ))
 
-  # convert level of service to 1-5 integer
-  ACHR$LOS := convert_LOS( occ$LOS_CPT )
+  # convert level of service to 1-5 integer
+  ACHR$LOS = convert_LOS( firstOcc$LOS_CPT )
 
   # Merge the results with the first row from each thread
   print('Merging results...')
@@ -252,11 +252,6 @@ ACHR_batch_visit_role_threads <- function(occ, EVENT_CFs, ALL_CFs, visits) {
     for (cf in ALL_CFs){ ACHR[b, paste0(cf,"_countVR") := length(unique(df[[cf]])) ] }
   }
 
-  # Compute the alignment of the context factors
-  # ACHR$CF_AlignmentVR = as.numeric( as.character(ACHR$Action_countVR)) / as.numeric( as.character(ACHR$ALL_CF_countVR ))
-
-  # convert level of service to 1-5 integer
-  ACHR$LOS := convert_LOS( occ$LOS_CPT )
 
   # Merge the results with the first row from each thread
   print('Merging results...')
@@ -554,8 +549,8 @@ convert_LOS <- function(los_cpt){
   los = as.numeric(s)
 
-  # convert 9 and NA to 1
-  los[los==9]=1
-  los[is.na(los)]=1
+  # leave NA alone; convert the unclassifiable 9 codes to NA as well
+  los[los==9]=NA
+
 
 return(los)
 
diff --git a/Data Prep/ACHR_Batch_PreProcess.R b/Data Prep/ACHR_Batch_PreProcess.R
index af432be..549c8aa 100644
--- a/Data Prep/ACHR_Batch_PreProcess.R
+++ b/Data
Prep/ACHR_Batch_PreProcess.R @@ -32,7 +32,7 @@ redo_ACHR_data_from_scratch <- function(fname){ ALL_CF = c('Action','Role','Workstation') # first read the data - o = read_ACHR_data( fname ) + o = read_ACHR_data( fname ) # nrow(o) = 7728927 # fix the role IDs -- needs file called RoleChangeTable.csv in the local directory o = fix_derm_role_ID(o) @@ -40,8 +40,8 @@ redo_ACHR_data_from_scratch <- function(fname){ # Thread occurrences adds threadNum and seqNum to each thread, as defined by the the thread_CF. # Threads are always WITHIN VISITS in this code -- so they can be whole visits or chunks of visits (e.g., visit_ID_Role) # make two versions -- one for whole visits and one for visit_ID_Role - ot = thread_occurrences( o, 'Visit_ID' ,fname ) - otr = thread_occurrences( o, c('Visit_ID','Role') ,fname ) + ot = thread_occurrences( o, 'Visit_ID' ,fname ) # nrow(ot)= 57835 + otr = thread_occurrences( o, c('Visit_ID','Role') ,fname ) # nrow(otr) = 527666 # Now aggregate by clinic-day for each POV From 91202e290b3a8b6cd41a0ea0da205cdc77ca732d Mon Sep 17 00:00:00 2001 From: Brian Pentland Date: Tue, 18 Dec 2018 23:19:11 -0500 Subject: [PATCH 22/31] Added function to find and count patterns of action --- .gitignore | 1 + Data Prep/ACHR_Batch_PreProcess.R | 9 +++-- Tables and Figures/Clinic_day_plots.R | 47 +++++++++++++++++++++++---- 3 files changed, 49 insertions(+), 8 deletions(-) diff --git a/.gitignore b/.gitignore index 998975a..496052d 100644 --- a/.gitignore +++ b/.gitignore @@ -34,3 +34,4 @@ emrt+Visit_ID+Role_Action_Workstation.Rdata emr+Visit_ID.Rdata Thrds+threadNum+Action_Role_Workstation.Rdata Nov 25 am workspace.RData +deleteme.rData diff --git a/Data Prep/ACHR_Batch_PreProcess.R b/Data Prep/ACHR_Batch_PreProcess.R index 549c8aa..3b46176 100644 --- a/Data Prep/ACHR_Batch_PreProcess.R +++ b/Data Prep/ACHR_Batch_PreProcess.R @@ -48,10 +48,15 @@ redo_ACHR_data_from_scratch <- function(fname){ clinic_days = ACHR_batch_clinic_days(occ, THREAD_CF, EVENT_CF, fname) # Aggregate by visit for each POV - visits = ACHR_batch_visits(occ, THREAD_CF, EVENT_CF, fname) + # Whole visits + visits = ACHR_batch_visits(ot, THREAD_CF, EVENT_CF, fname) + + # And then by role + VRThreads <- ACHR_batch_visit_role_threads(otr, c('Action'), c('Action','Role','Workstation') , visits) + # compute trajectories to see difference from a reference graph - traj = graph_trajectory( occ, 'Clinic_ymd', EVENT_CF, 2, 1, 0, 'Clinic_trajectory') + traj = graph_trajectory( ot, 'Clinic_ymd', EVENT_CF, 2, 1, 0, 'Clinic_trajectory') } diff --git a/Tables and Figures/Clinic_day_plots.R b/Tables and Figures/Clinic_day_plots.R index d49a4ef..af69091 100644 --- a/Tables and Figures/Clinic_day_plots.R +++ b/Tables and Figures/Clinic_day_plots.R @@ -77,13 +77,15 @@ plots_for_papers <- function(){ Clinic_role_pct = otr %>% group_by(Clinic, Role) %>% summarize(n=n()) %>% - mutate(RolePct = n/sum(n)) + mutate(RolePct = n/sum(n)) %>% + select(Clinic, Role, RolePct) %>% + spread(Role, RolePct) # get LOS by clinic based on visits Clinic_LOS_pct = visits %>% group_by(Clinic, LOS_CPT) %>% summarize(n=n()) %>% - mutate(LOSPct = n/sum(n)) %<% + mutate(LOSPct = n/sum(n)) %>% spread(Clinic, Role) # Make contingency table Clinic x LOS @@ -118,7 +120,7 @@ chisq.test(C_R) # Get the handoffs per visit. 
Use the distinct chunks in the Visit-Role threads (VRThrds) - Role_Handoffs = VRThrds %>% + Role_Handoffs = VRThreads %>% group_by(Clinic,Visit_ID) %>% summarize(ho = n_distinct(threadNumVR)) @@ -136,11 +138,16 @@ chisq.test(C_R) Clinic_AVG_daily_staffing = Clinic_daily_staffing %>% group_by(Clinic,Role) %>% - summarize(AVG_staff=mean(total_staff)) %>% + summarize(AVG_staff=mean(total_staff, na.rm = TRUE)) %>% + spread( Role, AVG_staff) + + Clinic_AVG_daily_staffing = Clinic_daily_staffing %>% + group_by(Clinic,Role) %>% + summarize(AVG_staff=median(total_staff, na.rm = TRUE)) %>% spread( Role, AVG_staff) # let's look at the actions by clinic_role, using the visit_role threads -actions_by_role = VRThrds %>% +actions_by_role = VRThreads %>% group_by(Clinic,Role_VR) %>% summarize(avgActions = mean(Action_countVR)) %>% spread( Role_VR, avgActions) @@ -153,7 +160,7 @@ visits %>% # Get LOS by clinic-day -visits %>% +LOS_by_clinic_day = visits %>% group_by(Clinic_ymd) %>% summarize(los=mean(LOS,na.rm = TRUE) ) @@ -182,6 +189,34 @@ remove_outliers <- function(x, na.rm = TRUE, ...) { y } +# converts visit_ID into character string with factor levels for actions +make_visit_string <- function(occ, vid){ + + return(paste(as.character(as.numeric(occ$Action[occ$Visit_ID==as.character(vid)])),collapse = '-')) +} + +# Converts list of actions into regex of factor levels +make_process_pattern <- function(lvls, p){ + + return( paste( sapply(p, function(x) {as.character(which(lvls==x))}) , collapse = '.*') ) +} + +# Find process patterns +find_process_pattern <- function(occ, p, vid){ + + x = make_process_pattern( levels(ot$Action), p) + print(x) + + m = sapply(vid, function(v) { + (grepl( x, make_visit_string(occ, v))) } ) + # + + +# m = grep( x, make_visit_string(occ, vid)) + + return(m) + +} # # make_box_plots <- function(){ From 415aae73bc9828275daba8eb4579dd44918ca4c8 Mon Sep 17 00:00:00 2001 From: Brian Pentland Date: Fri, 21 Dec 2018 13:22:42 -0500 Subject: [PATCH 23/31] minor changes --- Data Prep/ACHR_Clinic_Trajectories_v3.R | 2 ++ Tables and Figures/Clinic_day_plots.R | 33 ++++++++++++++++++++++--- 2 files changed, 32 insertions(+), 3 deletions(-) diff --git a/Data Prep/ACHR_Clinic_Trajectories_v3.R b/Data Prep/ACHR_Clinic_Trajectories_v3.R index 2328452..21002bf 100644 --- a/Data Prep/ACHR_Clinic_Trajectories_v3.R +++ b/Data Prep/ACHR_Clinic_Trajectories_v3.R @@ -178,7 +178,9 @@ graph_trajectory <- function(e, bucket_CFs, cf, n_gram_size=2, reference_day=1, return(df) } +################################################################################## +################################################################################## graph_trajectory_week <- function(e, bucket_CFs, cf, n_gram_size=2, reference_day=1, filter_threshold=0,keep_ngram_vectors=FALSE, save_file_name='deleteme') { # add column for bucket_CF if needed diff --git a/Tables and Figures/Clinic_day_plots.R b/Tables and Figures/Clinic_day_plots.R index af69091..6172796 100644 --- a/Tables and Figures/Clinic_day_plots.R +++ b/Tables and Figures/Clinic_day_plots.R @@ -44,11 +44,38 @@ plots_for_papers <- function(){ ggplot(data = cdt, aes(x = ymd.x, y = rollmean( Dist_from_reference, 5,na.pad=TRUE), group=Clinic.x)) + geom_line(aes(color=Clinic.x)) + theme(axis.text.x=element_blank(), axis.text.y=element_blank()) - ggplot(data = cdt_435[,1:6], aes(x = ymd, y = rollmean( Dist_from_reference, 10,na.pad=TRUE), group=Clinic)) + - geom_line(aes(color=Clinic)) + theme(axis.text.x=element_blank()) + ggplot(data = cds_435_1 %>% 
filter(Phase=='one'), aes(x = ymd.x, y = rollmean( Dist_from_reference, 5,na.pad=TRUE), group=Clinic.x)) + + geom_line(aes(color=Clinic.x)) + theme(axis.text.x=element_blank()) + ggplot(data = cds_435_1 %>% filter(Phase=='two'), aes(x = ymd.x, y = rollmean( Dist_from_reference, 5,na.pad=TRUE), group=Clinic.x)) + + geom_line(aes(color=Clinic.x)) + theme(axis.text.x=element_blank()) + + ggplot(data = cds_435_1 %>% filter(Phase=='three'), aes(x = ymd.x, y = rollmean( Dist_from_reference, 5,na.pad=TRUE), group=Clinic.x)) + + geom_line(aes(color=Clinic.x)) + theme(axis.text.x=element_blank()) + + ggplot(data = cds_435_1 %>% filter(Phase=='four'), aes(x = ymd.x, y = rollmean( Dist_from_reference, 5,na.pad=TRUE), group=Clinic.x)) + + geom_line(aes(color=Clinic.x)) + theme(axis.text.x=element_blank()) + + ggplot(data = cds_435_1 %>% filter(Phase=='five'), aes(x = ymd.x, y = rollmean( Dist_from_reference, 5,na.pad=TRUE), group=Clinic.x)) + + geom_line(aes(color=Clinic.x)) + theme(axis.text.x=element_blank()) - + # Here are the pairs + ggplot(data = cds_435 %>% filter(Phase=='one'), aes(x = ymd.x, y = rollmean( Dist_from_reference, 5,na.pad=TRUE), group=Clinic.x)) + + geom_line(aes(color=Clinic.x)) + theme(axis.text.x=element_blank()) + + ggplot(data = cds_435 %>% filter(Phase=='two'), aes(x = ymd.x, y = rollmean( Dist_from_reference, 5,na.pad=TRUE), group=Clinic.x)) + + geom_line(aes(color=Clinic.x)) + theme(axis.text.x=element_blank()) + + ggplot(data = cds_435 %>% filter(Phase=='three'), aes(x = ymd.x, y = rollmean( Dist_from_reference, 5,na.pad=TRUE), group=Clinic.x)) + + geom_line(aes(color=Clinic.x)) + theme(axis.text.x=element_blank()) + + ggplot(data = cds_435 %>% filter(Phase=='four'), aes(x = ymd.x, y = rollmean( Dist_from_reference, 5,na.pad=TRUE), group=Clinic.x)) + + geom_line(aes(color=Clinic.x)) + theme(axis.text.x=element_blank()) + + ggplot(data = cds_435 %>% filter(Phase=='five'), aes(x = ymd.x, y = rollmean( Dist_from_reference, 5,na.pad=TRUE), group=Clinic.x)) + + geom_line(aes(color=Clinic.x)) + theme(axis.text.x=element_blank()) + + # ROle change over time ggplot(data = cds, aes(x = ymd, y = Dist_from_reference, group=Role)) + geom_line(aes(color=Role)) From 47e1507fba2415e1534aef62361f0704b167359a Mon Sep 17 00:00:00 2001 From: Brian Pentland Date: Fri, 21 Dec 2018 13:23:33 -0500 Subject: [PATCH 24/31] added plots for clinic days --- Tables and Figures/Clinic_day_plots.R | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/Tables and Figures/Clinic_day_plots.R b/Tables and Figures/Clinic_day_plots.R index 6172796..18a6ce7 100644 --- a/Tables and Figures/Clinic_day_plots.R +++ b/Tables and Figures/Clinic_day_plots.R @@ -41,9 +41,7 @@ plots_for_papers <- function(){ ggplot(data = cdt, aes(x = ymd.x, y = Dist_from_reference, group=Clinic.x)) + geom_line(aes(color=Clinic.x)) + theme(axis.text.x=element_blank(), axis.text.y=element_blank()) - ggplot(data = cdt, aes(x = ymd.x, y = rollmean( Dist_from_reference, 5,na.pad=TRUE), group=Clinic.x)) + - geom_line(aes(color=Clinic.x)) + theme(axis.text.x=element_blank(), axis.text.y=element_blank()) - + ggplot(data = cds_435_1 %>% filter(Phase=='one'), aes(x = ymd.x, y = rollmean( Dist_from_reference, 5,na.pad=TRUE), group=Clinic.x)) + geom_line(aes(color=Clinic.x)) + theme(axis.text.x=element_blank()) @@ -75,6 +73,11 @@ plots_for_papers <- function(){ ggplot(data = cds_435 %>% filter(Phase=='five'), aes(x = ymd.x, y = rollmean( Dist_from_reference, 5,na.pad=TRUE), group=Clinic.x)) + geom_line(aes(color=Clinic.x)) + 
theme(axis.text.x=element_blank())
 
+  # these are based on sub-sets of the data by phase
+  ggplot(data = cdp2, aes(x = ymd, y = rollmean( Dist_from_reference, 5,na.pad=TRUE), group=Clinic)) +
+    geom_line(aes(color=Clinic)) + theme(axis.text.x=element_blank(), axis.text.y=element_blank())
+
 
   # Role change over time
   ggplot(data = cds, aes(x = ymd, y = Dist_from_reference, group=Role)) + geom_line(aes(color=Role))
From 2c891d59adaeb097f2cf7b3d70cb33b8d63bf2de Mon Sep 17 00:00:00 2001
From: Brian Pentland
Date: Sun, 6 Jan 2019 21:12:12 -0500
Subject: [PATCH 25/31] dual window for self comparison

---
 Data Prep/ACHR_Clinic_Trajectories_v3.R |  2 +-
 Tables and Figures/Clinic_day_plots.R   | 14 ++++++++++++++
 2 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/Data Prep/ACHR_Clinic_Trajectories_v3.R b/Data Prep/ACHR_Clinic_Trajectories_v3.R
index 21002bf..06cd5ad 100644
--- a/Data Prep/ACHR_Clinic_Trajectories_v3.R
+++ b/Data Prep/ACHR_Clinic_Trajectories_v3.R
@@ -122,7 +122,7 @@ graph_trajectory <- function(e, bucket_CFs, cf, n_gram_size=2, reference_day=1,
 
   for (i in 1:nWindows){
 
-    # get the merged list
+    # get the merged list of unique ngrams
     vtmerge = merge(x=vt_unique, y=vt[vt$bid==i,], by='ngrams', all.x = TRUE)
 
     # use the bid.y to get the whole vector, but replace the NA with zeros
diff --git a/Tables and Figures/Clinic_day_plots.R b/Tables and Figures/Clinic_day_plots.R
index 18a6ce7..b105277 100644
--- a/Tables and Figures/Clinic_day_plots.R
+++ b/Tables and Figures/Clinic_day_plots.R
@@ -248,6 +248,20 @@ find_process_pattern <- function(occ, p, vid){
 
 }
 
+dualWindowCosine <- function(df, clinic_col, colrange, windowsize){
+
+
+  rdf = data.frame(clinic = df[windowsize:(nrow(df)-windowsize),..clinic_col],
+                   ymd = df[windowsize:(nrow(df)-windowsize),ymd],
+                   corr = sapply(windowsize:(nrow(df)-windowsize),
+                                 function(x) {
+                                   distance(rbind( colSums(df[(x-windowsize+1):x,..colrange]),colSums(df[(x+1):(x+windowsize), ..colrange ]) ),
+                                            method='cosine' ) })
+  )
+  return(rdf)
+}
+
+
 #
 # make_box_plots <- function(){
 #   ggboxplot(ACHR_test[NEvents>100 & Clinic=='DRH'], x = "VisitMonth", y = "NetComplexity",
From c2a1df228a71c197bc5e8cdd5bd8aa5821f6f763 Mon Sep 17 00:00:00 2001
From: Brian Pentland
Date: Thu, 18 Apr 2019 21:24:55 -0400
Subject: [PATCH 26/31] many changes

---
 Data Prep/ACHR_Batch_Aggregate_Data.R   | 464 ++----------------------
 Data Prep/ACHR_Batch_V3.R               |  45 ++-
 Data Prep/ACHR_Clinic_Trajectories_v3.R | 150 +++++++-
 Data Prep/Comparison_functions.R        |  17 +
 4 files changed, 230 insertions(+), 446 deletions(-)

diff --git a/Data Prep/ACHR_Batch_Aggregate_Data.R b/Data Prep/ACHR_Batch_Aggregate_Data.R
index bdd2d57..8accb84 100644
--- a/Data Prep/ACHR_Batch_Aggregate_Data.R
+++ b/Data Prep/ACHR_Batch_Aggregate_Data.R
@@ -20,6 +20,8 @@
 # c) merge them back together
 # d) Probably need to do different versions for visit vs. visit_role because a lot of the parameters make no sense...
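+# A quick hand-check of the graph entropy idea used below (log() here is the natural log):
+# the sequence A B A B has 2-grams {A B, B A, A B}, so the edge proportions are 2/3 and 1/3,
+# and H = -(2/3)*log(2/3) - (1/3)*log(1/3), which is about 0.64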
+# April 2019 Added compute_graph_entropy + ################################################################### ################################################################### # This is function is set up to aggregate the occurrences within threads -- typically visits @@ -27,7 +29,7 @@ # TN = threadNum in most cases # CFs can be chosen # event_CFs define changes within threads -# ALL_CFs are used to computer the CF_alignment +# ALL_CFs are used to compute the CF_alignment @@ -125,11 +127,13 @@ ACHR_batch_threads <- function(occ, EVENT_CFs, ALL_CFs) { ACHR[b, Nodes:=nrow(n$nodeDF) ] ACHR[b,Edges:=nrow(n$edgeDF) ] ACHR[b,CompressRatio := compression_index(df,new_event_col) ] - ACHR[b,Entropy := compute_entropy(table(df[[new_event_col]])[table(df[[new_event_col]])>0]) ] + # ACHR[b,Entropy := compute_entropy(table(df[[new_event_col]])[table(df[[new_event_col]])>0]) ] + ACHR[b,Entropy := compute_graph_entropy( df[[new_event_col]]) ] ACHR[b,NumProcedures := count_procedures(df$Proc[1]) ] ACHR[b,NumDiagnoses := count_diagnoses(df$Diag[1]) ] ACHR[b,ALL_CF_count := length(unique(df[[all_cf_col]])) ] - ACHR[b,ALL_CF_entropy := compute_entropy(table(df[[all_cf_col]])[table(df[[all_cf_col]])>0]) ] + # ACHR[b,ALL_CF_entropy := compute_entropy(table(df[[all_cf_col]])[table(df[[all_cf_col]])>0]) ] + ACHR[b,ALL_CF_entropy := compute_graph_entropy( df[[all_cf_col]]) ] # Count the unique elements in each cf for (cf in ALL_CFs){ ACHR[b, paste0(cf,"_count") := length(unique(df[[cf]])) ] } @@ -244,9 +248,12 @@ ACHR_batch_visit_role_threads <- function(occ, EVENT_CFs, ALL_CFs, visits) { ACHR[b, NodesVR:=nrow(n$nodeDF) ] ACHR[b,EdgesVR:=nrow(n$edgeDF) ] ACHR[b,CompressRatioVR := compression_index(df,new_event_col) ] - ACHR[b,EntropyVR := compute_entropy(table(df[[new_event_col]])[table(df[[new_event_col]])>0]) ] + # ACHR[b,EntropyVR := compute_entropy(table(df[[new_event_col]])[table(df[[new_event_col]])>0]) ] + ACHR[b,EntropyVR := compute_graph_entropy( df[[new_event_col]]) ] + ACHR[b,ALL_CF_countVR := length(unique(df[[all_cf_col]])) ] - ACHR[b,ALL_CF_entropyVR := compute_entropy(table(df[[all_cf_col]])[table(df[[all_cf_col]])>0]) ] + # ACHR[b,ALL_CF_entropyVR := compute_entropy(table(df[[all_cf_col]])[table(df[[all_cf_col]])>0]) ] + ACHR[b,ALL_CF_entropyVR := compute_graph_entropy( df[[all_cf_col]]) ] # Count the unique elements in each cf for (cf in ALL_CFs){ ACHR[b, paste0(cf,"_countVR") := length(unique(df[[cf]])) ] } @@ -650,11 +657,14 @@ ACHR_batch_clinic_days <- function(occ,TN='Clinic_ymd', EVENT_CFs, ALL_CFs) { cds[b,Nodes := nrow(n$nodeDF) ] cds[b,Edges := nrow(n$edgeDF) ] cds[b,CompressRatio := compression_index(df,new_event_col) ] - cds[b,Entropy := compute_entropy(table(df[[new_event_col]])[table(df[[new_event_col]])>0]) ] + # cds[b,Entropy := compute_entropy(table(df[[new_event_col]])[table(df[[new_event_col]])>0]) ] + cds[b,Entropy := compute_graph_entropy( df[[new_event_col]]) ] + cds[b,NumProceduresPerDay := count_daily_procedures(df) ] cds[b,CF_Alignment := 1 ] # make placeholder, but compute below cds[b,ALL_CF_count := length(unique(df[[all_cf_col]])) ] - cds[b,ALL_CF_entropy := compute_entropy(table(df[[all_cf_col]])[table(df[[all_cf_col]])>0]) ] + # cds[b,ALL_CF_entropy := compute_entropy(table(df[[all_cf_col]])[table(df[[all_cf_col]])>0]) ] + cds[b,ALL_CF_entropy := compute_graph_entropy( df[[all_cf_col]]) ] # Count the unique elements in each cf for (cf in ALL_CFs){ cds[b, paste0(cf,"_countVR") := length(unique(df[[cf]])) ] } @@ -671,429 +681,19 @@ ACHR_batch_clinic_days <- 
function(occ,TN='Clinic_ymd', EVENT_CFs, ALL_CFs) { } +compute_graph_entropy <- function(s){ + + # guard against insufficient data... + if (length(s) <2) return(0) + + # convert s into text vector + text_vector = concatenate(s) + + # get the 2-grams in the sequence s. 'prop' is the proportion of each edge. It sums to 1. + p = get.phrasetable(ngram(text_vector,2))[['prop']] + + # return Shannon entropy + return(-sum(p*log(p))) + +} -################################################################### -################################################################### -## OLD ### -## OLD ### -## OLD ### -# ACHR_batch_threads_old <- function(occ,TN, EVENT_CFs, ALL_CFs) { -# -# library(tidyr) -# library(dplyr) -# library(ThreadNet) -# library(ngram) -# library(lubridate) -# library(stringr) -# -# new_event_col = newColName(EVENT_CFs) -# if (!(new_event_col %in% colnames(occ))) { occ = combineContextFactors(occ,EVENT_CFs,new_event_col) } -# -# # pick subsets --here we aggregate one thread at a time -# bucket_list <- make_buckets_1(occ, TN) -# -# # get the size (number of buckets) -# N = length(bucket_list) -# c -# # pre-allocate the result -# ACHR = data.table(bucket=integer(N), -# NEvents = integer(N), -# ThreadStart= character(N), -# ThreadStartInt = integer(N), -# ThreadDuration =double(N), -# NetComplexity=double(N), -# Nodes=double(N), -# Edges=double(N), -# CompressRatio = double(N), -# Entropy = double(N), -# NumProcedures = double(N), -# NumDiagnoses = double(N), -# wait_time = double(N), -# Visit_number = integer(N), -# LOC_CPT = character(N), -# Visit_ID = character(N), -# Subject_ID = character(N), -# Clinic = character(N), -# ymd = character(N), -# Clinic_ymd = character(N), -# Procedure = character(N), -# Diagnosis = character(N), # diag in the raw data -# Diagnosis_group = character(N), -# Physician = character(N), -# Weekday = character(N), -# Month = character(N), -# Phase = character(N), -# CF_Alignment = double(N) -# ) -# -# # Now add columns for the CFs. There will be two for each CF -# for (cf in ALL_CFs){ -# ACHR[, paste0(cf,"_count"):= double(N)] -# ACHR[, paste0(cf,"_entropy"):= double(N)] -# } -# -# # loop through the buckets. 
Result will be data frame with one row per bucket -# for (i in 1:N){ -# -# b = i # as.integer(bucket_list[i]) -# -# # print once every 100 visits -# if (b%%100==0) {print(b)} -# -# # select the threads that go in this bucket -# df = occ[occ[[TN]] ==bucket_list[i] ] -# -# # bucket number -# ACHR[b,bucket := b] -# -# # length of the thread (number of rows) -# ACHR[b,NEvents := nrow(df)] -# -# # only do the computations if there are more than two occurrences -# if (nrow(df) > 2) { -# -# # compute the duration of the visit in hours -# ACHR[b,ThreadDuration := difftime(max(lubridate::ymd_hms(df$tStamp)), min(lubridate::ymd_hms(df$tStamp)), units='hours') ] -# -# # compressibility of DV -# ACHR[b,CompressRatio := compression_index(df,new_event_col)] -# # ACHR[b,AR_CompressRatio := compression_index(df,DV2)] -# # ACHR[b,ARW_CompressRatio := compression_index(df,DV3)] -# -# # NetComplexity of DV -# # First get the network -# # NetComplexity of DV -# # First get the network -# n = threads_to_network_original(df,TN, new_event_col) -# ACHR[b,NetComplexity := estimate_network_complexity( n )] -# ACHR[b,Nodes := nrow(n$nodeDF) ] -# ACHR[b,Edges := nrow(n$edgeDF) ] -# -# -# # compute stuff on each context factor -# for (cf in ALL_CFs){ -# -# # Count the unique elements in each cf -# ACHR[b, paste0(cf,"_count") := length(unique(df[[cf]])) ] -# -# # get the entropy -# ACHR[b, paste0(cf,"_entropy") := compute_entropy(table(df[[cf]])[table(df[[cf]])>0]) ] -# -# } -# } # kf nrows > 2 -# -# # Now copy in the rest of data -# # this only works because one visit is one bucket -# -# # ACHR[b,'VisitStart' := as.POSIXct( df[1,'tStamp']) ] -# ACHR[b,'ThreadStartInt' := df[1,'tStamp'] ] -# -# ACHR[b,'ThreadStart' := as.character( df[1,'tStamp'] ) ] -# -# ACHR[b,'Visit_ID' := df[1,'Visit_ID']] -# ACHR[b,'Subject_ID' := df[1,'Subject_ID']] -# ACHR[b,'Clinic' := df[1,'Clinic']] -# ACHR[b,'Procedure' := df[1,'Proc']] -# ACHR[b,'Diagnosis' := df[1,'Diag']] -# ACHR[b,'Diagnosis_group' := df[1,'Diagnosis_Group']] -# ACHR[b,'Physician' := df[1,'Physician']] -# ACHR[b,'Weekday' := df[1,'Weekday']] -# ACHR[b,'Month' := df[1,'Month']] -# -# -# -# } # loop thru buckets -# -# -# save(ACHR, file=paste0(paste('Threads',TN,new_event_col,sep='+'), '.Rdata')) -# -# -# # return the table -# return(ACHR) -# } - - -################################################################### -################################################################### -# ACHR_batch_clinic_days <- function(occ,TN, EVENT_CFs, ALL_CFs) { -# -# # Name for column that has events -- three variations -# DV1= newColName(CFs[1]) -# DV2= newColName(CFs[1:2]) -# DV3= newColName(CFs[1:3]) -# -# print(DV1) -# print(DV2) -# print(DV3) -# -# # name for column that defines the buckets -# BU = 'Clinic_ymd' -# -# # first get the date only -# # occ$ymd <- format(as.POSIXct(occ$tStamp),"%Y-%m-%d") -# -# # make new columns as needed for clinic + day and events -# # occ = unite(occ, 'Clinic_ymd', c('Clinic','ymd'),sep='_',remove = 'false') -# -# -# -# # pick subsets -- one visit at a time in this version, but could be more -# bucket_list <- unique(occ[['Clinic_ymd']]) -# -# # get the size (number of buckets) -# N = length(bucket_list) -# print(N) -# -# # pre-allocate the data.table. Tables are supposed to be faster. 
-# ACHR = data.table(bucket=integer(N), -# Clinic_date = character(N), -# YMD_date = character(N), -# NEvents = integer(N), -# NumVisits = numeric(N), -# A_NetComplexity=double(N), -# AR_NetComplexity=double(N), -# ARW_NetComplexity=double(N), -# A_Nodes=double(N), -# AR_Nodes=double(N), -# ARW_Nodes=double(N), -# A_Edges=double(N), -# AR_Edges=double(N), -# ARW_Edges=double(N), -# A_CompressRatio = double(N), -# AR_CompressRatio = double(N), -# ARW_CompressRatio = double(N), -# A_Entropy = double(N), -# AR_Entropy = double(N), -# ARW_Entropy = double(N), -# Clinic = character(N), -# NumUniqueProcedures = numeric(N), -# NumUniqueDiagnosisGroups = numeric(N), -# NumPhysicians = numeric(N), -# Weekday = character(N), -# Month = character(N) -# ) -# -# # Now add columns for the IVs. There will be three for each IV -# -# # Add the IV columns -# for (cf in CFs){ -# -# ACHR[, paste0(cf,"_count"):= double(N)] -# ACHR[, paste0(cf,"_compression"):= double(N)] -# ACHR[, paste0(cf,"_entropy"):= double(N)] -# -# } -# -# # loop through the buckets. Result will be data frame with one row per bucket -# for (i in 1:N){ -# -# b = i # as.integer(bucket_list[i]) -# -# # print once every 10 buckets -# if (b%%10==0) {print(b)} -# -# # select the threads that go in this bucket -# df = occ[occ[[BU]] ==bucket_list[i] ] -# -# # bucket number -# ACHR[b,bucket := b] -# -# # length of the thread (number of rows) -# ACHR[b,NEvents := nrow(df)] -# -# # only do the computations if there are more than two occurrences -# if (nrow(df) > 2) { -# -# # compute the duration of the visit in hours -# # ACHR[b,VisitDuration := difftime(max(lubridate::ymd_hms(df$tStamp)), min(lubridate::ymd_hms(df$tStamp)), units='hours') ] -# -# # compressibility of DV -# ACHR[b,A_CompressRatio := compression_index(df,DV1)] -# ACHR[b,AR_CompressRatio := compression_index(df,DV2)] -# ACHR[b,ARW_CompressRatio := compression_index(df,DV3)] -# -# # NetComplexity of DV -# # First get the network -# n = threads_to_network_original(df,TN, DV1) -# ACHR[b,A_NetComplexity := estimate_network_complexity( n )] -# ACHR[b,A_Nodes := nrow(n$nodeDF) ] -# ACHR[b,A_Edges := nrow(n$edgeDF) ] -# -# n = threads_to_network_original(df,TN, DV2) -# ACHR[b,AR_NetComplexity := estimate_network_complexity( n )] -# ACHR[b,AR_Nodes := nrow(n$nodeDF) ] -# ACHR[b,AR_Edges := nrow(n$edgeDF) ] -# -# n = threads_to_network_original(df,TN, DV3) -# ACHR[b,ARW_NetComplexity := estimate_network_complexity( n )] -# ACHR[b,ARW_Nodes := nrow(n$nodeDF) ] -# ACHR[b,ARW_Edges := nrow(n$edgeDF) ] -# -# # get the entropy for AR and ARW -# ACHR[b, AR_Entropy := compute_entropy(table(df[[DV2]])[table(df[[DV2]])>0]) ] -# ACHR[b, ARW_Entropy := compute_entropy(table(df[[DV3]])[table(df[[DV3]])>0]) ] -# -# -# # compute stuff on each context factor -# for (cf in CFs){ -# -# # Count the unique elements in each cf -# ACHR[b, paste0(cf,"_count") := length(unique(df[[cf]])) ] -# -# # get the compression -# ACHR[b, paste0(cf,"_compression") := compression_index(df,cf) ] -# -# # get the entropy -# ACHR[b, paste0(cf,"_entropy") := compute_entropy(table(df[[cf]])[table(df[[cf]])>0]) ] -# -# } -# } # kf nrows > 2 -# -# # Now copy in the rest of data -# # this works because one visit is one bucket -# # count the number of different diagnoses --> typical indicators of complexity -# -# -# ACHR[b,'Clinic_date' := df[1,'Clinic_ymd']] -# ACHR[b,'YMD_date' := df[1,'ymd']] -# ACHR[b,'NumVisits' := length(unique(df[['Visit_ID']]))] -# ACHR[b,'Clinic' := df[1,'Clinic']] -# ACHR[b,'NumUniqueProcedures' := 
length(unique(df[['Proc']]))] -# ACHR[b,'NumUniqueDiagnosisGroups' := length(unique(df[['Diagnosis_Group']]))] -# ACHR[b,'NumPhysicians' := length(unique(df[['Physician']]))] -# ACHR[b,'Weekday' := df[1,'Weekday']] -# ACHR[b,'Month' := df[1,'Month']] -# -# -# -# } # loop thru buckets -# -# # copy this one for consistency -# ACHR[,'A_Entropy' := ACHR[,'Action_entropy'] ] -# -# # return the table -# return(ACHR) -# } - - - -# need to bring Role_ID forward when looking at Visit_Role, so we can track residents over time. - -# ACHR_batch_threads_no_merge_old <- function(occ,THREAD_CFs, EVENT_CFs, ALL_CFs) { -# -# library(tidyr) -# library(dplyr) -# library(ThreadNet) -# library(ngram) -# library(lubridate) -# library(stringr) -# library(data.table) -# -# # Add columns for combinations of CFs if needed -# thread_col = newColName(THREAD_CFs) -# TN = thread_col -# if (!(thread_col %in% colnames(occ))) { occ = combineContextFactors(occ,THREAD_CFs,thread_col) } -# -# -# new_event_col = newColName(EVENT_CFs) -# if (!(new_event_col %in% colnames(occ))) { occ = combineContextFactors(occ,EVENT_CFs,new_event_col) } -# -# all_cf_col = newColName(ALL_CFs) -# if (!(all_cf_col %in% colnames(occ))) { occ = combineContextFactors(occ,ALL_CFs,all_cf_col) } -# -# # set key on the data.table -# setkeyv(occ, TN) -# -# # get the list of buckets -# bucket_list <- unique(occ[[TN]]) -# -# # print the number of buckets -# print(paste0('Number of buckets=', length(bucket_list) )) -# -# -# # make data frame with results -# Thrds = data.table( t(sapply( bucket_list, -# function(b){ -# -# # select a subset -# df= occ[get(TN)==b] -# -# # make sure it is sorted by timestamp -# df=df[order(df$tStamp),] -# -# # get the network -- only if there are enough rows... -# -# if (nrow(df)>2) n = threads_to_network_original(df,TN, new_event_col) -# else n = list(edgeDF=t(c(0)),nodeDF=t(c(0))) -# -# # compute each parameter and put them in a vector -# c( -# bucket=b, -# Clinic = as.character(df$Clinic[1]), -# Physician = as.character(df$Physician[1]), -# ymd = as.character(df$ymd[1]), -# Clinic_ymd = as.character(df$Clinic_ymd[1]), -# Weekday = as.character(df$Weekday[1]), -# Month = as.character(df$Month[1]), -# Phase = compute_phase(df$tStamp[1]), -# threadNum = as.numeric(df$threadNum[1]), -# Visit_ID = as.character(df$Visit_ID[1]), -# Subject_ID = as.character( df$Subject_ID[1] ), -# Role_ID = as.character( df$Role_ID[1] ), -# NEvents = nrow(df), -# ThreadStart= as.character(df$tStamp[1] ), -# ThreadDuration= compute_thread_duration(df), -# VisitDuration= compute_visit_duration(df), -# wait_time1 = compute_wait_time1(df), -# wait_time2 = compute_wait_time2(df), -# Visit_number = as.numeric(df[1,'Visitnum_Total']), -# LOC_CPT = as.character(df[1,'LOS_CPT']), -# NetComplexity=estimate_network_complexity( n ), -# Nodes=nrow(n$nodeDF), -# Edges=nrow(n$edgeDF), -# CompressRatio = compression_index(df,new_event_col), -# Entropy = compute_entropy(table(df[[new_event_col]])[table(df[[new_event_col]])>0]), -# NumProcedures = count_procedures(df$Proc[1]), -# NumDiagnoses = count_diagnoses(df$Diag[1]), -# Proc = as.character(df[1,'Proc']), -# Diagnosis = as.character(df[1,'Diag']), -# Diagnosis_group = as.character(df[1,'Diagnosis_Group']), -# CF_Alignment = 1, # make placeholder, but compute below -# ALL_CF_count = length(unique(df[[all_cf_col]])), -# ALL_CF_entropy = compute_entropy(table(df[[all_cf_col]])[table(df[[all_cf_col]])>0]), -# -# # name these columns afterwards -# sapply(ALL_CFs, function(cf){ -# c( length(unique(df[[cf]])) -# # , 
-# # compute_entropy(table(df[[cf]])[table(df[[cf]])>0]) -# ) }) -# -# ) -# } ))) -# -# -# # name the last columns -- code has to match above -# cn = as.vector(sapply(ALL_CFs, function(cf){ -# c( paste0(cf,"_count") -# # , -# # paste0(cf,"_entropy") -# ) })) -# -# # now assign them to the last columns -# last_col = ncol(Thrds) -# first_col= last_col-length(cn)+1 -# setnames(Thrds, c(first_col:last_col), cn) -# -# # Compute the alignment of the context factors -# Thrds$CF_Alignment = as.numeric( as.character(Thrds$Action_count)) / as.numeric( as.character(Thrds$ALL_CF_count )) -# -# save_file_name = paste0(paste('Thrds',TN,new_event_col,sep='+'), '.Rdata') -# save(Thrds, file=save_file_name) -# -# print(paste('Saved ', nrow(Thrds), " records in ",save_file_name)) -# -# return(Thrds) -# -# } -# diff --git a/Data Prep/ACHR_Batch_V3.R b/Data Prep/ACHR_Batch_V3.R index a404eec..3160953 100644 --- a/Data Prep/ACHR_Batch_V3.R +++ b/Data Prep/ACHR_Batch_V3.R @@ -281,9 +281,13 @@ ACHR_batch_clinic_days <- function(occ,TN, CFs) { ACHR[b,ARW_Nodes := nrow(n$nodeDF) ] ACHR[b,ARW_Edges := nrow(n$edgeDF) ] + # # get the entropy for AR and ARW + # ACHR[b, AR_Entropy := compute_entropy(table(df[[DV2]])[table(df[[DV2]])>0]) ] + # ACHR[b, ARW_Entropy := compute_entropy(table(df[[DV3]])[table(df[[DV3]])>0]) ] + # get the entropy for AR and ARW - ACHR[b, AR_Entropy := compute_entropy(table(df[[DV2]])[table(df[[DV2]])>0]) ] - ACHR[b, ARW_Entropy := compute_entropy(table(df[[DV3]])[table(df[[DV3]])>0]) ] + ACHR[b, AR_Entropy := compute_graph_entropy_TEST(df[[DV2]]) ] + ACHR[b, ARW_Entropy := compute_graph_entropy_TEST(df[[DV3]]) ] # compute stuff on each context factor @@ -296,7 +300,10 @@ ACHR_batch_clinic_days <- function(occ,TN, CFs) { ACHR[b, paste0(cf,"_compression") := compression_index(df,cf) ] # get the entropy - ACHR[b, paste0(cf,"_entropy") := compute_entropy(table(df[[cf]])[table(df[[cf]])>0]) ] + # ACHR[b, paste0(cf,"_entropy") := compute_entropy(table(df[[cf]])[table(df[[cf]])>0]) ] + + # get the graph entropy + ACHR[b, paste0(cf,"_entropy") := compute_graph_entropy_TEST(df[[cf]]) ] } } # kf nrows > 2 @@ -442,9 +449,12 @@ for (i in 1:N){ ACHR[b,ARW_Nodes := nrow(n$nodeDF) ] ACHR[b,ARW_Edges := nrow(n$edgeDF) ] - # get the entropy for AR and ARW - ACHR[b, AR_Entropy := compute_entropy(table(df[[DV2]])[table(df[[DV2]])>0]) ] - ACHR[b, ARW_Entropy := compute_entropy(table(df[[DV3]])[table(df[[DV3]])>0]) ] + # # get the entropy for AR and ARW + # ACHR[b, AR_Entropy := compute_entropy(table(df[[DV2]])[table(df[[DV2]])>0]) ] + # ACHR[b, ARW_Entropy := compute_entropy(table(df[[DV3]])[table(df[[DV3]])>0]) ] + + ACHR[b, AR_Entropy := compute_graph_entropy_TEST( df[[DV2]] ) ] + ACHR[b, ARW_Entropy := compute_graph_entropy_TEST( df[[DV3]] ) ] # compute stuff on each context factor @@ -457,8 +467,9 @@ for (i in 1:N){ ACHR[b, paste0(cf,"_compression") := compression_index(df,cf) ] # get the entropy - ACHR[b, paste0(cf,"_entropy") := compute_entropy(table(df[[cf]])[table(df[[cf]])>0]) ] - + # ACHR[b, paste0(cf,"_entropy") := compute_entropy(table(df[[cf]])[table(df[[cf]])>0]) ] + ACHR[b, paste0(cf,"_entropy") := compute_graph_entropy_TEST( df[[cf]]) ] + } } # kf nrows > 2 @@ -539,3 +550,21 @@ cleanOccBatch <- function(fileRows){ get_timeScale <- function(){'hr'} +# create a function to compute graph entropy +# It will use the standard Shannon entropy, but it will apply it to the frequencies of edges in the +# adjacency matrix. 
To do this, we just need to count the 2-grams in the sequence and divide by the total +# number of 2-grams to get the probabilities. You should be able to pass in any sequence. +# NOTE: This version will work on one sequence at a time (eg. one patient visit) +# Needs to be revised to work on groups of visits (e.g,. clinic-days) +compute_graph_entropy_TEST <- function(s){ + + # first convert s into text vector + text_vector = concatenate(s) + + # get the 2-grams in the sequence s. ng$prop is the proportion of each edge. It sums to 1. + p = get.phrasetable(ngram(text_vector,2))[['prop']] + + # return Shannon entropy + return(-sum(p*log(p))) + +} diff --git a/Data Prep/ACHR_Clinic_Trajectories_v3.R b/Data Prep/ACHR_Clinic_Trajectories_v3.R index 06cd5ad..0bd2cd0 100644 --- a/Data Prep/ACHR_Clinic_Trajectories_v3.R +++ b/Data Prep/ACHR_Clinic_Trajectories_v3.R @@ -62,6 +62,136 @@ get_bucket <- function(o, b ){ # graph_trajectory <- function(e, bucket_CFs, cf, n_gram_size=2, reference_day=1, filter_threshold=0,keep_ngram_vectors=FALSE, save_file_name='deleteme') { + # add column for bucket_CF if needed + bucket_col = newColName(bucket_CFs) + if (!(bucket_col %in% colnames(e))) { e = combineContextFactors(e,bucket_CFs,bucket_col) } + + # here is the sorted list of buckets + blist = sort(unique(e[[bucket_col]])) + + # now many buckets? + nb = length(blist) + # print(blist) + print(paste('Number of buckets =',nb)) + + # make data frame for results + vt=data.frame( ngrams=character(), freq=integer(), bid=integer(),bname=character() ) + + bcount=0 + # scan through the data + for (b in blist){ + bcount= bcount +1 + # print(paste('b =',b)) + + # THIS CHECK NEEDS TO COVER WHOLE CHUNK OF CODE... + # get text vector for the whole data set. Bucket needs at least 2 threads + # n_gram_size = 1 for 1-grams, n_gram_size = 2 for pairs. 
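+    # pull every occurrence row for this bucket (e.g., one clinic-day when bucket_CFs = 'Clinic_ymd')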
+    th = e[ e[[bucket_col]] ==b , ]
+
+    if (nrow(th)>3)
+    {
+      ngdf = count_ngrams(th, 'threadNum', cf, n_gram_size)[1:2]
+
+
+      nodes = length(unique(th[[cf]]))
+
+
+      # print(paste('nrow ngdf =',nrow(ngdf)))
+
+      # add the bucket number and name
+      ngdf$bid = bcount
+      ngdf$bname = b
+
+      # append the columns to the end
+      # vt is the whole set of all ngrams in all the windows
+      vt=rbind(vt,ngdf)
+    }
+  }
+
+
+  # convert to factor
+  vt$ngrams = factor(vt$ngrams)
+
+  # nWindows = length(unique(vt$bid))
+  nWindows = nb
+  #print(paste('nWindows =',nWindows))
+
+  # get the set of unique ngrams for the whole data set
+  vt_unique = data.frame(ngrams=unique(vt$ngrams))
+  #print(vt_unique)
+
+  # put the results here
+  windowFreqMatrix = matrix(0,nrow=nWindows, ncol=nrow(vt_unique))
+
+  for (i in 1:nWindows){
+
+    # get the merged list of unique ngrams
+    vtmerge = merge(x=vt_unique, y=vt[vt$bid==i,], by='ngrams', all.x = TRUE)
+
+    # use the bid.y to get the whole vector, but replace the NA with zeros
+    bb=vtmerge[vtmerge$bid==i,'freq']
+    bb[is.na(bb)] <- 0
+
+    windowFreqMatrix[i,]=bb
+
+    print( paste(i, 'num non-zero=', sum(windowFreqMatrix[i,] > 0)))
+
+  }
+
+  # #Make a matrix to filter by where all the entries are 0 < f < 1
+  # fm = windowFreqMatrix/max(windowFreqMatrix)
+  #
+  # # if the number is below the threshold, set it to zero
+  # windowFreqMatrix[fm<=filter_threshold] = 0
+
+  df = data.table(
+    Clinic_ymd = character(nWindows),
+    Clinic = character(nWindows),
+    # Role_ymd = character(nWindows),
+    # Role = character(nWindows),
+    ymd = character(nWindows),
+    pct_retained = double(nWindows),
+    complexity = double(nWindows),
+    Dist_from_reference = double(nWindows),
+    Dist_from_next = double(nWindows)
+  )
+
+  for (i in 1:(nWindows-1) ) {
+
+    df[i, Clinic_ymd := blist[i] ]
+    df[i, Clinic := unlist(strsplit(blist[i],'_',fixed=TRUE))[1] ]
+    df[i, ymd := unlist(strsplit(blist[i],'_',fixed=TRUE))[2] ]
+
+    # df[i, Role_ymd := as.character(blist[i]) ]
+    # df[i, Role := unlist(strsplit(as.character(blist[i]),'+',fixed=TRUE))[1] ]
+    # df[i, ymd := unlist(strsplit(as.character(blist[i]),'+',fixed=TRUE))[2] ]
+
+    df[i, pct_retained := sum((windowFreqMatrix[i,]>0)+(windowFreqMatrix[i+1,]>0)==2)/sum(windowFreqMatrix[i,]>0) ]
+    df[i, complexity := estimate_task_complexity_index( nodes ,sum(windowFreqMatrix[i,] > 0) ) ]
+    df[i, Dist_from_reference := distance(rbind(windowFreqMatrix[i,],windowFreqMatrix[reference_day,]), method='cosine' ) ]
+    df[i, Dist_from_next := distance(rbind(windowFreqMatrix[i,],windowFreqMatrix[i+1,]), method='cosine' ) ]
+  }
+
+
+  # get the ngram data and labels
+  if (keep_ngram_vectors) {
+    b_df=as.data.frame(windowFreqMatrix[1:(nWindows-1),])
+    colnames(b_df)=vt_unique$ngrams
+
+    # stick the ngram frequencies on the end for good measure
+    df = cbind(df,b_df)
+  }
+
+  # save the result and return it, too
+  save(df, file=paste0(save_file_name,'.rData') )
+
+  return(df)
+
+}
+##################################################################################
+
+graph_trajectory_ROLES <- function(e, bucket_CFs, cf, n_gram_size=2, reference_day=1, filter_threshold=0,keep_ngram_vectors=FALSE, save_file_name='deleteme') {
+
   # add column for bucket_CF if needed
   bucket_col = newColName(bucket_CFs)
   if (!(bucket_col %in% colnames(e))) { e = combineContextFactors(occ,bucket_CFs,bucket_col) }
@@ -142,8 +272,10 @@ graph_trajectory <- function(e, bucket_CFs, cf, n_gram_size=2, reference_day=1,
 #   windowFreqMatrix[fm<=filter_threshold] = 0
 
   df = data.table(
-    Clinic_ymd = character(nWindows),
-    Clinic = character(nWindows),
+
+graph_trajectory_ROLES <- function(e, bucket_CFs, cf, n_gram_size=2, reference_day=1, filter_threshold=0,keep_ngram_vectors=FALSE, save_file_name='deleteme') {
+
+  # add column for bucket_CF if needed
+  bucket_col = newColName(bucket_CFs)
+  if (!(bucket_col %in% colnames(e))) { e = combineContextFactors(occ,bucket_CFs,bucket_col) }
@@ -142,8 +272,10 @@ graph_trajectory <- function(e, bucket_CFs, cf, n_gram_size=2, reference_day=1,
   # windowFreqMatrix[fm<=filter_threshold] = 0
 
   df = data.table(
-    Clinic_ymd = character(nWindows),
-    Clinic = character(nWindows),
+    # Clinic_ymd = character(nWindows),
+    # Clinic = character(nWindows),
+    Role_ymd = character(nWindows),
+    Role = character(nWindows),
     ymd = character(nWindows),
     pct_retained = double(nWindows),
     complexity = double(nWindows),
@@ -153,10 +285,16 @@ graph_trajectory <- function(e, bucket_CFs, cf, n_gram_size=2, reference_day=1,
 
   for (i in 1:(nWindows-1) ) {
 
-    df[i, Clinic_ymd := blist[i] ]
-    df[i, Clinic := unlist(strsplit(blist[i],'_'))[1] ]
-    df[i, ymd := unlist(strsplit(blist[i],'_'))[2] ]
-    df[i, pct_retained := sum((windowFreqMatrix[i,]>0)+(windowFreqMatrix[i+1,]>0)==2)/sum(windowFreqMatrix[i,]>0) ]
+    # df[i, Clinic_ymd := blist[i] ]
+    # df[i, Clinic := unlist(strsplit(blist[i],'_'))[1] ]
+    # df[i, ymd := unlist(strsplit(blist[i],'_'))[2] ]
+
+    df[i, Role_ymd := as.character(blist[i]) ]
+    df[i, Role := unlist(strsplit(as.character(blist[i]),'+',fixed=TRUE))[1] ]
+    df[i, ymd := unlist(strsplit(as.character(blist[i]),'+',fixed=TRUE))[2] ]
+
+
+    df[i, pct_retained := sum((windowFreqMatrix[i,]>0)+(windowFreqMatrix[i+1,]>0)==2)/sum(windowFreqMatrix[i,]>0) ]
     df[i, complexity := estimate_task_complexity_index( nodes ,sum(windowFreqMatrix[i,] > 0) ) ]
     df[i, Dist_from_reference := distance(rbind(windowFreqMatrix[i,],windowFreqMatrix[reference_day,]), method='cosine' ) ]
     df[i, Dist_from_next := distance(rbind(windowFreqMatrix[i,],windowFreqMatrix[i+1,]), method='cosine' ) ]
diff --git a/Data Prep/Comparison_functions.R b/Data Prep/Comparison_functions.R
index 7d94256..4c962f9 100644
--- a/Data Prep/Comparison_functions.R
+++ b/Data Prep/Comparison_functions.R
@@ -31,3 +31,20 @@ get_range <- function(o, start_date, end_date){
   o %>% filter(as.Date(tStamp) > start_date & as.Date(tStamp) < end_date)
 
 }
+
+
+# get start dates -- just a place to stick some code
+# April 2019
+
+get_start_dates <- function(){
+
+  # first get the roles that are relatively frequent
+  role_freq2 = as.data.frame(table(ot$Role_ID)[table(ot$Role_ID)>1000])
+
+  # then get their start dates
+  role_freq2$start_date = sapply(role_freq2$Var1, function(x) {min(as.character(ot$tStamp[ot$Role_ID==as.character(x)]))})
+
+  # then trim it so it's more readable.
+  role_freq2$start_date = substr(role_freq2$start_date,1,10)
+
+}
\ No newline at end of file
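Editor's note: get_start_dates() filters a frequency table and then summarizes each remaining level; the same pattern on toy data (the >1000 cutoff is lowered to >2 here, and names() is used in place of the as.data.frame(table(...)) step):

    role <- c('RN','RN','RN','MA','MD','MD','MD')
    ts   <- c('2016-01-05 08:00:00', '2016-02-01 09:00:00', '2016-03-01 10:00:00',
              '2016-04-01 11:00:00', '2016-01-20 12:00:00', '2016-02-02 13:00:00',
              '2016-03-03 14:00:00')

    tab   <- table(role)
    keep  <- names(tab)[tab > 2]                           # the relatively frequent roles
    start <- sapply(keep, function(x) min(ts[role == x]))  # ISO timestamps sort lexically
    substr(start, 1, 10)                                   # trim to YYYY-MM-DD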

From 143ce33c482b3f7a15d532193ccfee12e81cc3c6 Mon Sep 17 00:00:00 2001
From: Brian Pentland
Date: Thu, 18 Apr 2019 21:26:20 -0400
Subject: [PATCH 27/31] ignore

---
 .gitignore | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/.gitignore b/.gitignore
index 496052d..4859490 100644
--- a/.gitignore
+++ b/.gitignore
@@ -35,3 +35,16 @@ emr+Visit_ID.Rdata
 Thrds+threadNum+Action_Role_Workstation.Rdata
 Nov 25 am workspace.RData
 deleteme.rData
+.RDataTmp
+CDT_435_1.rData
+MW2_5 Edges.png
+VRThrds+threadNum+Action.Rdata
+Visits_and_cds_435_for_inkyu.rdata
+cd_role_trajectory.rData
+cd_role_trajectory1.rData
+cd_role_trajectory2.rData
+cd_wk_trajectory1.rData
+cd_wk_trajectory2.rData
+cds+Clinic_ymd+Action_Role_Workstation.Rdata
+ot-otr-vrthreads-for-inkyu.rdata
+trajectory2grams.rData

From 3ccce6ebdc28a3f701ddb26154f1cb981490a584 Mon Sep 17 00:00:00 2001
From: Brian Pentland
Date: Mon, 6 May 2019 11:17:03 -0400
Subject: [PATCH 28/31] new version of get-ngrams-for-buckets

---
 Data Prep/ACHR_Batch_Aggregate_Data.R   | 14 ++++
 Data Prep/ACHR_Clinic_Trajectories_v3.R | 96 +++++++++++++++++++++++++
 2 files changed, 110 insertions(+)

diff --git a/Data Prep/ACHR_Batch_Aggregate_Data.R b/Data Prep/ACHR_Batch_Aggregate_Data.R
index 8accb84..a300ce0 100644
--- a/Data Prep/ACHR_Batch_Aggregate_Data.R
+++ b/Data Prep/ACHR_Batch_Aggregate_Data.R
@@ -697,3 +697,17 @@ compute_graph_entropy <- function(s){
 
 }
 
+get_action_frequencies <- function(s,n){
+
+  # guard against insufficient data...
+  if (length(s) < n) return(0)
+
+  # convert s into text vector
+  text_vector = concatenate(s)
+
+  # get the n-grams in the sequence s. 'freq' is the frequency
+  return( get.phrasetable(ngram(text_vector, n )) )
+
+}
+
+
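Editor's note: get_action_frequencies() chains three calls from the ngram package (concatenate, ngram, get.phrasetable); a minimal, self-contained usage sketch with a toy action sequence, assuming the ngram package is installed:

    library(ngram)

    s  <- c('check_in', 'rooming', 'vitals', 'exam', 'check_out')
    tv <- concatenate(s)             # one space-delimited string
    get.phrasetable(ngram(tv, 2))    # data frame of 2-grams with freq and prop columns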
diff --git a/Data Prep/ACHR_Clinic_Trajectories_v3.R b/Data Prep/ACHR_Clinic_Trajectories_v3.R
index 0bd2cd0..56376e1 100644
--- a/Data Prep/ACHR_Clinic_Trajectories_v3.R
+++ b/Data Prep/ACHR_Clinic_Trajectories_v3.R
@@ -317,6 +317,102 @@ graph_trajectory_ROLES <- function(e, bucket_CFs, cf, n_gram_size=2, reference_
 }
 ##################################################################################
 
+# e = otr, needs to be set up to include threads that are visit_ID_Role
+# there must be a 'threadNum' assigned in each row
+#
+# cf is the context factor we want to follow, such as 'Action'
+
+
+get_ngrams_for_buckets <- function(e, cf, n_gram_size=2, filter_threshold=2, save_file_name='deleteme') {
+
+
+  # here is the sorted list of buckets
+  blist = sort(unique(e$threadNum))
+
+  # how many buckets?
+  nb = length(blist)
+  # print(blist)
+  print(paste('Number of buckets =',nb))
+
+  # make data frame for results
+  vt=data.frame( ngrams=character(), freq=integer(), bid=integer(),bname=character() )
+
+  bcount=0
+  # scan through the data
+  for (b in blist){
+    bcount= bcount +1
+    # print(paste('b =',b))
+
+    # n_gram_size = 1 for 1-grams, n_gram_size = 2 for pairs
+    th = e[ e[['threadNum']] ==b , ]
+
+    if (nrow(th)>filter_threshold) {
+      ngdf = count_ngrams(th, 'threadNum', cf, n_gram_size)[1:2]
+
+      # add the bucket number and name
+      ngdf$bid = bcount
+      ngdf$bname = b
+
+      # append the columns to the end
+      # vt is the whole set of all ngrams in all the windows
+      vt=rbind(vt,ngdf)
+    }
+  }
+
+
+  # convert to factor
+  vt$ngrams = factor(vt$ngrams)
+
+  # nWindows = length(unique(vt$bid))
+  nWindows = nb
+  #print(paste('nWindows =',nWindows))
+
+  # get the set of unique ngrams for the whole data set
+  vt_unique = data.frame(ngrams=unique(vt$ngrams))
+  #print(vt_unique)
+
+  # put the results here
+  windowFreqMatrix = matrix(0,nrow=nWindows, ncol=nrow(vt_unique))
+
+  for (i in 1:nWindows){
+
+    # get the merged list of unique ngrams for this bucket
+    vtmerge = merge(x=vt_unique, y=vt[vt$bid==i,], by='ngrams', all.x = TRUE)
+
+    # use the bid.y to get the whole vector, but replace the NA with zeros
+    bb=vtmerge[vtmerge$bid==i,'freq']
+    bb[is.na(bb)] <- 0
+
+    windowFreqMatrix[i,]=bb
+
+    print( paste(i, 'num non-zero=', sum(windowFreqMatrix[i,] > 0)))
+
+  }
+
+
+  # get the ngram data and labels
+  # convert matrix to data frame
+  # df=as.data.frame(windowFreqMatrix[1:(nWindows-1),])
+  df=as.data.frame(windowFreqMatrix)
+
+  # add the column names
+  colnames(df)=vt_unique$ngrams
+
+  # add the bucket names
+  df = cbind(as.data.frame(blist), df)
+  colnames(df)[1] <- c("threadNum")
+
+
+  # save the result and return it, too
+  #save(df, file=paste0(save_file_name,'.rData') )
+
+  return(df)
+
+}
 ##################################################################################
 
 graph_trajectory_week <- function(e, bucket_CFs, cf, n_gram_size=2, reference_day=1, filter_threshold=0,keep_ngram_vectors=FALSE, save_file_name='deleteme') {
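Editor's note: both the trajectory functions and get_ngrams_for_buckets() rely on the same merge-and-zero-fill idiom to align one bucket's ngram counts to the global vocabulary; here it is isolated on toy data (simplified to take m$freq directly rather than re-subsetting on bid):

    vt_unique <- data.frame(ngrams = c('a b', 'b c', 'c a'))   # global vocabulary
    vt_i      <- data.frame(ngrams = c('a b', 'c a'), freq = c(4, 1), bid = 1)

    m <- merge(x = vt_unique, y = vt_i, by = 'ngrams', all.x = TRUE)   # left join
    f <- m$freq
    f[is.na(f)] <- 0    # ngrams absent from this bucket get frequency 0
    f                   # c(4, 0, 1): one row of the bucket-by-ngram matrix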

From 6aa02ba4c263c45f506abf519a43078a821d40f0 Mon Sep 17 00:00:00 2001
From: Brian Pentland
Date: Tue, 4 Jun 2019 12:14:32 -0400
Subject: [PATCH 29/31] added notes for ortho/peds changes

---
 Data Prep/ACHR_Batch_Aggregate_Data.R | 24 +++++++++++++++++++++++-
 Data Prep/ACHR_Batch_PreProcess.R     |  6 ++++--
 2 files changed, 27 insertions(+), 3 deletions(-)

diff --git a/Data Prep/ACHR_Batch_Aggregate_Data.R b/Data Prep/ACHR_Batch_Aggregate_Data.R
index a300ce0..4a5416e 100644
--- a/Data Prep/ACHR_Batch_Aggregate_Data.R
+++ b/Data Prep/ACHR_Batch_Aggregate_Data.R
@@ -92,7 +92,12 @@ ACHR_batch_threads <- function(occ, EVENT_CFs, ALL_CFs) {
                      ALL_CF_count = integer(N),
                      ALL_CF_entropy = double(N),
                      CF_Alignment = double(N) )
-
+
+  #################################################
+  # add columns for time of each role (or something)
+
+
   # Now add columns for the CF counts.
   for (cf in ALL_CFs){
     ACHR[, paste0(cf,"_count"):= double(N)]
@@ -135,6 +140,12 @@ ACHR_batch_threads <- function(occ, EVENT_CFs, ALL_CFs) {
     # ACHR[b,ALL_CF_entropy := compute_entropy(table(df[[all_cf_col]])[table(df[[all_cf_col]])>0]) ]
     ACHR[b,ALL_CF_entropy := compute_graph_entropy( df[[all_cf_col]]) ]
 
+    #################################################
+    # Compute time of each role
+    # need to difftime start-end of each thread -- compute_thread_duration
+    # sum up the role chunks for each role within each thread
+
+
     # Count the unique elements in each cf
     for (cf in ALL_CFs){ ACHR[b, paste0(cf,"_count") := length(unique(df[[cf]])) ] }
   }
@@ -160,6 +171,7 @@ ACHR_batch_threads <- function(occ, EVENT_CFs, ALL_CFs) {
 #################################################################################
 # streamlined version for threads that are PART OF A VISIT. So you can merge with visits
 # to get the visit context.  Just compute the minimum for speed.
+# Needs to pass in the visit data to merge
 ACHR_batch_visit_role_threads <- function(occ, EVENT_CFs, ALL_CFs, visits) {
 
   library(tidyr)
@@ -216,6 +228,9 @@ ACHR_batch_visit_role_threads <- function(occ, EVENT_CFs, ALL_CFs, visits) {
                      EntropyVR = double(N),
                      ALL_CF_countVR = integer(N),
                      ALL_CF_entropyVR = double(N) )
+
+  #################################################
+  # add columns for time of each role (or something)
 
   # Now add columns for the CF counts.
   for (cf in ALL_CFs){
@@ -255,6 +270,12 @@ ACHR_batch_visit_role_threads <- function(occ, EVENT_CFs, ALL_CFs, visits) {
     # ACHR[b,ALL_CF_entropyVR := compute_entropy(table(df[[all_cf_col]])[table(df[[all_cf_col]])>0]) ]
     ACHR[b,ALL_CF_entropyVR := compute_graph_entropy( df[[all_cf_col]]) ]
 
+    #################################################
+    # Compute time of each role
+    # need to difftime start-end of each thread -- compute_thread_duration
+    # sum up the role chunks for each role within each thread
+
+
     # Count the unique elements in each cf
     for (cf in ALL_CFs){ ACHR[b, paste0(cf,"_countVR") := length(unique(df[[cf]])) ] }
   }
@@ -370,6 +391,7 @@ ACHR_batch_visit_add_columns <- function(occ, EVENT_CFs, visits) {
 }
 ##################################################################################
 # Helper functions
+#################################################################################
 
 compute_phase <- function(t){
   t=as.Date(t)
diff --git a/Data Prep/ACHR_Batch_PreProcess.R b/Data Prep/ACHR_Batch_PreProcess.R
index 3b46176..96800d3 100644
--- a/Data Prep/ACHR_Batch_PreProcess.R
+++ b/Data Prep/ACHR_Batch_PreProcess.R
@@ -28,7 +28,8 @@ redo_ACHR_data_from_scratch <- function(fname){
   # Pick your point of view, or multiple POV...
   THREAD_CF = c('Visit_ID')
-  EVENT_CF = c('Action','Role','Workstation')
+  # EVENT_CF = c('Action','Role','Workstation')
+  EVENT_CF = c('Action')
   ALL_CF = c('Action','Role','Workstation')
 
   # first read the data
@@ -41,7 +42,8 @@ redo_ACHR_data_from_scratch <- function(fname){
   # Threads are always WITHIN VISITS in this code -- so they can be whole visits or chunks of visits (e.g., visit_ID_Role)
   # make two versions -- one for whole visits and one for visit_ID_Role
   ot = thread_occurrences( o, 'Visit_ID' ,fname )   # nrow(ot)= 57835
-  otr = thread_occurrences( o, c('Visit_ID','Role') ,fname )   # nrow(otr) = 527666
+  # otr = thread_occurrences( o, c('Visit_ID','Role') ,fname )   # nrow(otr) = 527666
+  otrw = thread_occurrences( o, c('Visit_ID','Role','Workstation') ,fname )   # nrow(otrw) > 527666
 
   # Now aggregate by clinic-day for each POV
 

From 9ffb0407907049d7e1d95d25abc0c42d58e7495b Mon Sep 17 00:00:00 2001
From: Brian Pentland
Date: Wed, 7 Aug 2019 10:21:13 -0400
Subject: [PATCH 30/31] Added new procedures to measure all the multiple networks

---
 Data Prep/ACHR_Batch_Aggregate_Data.R |   2 +-
 Data Prep/ACHR_Batch_V3.R             | 298 +++++++++++++++++++++++---
 2 files changed, 273 insertions(+), 27 deletions(-)

diff --git a/Data Prep/ACHR_Batch_Aggregate_Data.R b/Data Prep/ACHR_Batch_Aggregate_Data.R
index 4a5416e..3e11d1d 100644
--- a/Data Prep/ACHR_Batch_Aggregate_Data.R
+++ b/Data Prep/ACHR_Batch_Aggregate_Data.R
@@ -535,7 +535,7 @@ count_procedures <- function( p ){
   if (is.na( p )) return( 0 )
 
   # get the overall number of items
-  total_num = str_count( p, '#@#')
+  total_num = str_count( p, '#@#')+1
 
   # adjust for the visit codes that are not actual procedures
   v=c('99211','99212','99213','99214','99215','99201','99202','99203','99204','99205')
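Editor's note: the '+1' in the count_procedures hunk above fixes a delimiter-versus-item off-by-one; a two-line check with stringr:

    library(stringr)

    p <- '99213#@#90471#@#90715'   # three codes joined by two delimiters
    str_count(p, '#@#')            # 2 -- counts separators, not items
    str_count(p, '#@#') + 1        # 3 -- matches length(strsplit(p, '#@#')[[1]])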
diff --git a/Data Prep/ACHR_Batch_V3.R b/Data Prep/ACHR_Batch_V3.R
index 3160953..3cfa233 100644
--- a/Data Prep/ACHR_Batch_V3.R
+++ b/Data Prep/ACHR_Batch_V3.R
@@ -271,15 +271,15 @@ ACHR_batch_clinic_days <- function(occ,TN, CFs) {
     ACHR[b,A_Nodes := nrow(n$nodeDF) ]
     ACHR[b,A_Edges := nrow(n$edgeDF) ]
 
-    n = threads_to_network_original(df,TN, DV2)
-    ACHR[b,AR_NetComplexity := estimate_network_complexity( n )]
-    ACHR[b,AR_Nodes := nrow(n$nodeDF) ]
-    ACHR[b,AR_Edges := nrow(n$edgeDF) ]
-
-    n = threads_to_network_original(df,TN, DV3)
-    ACHR[b,ARW_NetComplexity := estimate_network_complexity( n )]
-    ACHR[b,ARW_Nodes := nrow(n$nodeDF) ]
-    ACHR[b,ARW_Edges := nrow(n$edgeDF) ]
+    # n = threads_to_network_original(df,TN, DV2)
+    # ACHR[b,AR_NetComplexity := estimate_network_complexity( n )]
+    # ACHR[b,AR_Nodes := nrow(n$nodeDF) ]
+    # ACHR[b,AR_Edges := nrow(n$edgeDF) ]
+    #
+    # n = threads_to_network_original(df,TN, DV3)
+    # ACHR[b,ARW_NetComplexity := estimate_network_complexity( n )]
+    # ACHR[b,ARW_Nodes := nrow(n$nodeDF) ]
+    # ACHR[b,ARW_Edges := nrow(n$edgeDF) ]
 
     # # get the entropy for AR and ARW
     # ACHR[b, AR_Entropy := compute_entropy(table(df[[DV2]])[table(df[[DV2]])>0]) ]
@@ -334,8 +334,10 @@ ACHR_batch_clinic_days <- function(occ,TN, CFs) {
   return(ACHR)
 }
 
-###################################################################
-###################################################################
+##################################################################
+##################################################################
+##################################################################
+##################################################################
 
 
 
@@ -380,6 +382,7 @@ ACHR = data.table(bucket=integer(N),
                   A_Entropy = double(N),
                   AR_Entropy = double(N),
                   ARW_Entropy = double(N),
+                  CF_Alignment = double(N),
                   Visit_ID = character(N),
                   Subject_ID = character(N),
                   Clinic = character(N),
@@ -427,8 +430,8 @@ for (i in 1:N){
 
     # compressibility of DV
     ACHR[b,A_CompressRatio := compression_index(df,DV1)]
-    ACHR[b,AR_CompressRatio := compression_index(df,DV2)]
-    ACHR[b,ARW_CompressRatio := compression_index(df,DV3)]
+    # ACHR[b,AR_CompressRatio := compression_index(df,DV2)]
+    # ACHR[b,ARW_CompressRatio := compression_index(df,DV3)]
 
     # NetComplexity of DV
     # First get the network
@@ -439,23 +442,23 @@ for (i in 1:N){
     ACHR[b,A_Nodes := nrow(n$nodeDF) ]
     ACHR[b,A_Edges := nrow(n$edgeDF) ]
 
-    n = threads_to_network_original(df,TN, DV2)
-    ACHR[b,AR_NetComplexity := estimate_network_complexity( n )]
-    ACHR[b,AR_Nodes := nrow(n$nodeDF) ]
-    ACHR[b,AR_Edges := nrow(n$edgeDF) ]
-
-    n = threads_to_network_original(df,TN, DV3)
-    ACHR[b,ARW_NetComplexity := estimate_network_complexity( n )]
-    ACHR[b,ARW_Nodes := nrow(n$nodeDF) ]
-    ACHR[b,ARW_Edges := nrow(n$edgeDF) ]
+    # n = threads_to_network_original(df,TN, DV2)
+    # ACHR[b,AR_NetComplexity := estimate_network_complexity( n )]
+    # ACHR[b,AR_Nodes := nrow(n$nodeDF) ]
+    # ACHR[b,AR_Edges := nrow(n$edgeDF) ]
+    #
+    n = threads_to_network_original(df,TN, DV3)
+    ACHR[b,ARW_NetComplexity := estimate_network_complexity( n )]
+    ACHR[b,ARW_Nodes := nrow(n$nodeDF) ]
+    ACHR[b,ARW_Edges := nrow(n$edgeDF) ]
 
     # # get the entropy for AR and ARW
     # ACHR[b, AR_Entropy := compute_entropy(table(df[[DV2]])[table(df[[DV2]])>0]) ]
     # ACHR[b, ARW_Entropy := compute_entropy(table(df[[DV3]])[table(df[[DV3]])>0]) ]
-    ACHR[b, AR_Entropy := compute_graph_entropy_TEST( df[[DV2]] ) ]
-    ACHR[b, ARW_Entropy := compute_graph_entropy_TEST( df[[DV3]] ) ]
+    # ACHR[b, AR_Entropy := compute_graph_entropy_TEST( df[[DV2]] ) ]
+    ACHR[b, ARW_Entropy := compute_graph_entropy_TEST( df[[DV3]] ) ]
+    #
 
     # compute stuff on each context factor
     for (cf in CFs){
@@ -491,6 +494,8 @@ for (i in 1:N){
     ACHR[b,'Weekday' := df[1,'Weekday']]
     ACHR[b,'Month' := df[1,'Month']]
 
+    # Compute the alignment of the context factors
+    ACHR[b, 'CF_Alignment' := ACHR[b,Action_count] / ACHR[b,ARW_Nodes] ]
 
 } # loop thru buckets
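Editor's note: CF_Alignment, as computed above, is the ratio of distinct values on one context factor to distinct values on the combined factors; it is 1 when each Action always occurs with the same Role and Workstation, and falls toward 0 as combinations proliferate. A toy reading, with paste() standing in for combineContextFactors() (an assumption about that helper, not its actual code):

    action    <- c('room', 'vitals', 'exam', 'orders', 'exam', 'orders')
    role_wkst <- c('MA+W1', 'MA+W1', 'MD+W2', 'MD+W2', 'MD+W3', 'RN+W1')

    arw <- paste(action, role_wkst, sep = '+')      # combined context factor
    length(unique(action)) / length(unique(arw))    # 4/6 -- imperfect alignment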
@@ -502,6 +507,168 @@ ACHR[,'A_Entropy' := ACHR[,'Action_entropy'] ]
 
 return(ACHR)
 }
 
+##################### ############################################################
+##################### ############################################################
+##################### ############################################################
+
+
+# ACHR stands for Antecedents of Complexity in Healthcare Routines.
+# This function is set up to compute process parameters on thousands of patient visits.
+# This version assumes that CFs = c('Action','Role','Workstation')
+ACHR_batch_visits_all_CFs <- function(occ,TN, CFs) {
+
+  # Get list of columns we want to use/create
+  dv1=newColName(CFs[1])
+  dv2=newColName(CFs[2])
+  dv3=newColName(CFs[3])
+  dv4=newColName(CFs[2:3])
+  dv5=newColName(CFs[1:2])
+  dv6=newColName(CFs[1:3])
+
+  # need to make new columns
+  occ = combineContextFactors(occ,CFs[2:3], newColName(CFs[2:3]))
+  occ = combineContextFactors(occ,CFs[1:2], newColName(CFs[1:2]))
+  occ = combineContextFactors(occ,CFs[1:3], newColName(CFs[1:3]))
+
+  # We will loop thru this list below
+  dv_list = c(dv1, dv2, dv3, dv4, dv5, dv6)
+  print(dv_list)
+
+  # set key on the data.table for the threadNum to speed retrieval
+  occ=as.data.table(occ)
+  setkeyv(occ, TN)
+
+  # pick subsets -- one visit at a time in this version, but could be more
+  bucket_list <- make_buckets_1(occ, 'threadNum')
+
+  # get the size (number of buckets)
+  N = length(bucket_list)
+  # print(bucket_list)
+
+
+  # pre-allocate the data.table. Tables are supposed to be faster.
+  ACHR = data.table(bucket=integer(N),
+                    NEvents = integer(N),
+                    VisitStart= character(N),
+                    VisitDuration=double(N),
+                    CF_Alignment = double(N),
+                    Visit_ID = character(N),
+                    Subject_ID = character(N),
+                    Clinic = character(N),
+                    LOS_CPT = character(N),
+                    NumProcedures = double(N),
+                    NumDiagnoses = double(N),
+                    Procedure = character(N),
+                    Diagnosis = character(N),   # diag in the raw data
+                    Diagnosis_group = character(N),
+                    Physician = character(N),
+                    Weekday = character(N),
+                    Month = character(N)
+  )
+
+  # Now add columns for each dv.
+  for (dv in dv_list){
+
+    ACHR[, paste0(dv,"_nodes"):= double(N)]
+    ACHR[, paste0(dv,"_edges"):= double(N)]
+    ACHR[, paste0(dv,"_complexity"):= double(N)]
+    ACHR[, paste0(dv,"_entropy"):= double(N)]
+
+  }
+
+  # loop through the buckets. Result will be a data frame with one row per bucket
+  for (i in seq(1,N,1)){
+
+    b = i   # as.integer(bucket_list[i])
+
+    # bucket number
+    ACHR[b,bucket := b]
+
+    # print once every 100 visits
+    # print(b)
+    # print (bucket_list[i])
+
+    # save data every 1000 visits
+    if (b%%1000==0) {
+      print(b)
+      ACHR_partial = ACHR[ACHR$NEvents>1,]
+      save(ACHR_partial, file=paste0("visits_ARW_",nrow(ACHR_partial),'.Rdata'))}
+
+    # select a subset
+    df= subset(occ, threadNum==bucket_list[i] )
+
+    # only do the computations if there are more than two occurrences
+    if (nrow(df) > 2) {
+
+      # make sure it is sorted by timestamp
+      df=df[order(df$tStamp),]
+
+
+      # length of the thread (number of rows)
+      ACHR[b,NEvents := nrow(df)]
+      # print( nrow(df) )
+
+
+      # compute the duration of the visit in hours
+      ACHR[b,VisitDuration := difftime(max(lubridate::ymd_hms(df$tStamp)), min(lubridate::ymd_hms(df$tStamp)), units='hours') ]
+
+      # compute stuff on each DV in the list
+      for (dv in dv_list){
+
+        n = threads_to_network_local(df,TN, dv)
+
+        ACHR[b,paste0(dv,"_complexity") := estimate_network_complexity( n )]
+        ACHR[b,paste0(dv,"_nodes") := nrow(n$nodeDF) ]
+        ACHR[b,paste0(dv,"_edges") := nrow(n$edgeDF) ]
+        ACHR[b, paste0(dv,"_entropy") := compute_graph_entropy( df[[dv]] ) ]
+
+        # print (dv)
+        # print( nrow(n$nodeDF) )
+        # print( length(unique(df[[dv]])))
+
+      }
+    } # nrow(df) > 2
+
+    # Now copy in the rest of data
+    # this only works because one visit is one bucket
+    ACHR[b,'VisitStart' := as.character(df[1,'tStamp']) ]
+
+    ACHR[b,Visit_ID := df[1,'Visit_ID']]
+    ACHR[b,Subject_ID := df[1,'Subject_ID']]
+    ACHR[b,Clinic := df[1,'Clinic']]
+    ACHR[b,LOS_CPT := df[1,'LOS_CPT']]
+    ACHR[b,Procedure := df[1,'Proc']]
+    ACHR[b,Diagnosis := df[1,'Diag']]
+    ACHR[b,NumProcedures := count_procedures(df$Proc[1]) ]
+    ACHR[b,NumDiagnoses := count_diagnoses(df$Diag[1]) ]
+    ACHR[b,Diagnosis_group := df[1,'Diagnosis_Group']]
+    ACHR[b,Physician := df[1,'Physician']]
+    ACHR[b,Weekday := df[1,'Weekday']]
+    ACHR[b,Month := df[1,'Month']]
+
+    # Compute the alignment of the context factors
+    # This assumes CFs = c('Action','Role','Workstation')
+    # print( ACHR[b,Action_nodes] )
+    # print( ACHR[b,Action_Role_Workstation_nodes] )
+    ACHR[b, 'CF_Alignment' := ACHR[b,Action_nodes] / ACHR[b,Action_Role_Workstation_nodes] ]
+
+  } # loop thru buckets
+
+  # save the data one last time
+  save(ACHR, file=paste0("visits_ARW_",b,'.Rdata'))
+
+  # return the table
+  return(ACHR)
+}
+
+
+##################### ############################################################
+##################### ############################################################
+##################### ############################################################
+##################### ############################################################
+##################### ############################################################
+##################### ############################################################
+
 # Each bucket is a list of thread numbers that can be used to subset the list of occurrences
 make_buckets_1 <- function(o, criteria){
@@ -559,7 +726,10 @@ get_timeScale <- function(){'hr'}
 
 compute_graph_entropy_TEST <- function(s){
 
   # first convert s into text vector
-  text_vector = concatenate(s)
+  text_vector = long_enough( concatenate(s) , 2, ' ')
+  # text_vector = concatenate(s)
+
+  # if (length(text_vector)<2) {return(0)}
 
   # get the 2-grams in the sequence s. ng$prop is the proportion of each edge.  It sums to 1.
   p = get.phrasetable(ngram(text_vector,2))[['prop']]
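Editor's note: the value computed above is the Shannon entropy of the 2-gram proportions, and long_enough() (added in the next hunk) drops threads too short to yield any 2-gram. The same two steps by hand, assuming the ngram package:

    library(ngram)

    tv <- c('a b c a b', 'a')                           # second thread has no 2-grams
    tv <- tv[sapply(strsplit(tv, ' '), length) >= 2]    # what long_enough(tv, 2, ' ') keeps

    p <- get.phrasetable(ngram(tv, 2))$prop             # proportions sum to 1
    -sum(p * log(p))                                    # ~1.04 nats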
@@ -568,3 +738,79 @@ compute_graph_entropy_TEST <- function(s){
 
   return(-sum(p*log(p)))
 
 }
+# to avoid errors in count_ngrams, make sure the length of each thread in the text_vector tv is longer than the n-gram size, n
+# this gets used in various places so need to pass in the delimiter
+long_enough = function(tv,n,delimiter){
+
+  return(tv[ unlist(lapply(1:length(tv), function(i) {length(unlist(strsplit(tv[[i]],delimiter)))>=n})) ])
+
+}
+
+threads_to_network_local <- function(et,TN,CF,grp='threadNum'){
+
+  # print(head(et))
+  #
+  # print(paste('CF=', CF))
+  # print(paste('grp=', grp))
+
+  # First get the node names & remove the spaces
+  node_label = levels(factor(et[[CF]]))   # unique(et[[CF]])
+  node_label=str_replace_all(node_label," ","_")
+  nNodes = length(node_label)
+
+  # print("node_label")
+  # print(node_label)
+  # print(paste('nNodes=', nNodes))
+
+  node_group=character()
+  for (n in 1:nNodes){
+    # hardcoded threadNum for data.table syntax
+    node_group = c(node_group, as.character(unlist( et[which(et[[CF]]==node_label[n]),threadNum][1]) ) )
+  }
+
+  # set up the data frames we need to draw the network
+  nodes = data.frame(
+    id = 1:length(node_label),
+    label = node_label,
+    Group = node_group,
+    title=node_label)
+
+  # get the 2-grams for the edges
+  ngdf = count_ngrams(et,TN, CF, 2)
+
+  # Adjust the frequency of the edges to 0-1 range
+  ngdf$freq = round(ngdf$freq/max(ngdf$freq),3)
+
+  # need to split 2-grams into from and to
+  from_to_str = str_split(str_trim(ngdf$ngrams), " ", n=2)
+
+  # need to find a better way to do this...
+  nEdges = length(from_to_str)
+  # from_labels=matrix(data="", nrow=nEdges,ncol=1)
+  # to_labels =matrix(data="", nrow=nEdges,ncol=1)
+  # from=integer(nEdges)
+  # to=integer(nEdges)
+  # for (i in 1:length(from_to_str)){
+  #
+  #   # Get from and to by splitting the 2-gram
+  #   from_labels[i] = str_split(from_to_str[[i]]," ")[1]
+  #   to_labels[i] = str_split(from_to_str[[i]]," ")[2]
+  #
+  #   # use match to lookup the nodeID from the label...
+  #   from[i] = match(from_labels[i], nodes$label)
+  #   to[i] = match(to_labels[i], nodes$label)
+  # }
+  #
+  # Stopped filtering out selfies (self-loops) July 20, 2019 for Kerstin Sailer bug report
+  edges = data.frame(
+    # from,
+    # to,
+    label = ngdf$freq,
+    Value =ngdf$freq)   # %>% filter(!from==to)
+
+  # print(paste("T2N nodes:",nodes))
+  # print(paste("ngdf = :",ngdf))
+  # print(paste("edges= :",edges))
+
+  return(list(nodeDF = nodes, edgeDF = edges))
+}

From a5bb511507815fbb3f57ca6c204f062c897c6333 Mon Sep 17 00:00:00 2001
From: Brian Pentland
Date: Thu, 29 Aug 2019 14:40:01 -0400
Subject: [PATCH 31/31] bug fix

---
 Data Prep/ACHR_Batch_V3.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Data Prep/ACHR_Batch_V3.R b/Data Prep/ACHR_Batch_V3.R
index 3cfa233..bb80ee7 100644
--- a/Data Prep/ACHR_Batch_V3.R
+++ b/Data Prep/ACHR_Batch_V3.R
@@ -577,7 +577,7 @@ ACHR_batch_visits_all_CFs <- function(occ,TN, CFs) {
   }
 
   # loop through the buckets. Result will be a data frame with one row per bucket
-  for (i in seq(1,N,1)){
+  for (i in seq(N,1,-1)){
 
     b = i   # as.integer(bucket_list[i])