diff --git a/.gitignore b/.gitignore index dd0b92d..4859490 100644 --- a/.gitignore +++ b/.gitignore @@ -26,3 +26,25 @@ vignettes/*.pdf rsconnect/ >>>>>>> be0331537859f5addf650269855aa0a9d49ca5d5 .Rproj.user +emrt+Visit_ID+Role_Action.Rdata +Thrds+threadNum+Action.Rdata +Threads+threadNum+Action.Rdata +Threads+threadNum+Action_Role.Rdata +emrt+Visit_ID+Role_Action_Workstation.Rdata +emr+Visit_ID.Rdata +Thrds+threadNum+Action_Role_Workstation.Rdata +Nov 25 am workspace.RData +deleteme.rData +.RDataTmp +CDT_435_1.rData +MW2_5 Edges.png +VRThrds+threadNum+Action.Rdata +Visits_and_cds_435_for_inkyu.rdata +cd_role_trajectory.rData +cd_role_trajectory1.rData +cd_role_trajectory2.rData +cd_wk_trajectory1.rData +cd_wk_trajectory2.rData +cds+Clinic_ymd+Action_Role_Workstation.Rdata +ot-otr-vrthreads-for-inkyu.rdata +trajectory2grams.rData diff --git a/Data Prep/.gitignore b/Data Prep/.gitignore new file mode 100644 index 0000000..e69de29 diff --git a/Data Prep/ACHR_Batch_Aggregate_Data.R b/Data Prep/ACHR_Batch_Aggregate_Data.R new file mode 100644 index 0000000..3e11d1d --- /dev/null +++ b/Data Prep/ACHR_Batch_Aggregate_Data.R @@ -0,0 +1,735 @@ +########################################################################################################## +# THREADNET: Batch processing for larger data sets +# +# ACHR_Batch_Aggregate_Data +# +# (c) 2018 Michigan State University. This software may be used according to the terms provided in the +# GNU General Public License (GPL-3.0) https://opensource.org/licenses/GPL-3.0? +# Brian Pentland +# Absolutely no warranty! +########################################################################################################## + + +# Two functions to aggregate the occurrences +# 1) By thread (usually visit, but could be visit_role) +# 2) By Clinic_day + +# Rewrite for speed: +# a) go back to for loop, but just for the stuff that is computed based on the df +# b) grab first row of each thread seqNum ==1 from the table +# c) merge them back together +# d) Probably need to do different versions for visit vs. visit_role because a lot of the parameters make no sense... + +# April 2019 Added compute_graph_entropy + +################################################################### +################################################################### +# This is function is set up to aggregate the occurrences within threads -- typically visits +# occ = pre-processed threaded occurrences +# TN = threadNum in most cases +# CFs can be chosen +# event_CFs define changes within threads +# ALL_CFs are used to compute the CF_alignment + + + +ACHR_batch_threads <- function(occ, EVENT_CFs, ALL_CFs) { + + library(tidyr) + library(dplyr) + library(ThreadNet) + library(ngram) + library(lubridate) + library(stringr) + library(data.table) + + + # get the first occurrence from each thread -- this will be used for the stuff that never changes, like clinic + firstOcc=occ[occ$seqNum==1,] + + # always used threadNum for the TN + TN = 'threadNum' + + # Add columns for combinations of CFs if needed + # don't need to add new column for the thread_CFs. That one has to be here. 
+ # thread_col = newColName(THREAD_CFs) + # TN = thread_col + # if (!(thread_col %in% colnames(occ))) { occ = combineContextFactors(occ,THREAD_CFs,thread_col) } + + new_event_col = newColName(EVENT_CFs) + if (!(new_event_col %in% colnames(occ))) { occ = combineContextFactors(occ,EVENT_CFs,new_event_col) } + + all_cf_col = newColName(ALL_CFs) + if (!(all_cf_col %in% colnames(occ))) { occ = combineContextFactors(occ,ALL_CFs,all_cf_col) } + + # set key on the data.table for the threadNum + occ=as.data.table(occ) + setkeyv(occ, TN) + + # get the list of buckets + bucket_list <- sort( unique(occ[[TN]]) ) + + # get the number of buckets + N = length(bucket_list) + print(paste0('Number of buckets=', N )) + + # allocate the data table with N rows, then for speed, we use := to update it. + ACHR = data.table(threadNum = integer(N), + Phase = character(N), + NEvents = integer(N), + ThreadDuration =double(N), + VisitDuration= double(N), + wait_time1 = double(N), + wait_time2 = double(N), + LOS =double(N), + NetComplexity=double(N), + Nodes=double(N), + Edges=double(N), + CompressRatio = double(N), + Entropy = double(N), + NumProcedures = double(N), + NumDiagnoses = double(N), + ALL_CF_count = integer(N), + ALL_CF_entropy = double(N), + CF_Alignment = double(N) ) + + ################################################# + # add columns for time of each role (or something) + + + + # Now add columns for the CF counts. + for (cf in ALL_CFs){ + ACHR[, paste0(cf,"_count"):= double(N)] + # ACHR[, paste0(cf,"_compression"):= double(N)] + # ACHR[, paste0(cf,"_entropy"):= double(N)] + } + + # update data table with results - only compute the ones that require looking at the whole thread + for (b in 1:N){ + + if (b %% 1000 ==0) print(paste0('Thread count = ',b)) + + # select a subset + df= occ[get(TN)==bucket_list[b]] + + # make sure it is sorted by timestamp + df=df[order(df$tStamp),] + + # get the network -- only if there are enough rows... 
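+    # (very short threads, with two events or fewer, skip network construction and get a
+    # trivial placeholder below, since they cannot produce a usable edge list)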
+ if (nrow(df)>2) n = threads_to_network_original(df,TN, new_event_col) + else n = list(edgeDF=t(c(0)),nodeDF=t(c(0))) + + # compute each parameter and update the table + ACHR[b,threadNum := bucket_list[b] ] + ACHR[b,Phase := compute_phase(df$tStamp[1])] + ACHR[b,NEvents := nrow(df) ] + ACHR[b,ThreadDuration:= compute_thread_duration(df) ] + ACHR[b,VisitDuration:= compute_visit_duration(df) ] + ACHR[b, wait_time1 := compute_wait_time1(df) ] + ACHR[b, wait_time2 := compute_wait_time2(df) ] + ACHR[b, NetComplexity:=estimate_network_complexity( n ) ] + ACHR[b, Nodes:=nrow(n$nodeDF) ] + ACHR[b,Edges:=nrow(n$edgeDF) ] + ACHR[b,CompressRatio := compression_index(df,new_event_col) ] + # ACHR[b,Entropy := compute_entropy(table(df[[new_event_col]])[table(df[[new_event_col]])>0]) ] + ACHR[b,Entropy := compute_graph_entropy( df[[new_event_col]]) ] + ACHR[b,NumProcedures := count_procedures(df$Proc[1]) ] + ACHR[b,NumDiagnoses := count_diagnoses(df$Diag[1]) ] + ACHR[b,ALL_CF_count := length(unique(df[[all_cf_col]])) ] + # ACHR[b,ALL_CF_entropy := compute_entropy(table(df[[all_cf_col]])[table(df[[all_cf_col]])>0]) ] + ACHR[b,ALL_CF_entropy := compute_graph_entropy( df[[all_cf_col]]) ] + + ################################################# + # Compute time of each role + # need to difftime start-end of each thread -- compute_thread_duration + # sum up the role chunks for each role within each thread + + + # Count the unique elements in each cf + for (cf in ALL_CFs){ ACHR[b, paste0(cf,"_count") := length(unique(df[[cf]])) ] } + } + + # Compute the alignment of the context factors + ACHR$CF_Alignment = as.numeric( as.character(ACHR$Action_count)) / as.numeric( as.character(ACHR$ALL_CF_count )) + + # convert level of service to 1-5 integer + ACHR$LOS = convert_LOS( firstOcc$LOS_CPT ) + + # Merge the results with the first row from each thread + print('Merging results...') + Thrds=merge(x=ACHR, y=firstOcc, by.x='threadNum', by.y ='threadNum' ,all=TRUE) + + save_file_name = paste0(paste('Thrds',TN,new_event_col,sep='+'), '.Rdata') + save(Thrds, file=save_file_name) + + print(paste('Saved ', nrow(Thrds), " records in ",save_file_name)) + + return(Thrds) +} + +################################################################################# +# streamlined version for threads that are PART OF A VISIT. So you can merge with visits +# to get the visit context. Just compute the minimum for speed. +# Needs to pass in the visit data to merge +ACHR_batch_visit_role_threads <- function(occ, EVENT_CFs, ALL_CFs, visits) { + + library(tidyr) + library(dplyr) + library(ThreadNet) + library(ngram) + library(lubridate) + library(stringr) + library(data.table) + + + # get the first occurrence from each thread -- this will be used for the stuff that never changes, like clinic + # firstOcc=occ[occ$seqNum==1,] + + # always used threadNum for the TN + TN = 'threadNum' + + # Add columns for combinations of CFs if needed + # don't need to add new column for the thread_CFs. That one has to be here. 
+ # thread_col = newColName(THREAD_CFs) + # TN = thread_col + # if (!(thread_col %in% colnames(occ))) { occ = combineContextFactors(occ,THREAD_CFs,thread_col) } + + new_event_col = newColName(EVENT_CFs) + if (!(new_event_col %in% colnames(occ))) { occ = combineContextFactors(occ,EVENT_CFs,new_event_col) } + + all_cf_col = newColName(ALL_CFs) + if (!(all_cf_col %in% colnames(occ))) { occ = combineContextFactors(occ,ALL_CFs,all_cf_col) } + + # set key on the data.table for the threadNum + occ=as.data.table(occ) + setkeyv(occ, TN) + + # get the list of buckets + bucket_list <- sort( unique(occ[[TN]]) ) + + # get the number of buckets + N = length(bucket_list) + print(paste0('Number of buckets=', N )) + + # allocate the data table with N rows, then for speed, we use := to update it. + ACHR = data.table(Visit_ID = character(N), + Role_VR = character(N), + Role_ID_VR = character(N), + threadNumVR = integer(N), + NEventsVR = integer(N), + threadStartVR = character(N), + ThreadDurationVR =double(N), + LOS =double(N), + NetComplexityVR=double(N), + NodesVR=double(N), + EdgesVR=double(N), + CompressRatioVR = double(N), + EntropyVR = double(N), + ALL_CF_countVR = integer(N), + ALL_CF_entropyVR = double(N) ) + + ################################################# + # add columns for time of each role (or something) + + # Now add columns for the CF counts. + for (cf in ALL_CFs){ + ACHR[, paste0(cf,"_countVR"):= double(N)] + # ACHR[, paste0(cf,"_compression"):= double(N)] + # ACHR[, paste0(cf,"_entropy"):= double(N)] + } + + # update data table with results - only compute the ones that require looking at the whole thread + for (b in 1:N){ + + if (b %% 1000 ==0) print(paste0('Thread count = ',b)) + + # select a subset + df= occ[get(TN)==bucket_list[b]] + + # get the network -- only if there are enough rows... 
+ if (nrow(df)>2) n = threads_to_network_original(df,TN, new_event_col) + else n = list(edgeDF=t(c(0)),nodeDF=t(c(0))) + + # compute each parameter and update the table + ACHR[b,Visit_ID := as.character(df$Visit_ID[1]) ] + ACHR[b,Role_VR:= as.character(df$Role[1]) ] + ACHR[b,Role_ID_VR:= as.character(df$Role_ID[1]) ] + ACHR[b,threadNumVR := bucket_list[b] ] + ACHR[b,NEventsVR := nrow(df) ] + ACHR[b,threadStartVR := as.character(min(lubridate::ymd_hms(df$tStamp))) ] + ACHR[b,ThreadDurationVR:= difftime(max(lubridate::ymd_hms(df$tStamp)), min(lubridate::ymd_hms(df$tStamp)), units='hours' ) ] + ACHR[b, NetComplexityVR:=estimate_network_complexity( n ) ] + ACHR[b, NodesVR:=nrow(n$nodeDF) ] + ACHR[b,EdgesVR:=nrow(n$edgeDF) ] + ACHR[b,CompressRatioVR := compression_index(df,new_event_col) ] + # ACHR[b,EntropyVR := compute_entropy(table(df[[new_event_col]])[table(df[[new_event_col]])>0]) ] + ACHR[b,EntropyVR := compute_graph_entropy( df[[new_event_col]]) ] + + ACHR[b,ALL_CF_countVR := length(unique(df[[all_cf_col]])) ] + # ACHR[b,ALL_CF_entropyVR := compute_entropy(table(df[[all_cf_col]])[table(df[[all_cf_col]])>0]) ] + ACHR[b,ALL_CF_entropyVR := compute_graph_entropy( df[[all_cf_col]]) ] + + ################################################# + # Compute time of each role + # need to difftime start-end of each thread -- compute_thread_duration + # sum up the role chunks for each role within each thread + + + # Count the unique elements in each cf + for (cf in ALL_CFs){ ACHR[b, paste0(cf,"_countVR") := length(unique(df[[cf]])) ] } + } + + + # Merge the results with the first row from each thread + print('Merging results...') + VRThrds=merge(x=ACHR, y=visits, by.x='Visit_ID', by.y ='Visit_ID' ,all=TRUE) + + save_file_name = paste0(paste('VRThrds',TN,new_event_col,sep='+'), '.Rdata') + save(VRThrds, file=save_file_name) + + print(paste('Saved ', nrow(VRThrds), " records in ",save_file_name)) + + return(VRThrds) +} + +################################################################################# +# streamlined version for adding or recomputing columns that describe a visit. +# So you can merge with visits to get rest of the columns. Just compute the minimum for speed. +# Can re-write this as needed... +ACHR_batch_visit_add_columns <- function(occ, EVENT_CFs, visits) { + + library(tidyr) + library(dplyr) + library(ThreadNet) + library(ngram) + library(lubridate) + library(stringr) + library(data.table) + + + # get the first occurrence from each thread -- this will be used for the stuff that never changes, like clinic + # firstOcc=occ[occ$seqNum==1,] + + # always used threadNum for the TN + TN = 'threadNum' + + # Add columns for combinations of CFs if needed + # don't need to add new column for the thread_CFs. That one has to be here. + # thread_col = newColName(THREAD_CFs) + # TN = thread_col + # if (!(thread_col %in% colnames(occ))) { occ = combineContextFactors(occ,THREAD_CFs,thread_col) } + + new_event_col = newColName(EVENT_CFs) + if (!(new_event_col %in% colnames(occ))) { occ = combineContextFactors(occ,EVENT_CFs,new_event_col) } + + + # set key on the data.table for the threadNum + occ=as.data.table(occ) + setkeyv(occ, TN) + + # get the list of buckets + bucket_list <- sort( unique(occ[[TN]]) ) + + # get the number of buckets + N = length(bucket_list) + print(paste0('Number of buckets=', N )) + + # allocate the data table with N rows, then for speed, we use := to update it. 
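+  # (pre-allocating all N rows once and then filling them in place with := avoids
+  # growing the table row by row inside the loop)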
+ ACHR = data.table(Visit_ID = character(N), + new_wait_time1 = double(N), + new_wait_time2 = double(N) ) + + # Role_VR = character(N), + # Role_ID_VR = character(N), + # threadNumVR = integer(N), + # NEventsVR = integer(N), + # threadStartVR = character(N), + # ThreadDurationVR =double(N), + # NetComplexityVR=double(N), + # NodesVR=double(N), + # EdgesVR=double(N), + # CompressRatioVR = double(N), + # EntropyVR = double(N), + # ALL_CF_countVR = integer(N), + # ALL_CF_entropyVR = double(N) ) + + + # update data table with results - only compute the ones that require looking at the whole thread + for (b in 1:N){ + + if (b %% 1000 ==0) print(paste0('Thread count = ',b)) + + # select a subset + df= occ[get(TN)==bucket_list[b]] + + # get the network -- only if there are enough rows... + # if (nrow(df)>2) n = threads_to_network_original(df,TN, new_event_col) + # else n = list(edgeDF=t(c(0)),nodeDF=t(c(0))) + # + # compute each parameter and update the table + ACHR[b,Visit_ID := as.character(df$Visit_ID[1]) ] + ACHR[b, new_wait_time1 := compute_wait_time1(df) ] + ACHR[b, new_wait_time2 := compute_wait_time2(df) ] + } + + + # Compute the alignment of the context factors + # ACHR$CF_AlignmentVR = as.numeric( as.character(ACHR$Action_countVR)) / as.numeric( as.character(ACHR$ALL_CF_countVR )) + + # Merge the results with the first row from each thread + print('Merging results...') + # VRThrds=merge(x=ACHR, y=visits, by.x='Visit_ID', by.y ='Visit_ID' ,all=TRUE) + VRThrds=merge(x=ACHR, y=visits, by.x='Visit_ID', by.y ='Visit_ID') + + save_file_name = paste0(paste('VRThrds',TN,new_event_col,sep='+'), '.Rdata') + save(VRThrds, file=save_file_name) + + print(paste('Saved ', nrow(VRThrds), " records in ",save_file_name)) + + return(VRThrds) +} +################################################################################## +# Helper functions +################################################################################# +compute_phase <- function(t){ + + t=as.Date(t) + + if ( t >= '2016-01-01' & t < '2016-06-07') return('one') + if ( t >= '2016-06-07' & t < '2016-09-01') return('two') + if ( t >= '2016-09-01' & t < '2017-04-15') return('three') + if ( t >= '2017-04-15' & t < '2017-09-01') return('four') + if ( t >= '2017-09-01' & t < '2017-12-31') return('five') + + # else return NA + return(NA) + +} +compute_thread_duration <- function(df){ + + return(difftime(lubridate::ymd_hms(df$tStamp[nrow(df)]), lubridate::ymd_hms(df$tStamp[1]), units='hours' )) + +} + +compute_visit_duration <- function(df){ + + # get the checkin time. Note that they sometimes look at patient data in advance of the visit, so you + # cannot use the first occurrences to mark the start of the thread + w1= grep('CHECKIN_TIME',df$Action)[1] + w2= grep('AVS_PRINT_TIME',df$Action)[1] + + return(difftime(lubridate::ymd_hms(df$tStamp[w2]), lubridate::ymd_hms(df$tStamp[w1]), units='hours' )) + +} + +# First wait time... from checkin until they get their vitals taken +compute_wait_time1 <- function(df){ + + # find the first occurrence of MR_VN_VITALS. Corresponds closely to wait time + # w= grep('MR_VN_VITALS',df$Action)[1] + + w1= grep('CHECKIN_TIME',df$Action)[1] + w2= grep('VITALS',df$Action)[1] + + return( max(0, as.numeric(difftime( lubridate::ymd_hms(df$tStamp[w2]),lubridate::ymd_hms(df$tStamp[w1]), units='hours' ) ))) + +} + +# Second wait time... 
from vitals until they see a doctor or other medical person +# NEW VERSION -- look for "AC_VISIT_NAVIGATOR" by a different role than "VITALS" +compute_wait_time2 <- function(df){ + + # find the first occurrence of MR_VN_VITALS. Get the time t1 + # w1= grep('MR_VN_VITALS',df$Action)[1] + w1= grep('VITALS',df$Action)[1] + + if (is.na(w1)) + return(NA) + else + { + t1 = lubridate::ymd_hms(df$tStamp[w1]) + + + # get the workstation and role used for the vitals at t1 + # work1 = df$Workstation[w1] + role1 = df$Role[w1] + + # get set of actions at the same workstation or different workstations that happen later in the visit + # Truncate the visit up to w1 + restdf = df[(w1+1):nrow(df),] + + # get the actions performed by other roles in the rest of the visit + differentRole = restdf[restdf$Role != role1,] + + # Now look for first instance of AC_VISIT_NAVIGATOR that is by one of the other roles + # first_AC_NAV = grep('AC_VISIT_NAVIGATOR',differentRole$Action)[1] + # t2 = lubridate::ymd_hms(differentRole$tStamp[first_AC_NAV]) + + # Try first instance of different role. period. + t2 = lubridate::ymd_hms(differentRole$tStamp[1]) + + + # return the difference in time from t1 to t2 + return( max(0, as.numeric(difftime( t2, t1, units='hours' ) ))) + } +} + +# Second wait time... from vitals until they see a doctor or other medical person +# compute_wait_time2 <- function(df){ +# +# # find the first occurrence of MR_VN_VITALS. Get the time t1 +# # w1= grep('MR_VN_VITALS',df$Action)[1] +# w1= grep('VITALS',df$Action)[1] +# +# if (is.na(w1)) +# return(NA) +# else +# { +# t1 = lubridate::ymd_hms(df$tStamp[w1]) +# +# +# # get the workstation and role used for the vitals at t1 +# work1 = df$Workstation[w1] +# role1 = df$Role[w1] +# +# # get set of actions at the same workstation or different workstations that happen later in the visit +# # Truncate the visit up to w1 +# restdf = df[w1:nrow(df),] +# +# # then get the set of action at the same workstation or different workstations (in that visit) +# sameWS = restdf[restdf$Workstation == work1,] +# differentWS = restdf[restdf$Workstation != work1,] +# +# # get list of other roles +# otherRoles = paste(setdiff(unique(restdf$Role),c('Technician','Technologist','Staff','Unknown')), +# collapse = '|') +# +# # Now find the marker for when they see a physician +# # procedure is differenrt in BRKPT and HHPOB clinics +# if (df$Clinic[1] %in% c('BRKPT','HHPOB') ) +# { # LPN brings a different workstation on wheels at these clinics +# firstLPN = grep('LPN',differentWS$Role)[1] +# t2 = lubridate::ymd_hms(differentWS$tStamp[firstLPN]) +# +# # Other roles use the same workstation +# firstDoc = grep('Physician|Resident|Registered', sameWS$Role)[1] +# t2 = lubridate::ymd_hms(sameWS$tStamp[firstDoc]) +# } +# else +# { # a different role, using the same workstation +# w2 = max(w1, grep(otherRoles, sameWS$Role) )[1] +# t2 = lubridate::ymd_hms(sameWS$tStamp[w2]) +# } +# +# # return the difference in time from t1 to t2 +# return( difftime( t2, t1, units='hours' ) ) +# } +# } + + +# THIS ONE IS GOOD. 
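+# A small illustrative example (made-up codes, not from the data): for p = '12345#@#99213'
+# the separator count gives 2 items in total, and 99213 is in the list of visit codes below,
+# so count_procedures() would return 1 actual procedure.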
+count_procedures <- function( p ){ + + # if no procedures, return zero + if (is.na( p )) return( 0 ) + + # get the overall number of items + total_num = str_count( p, '#@#')+1 + + # adjust for the visit codes that are not actual procedures + v=c('99211','99212','99213','99214','99215','99201','99202','99203','99204','99205') + + # just look at the first occurrence in the df + num_visit_codes = sum(str_count( p, v )) + + # limit to non-negative + total_num = max(0, total_num - num_visit_codes) + + return( total_num ) +} + +# This one is easy +count_diagnoses <- function(d){ + + # Sometimes the marker is missing, so set the floor to one. There is always at least one. + return( max(1, str_count( d, '#@#' ))) +} + +count_daily_procedures <- function( df ) { + + # get the string of procedures for that day + allproc = unlist(df[df$seqNum==1, 'Proc']) + + if (length(allproc)>0) + return( sum(sapply(allproc, function(p) {count_procedures(p)} )) ) + else return(0) + + +} + +# convert the LOS_CPT codes into a five point numeric scale +# operate on whole column to create new column +convert_LOS <- function(los_cpt){ + + # get the 5th character + s = sapply(los_cpt, function(cpt) { substring(as.character(cpt),5,5) } ) + + los = as.numeric(s) + + # convert 9 and NA to 1 + los[los==9]=NA + + +return(los) + +} + +################################################################### +################################################################### +# This is function is set up to aggregate the occurrences among collections of visit -- typically clinic_days +# occ = pre-processed threaded occurrences +# CFs can be chosen +ACHR_batch_clinic_days <- function(occ,TN='Clinic_ymd', EVENT_CFs, ALL_CFs) { + + library(tidyr) + library(dplyr) + library(ThreadNet) + library(ngram) + library(lubridate) + library(stringr) + library(data.table) + + # make a list of unique buckets. + bucket_list <- unique(occ[[TN]]) + N = length(bucket_list) + + # print the number of buckets + print(paste0('Number of buckets=', N )) + + # Add columns for combinations of CFs if needed + new_event_col = newColName(EVENT_CFs) + if (!(new_event_col %in% colnames(occ))) { occ = combineContextFactors(occ,EVENT_CFs,new_event_col) } + + all_cf_col = newColName(ALL_CFs) + if (!(all_cf_col %in% colnames(occ))) { occ = combineContextFactors(occ,ALL_CFs,all_cf_col) } + + # this will speed up retrieving the subsets + setkeyv(occ, TN) + + # allocate the data table with N rows, then for speed, we use := to update it. + cds = data.table(Clinic_ymd = character(N), + Clinic = character(N), + ymd = character(N), + Phase = character(N), + NEvents = integer(N), + ClinicDayStart = character(N), + ClinicDayDuration =double(N), + NumVisits = integer(N), + NumUniqueDiagnosisGroups = integer(N), + NumPhysicians = integer(N), + TotalStaff = integer(N), + NetComplexity=double(N), + Nodes=double(N), + Edges=double(N), + CompressRatio = double(N), + Entropy = double(N), + ALL_CF_count = integer(N), + ALL_CF_entropy = double(N), + CF_Alignment = double(N)) + + # Now add columns for the CF counts. + for (cf in ALL_CFs){ + cds[, paste0(cf,"_countVR"):= double(N)] + # cds[, paste0(cf,"_compression"):= double(N)] + # cds[, paste0(cf,"_entropy"):= double(N)] + } + + + # make data frame with results + for (b in 1:N){ + + + if (b %% 100 ==0) print(paste0('Thread count = ',b)) + + + # select a subset of occurrences for the bucket + df = occ[ occ[[TN]] == bucket_list[b] ] + + + # get the network -- only if there are enough rows... 
+ if (nrow(df)>2) n = threads_to_network_original(df,TN, new_event_col) + else n = list(edgeDF=t(c(0)),nodeDF=t(c(0))) + + # compute each parameter and put them in a vector + + cds[b,Clinic_ymd := bucket_list[b] ] + cds[b,Clinic := as.character( df$Clinic[1] ) ] + cds[b,ymd := as.character( df$ymd[1] ) ] + cds[b,Phase := compute_phase(df$tStamp[1]) ] + cds[b,threadNum := as.numeric(df[1,'threadNum']) ] + cds[b,NEvents := nrow(df) ] + cds[b,ClinicDayStart := as.character(lubridate::ymd_hms(df$tStamp[1])) ] + cds[b,ClinicDayDuration := difftime(max(lubridate::ymd_hms(df$tStamp)), min(lubridate::ymd_hms(df$tStamp)), units := 'hours' ) ] + cds[b,NumVisits := length(unique( df$Visit_ID )) ] + cds[b,NumUniqueDiagnosisGroups := length(unique( df$Diagnosis_Group )) ] + cds[b,NumPhysicians := length(unique( df$Physician )) ] + cds[b,TotalStaff := length(unique( df$Role_ID )) ] + cds[b, NetComplexity := estimate_network_complexity( n ) ] + cds[b,Nodes := nrow(n$nodeDF) ] + cds[b,Edges := nrow(n$edgeDF) ] + cds[b,CompressRatio := compression_index(df,new_event_col) ] + # cds[b,Entropy := compute_entropy(table(df[[new_event_col]])[table(df[[new_event_col]])>0]) ] + cds[b,Entropy := compute_graph_entropy( df[[new_event_col]]) ] + + cds[b,NumProceduresPerDay := count_daily_procedures(df) ] + cds[b,CF_Alignment := 1 ] # make placeholder, but compute below + cds[b,ALL_CF_count := length(unique(df[[all_cf_col]])) ] + # cds[b,ALL_CF_entropy := compute_entropy(table(df[[all_cf_col]])[table(df[[all_cf_col]])>0]) ] + cds[b,ALL_CF_entropy := compute_graph_entropy( df[[all_cf_col]]) ] + + # Count the unique elements in each cf + for (cf in ALL_CFs){ cds[b, paste0(cf,"_countVR") := length(unique(df[[cf]])) ] } + + +} + + # Compute the alignment of the context factors + cds$CF_Alignment = as.numeric( as.character(cds$Action_count)) / as.numeric( as.character(cds$ALL_CF_count )) + + save(cds, file=paste0(paste('cds',TN,new_event_col,sep='+'), '.Rdata')) + + return(cds) + +} + +compute_graph_entropy <- function(s){ + + # guard against insufficient data... + if (length(s) <2) return(0) + + # convert s into text vector + text_vector = concatenate(s) + + # get the 2-grams in the sequence s. 'prop' is the proportion of each edge. It sums to 1. + p = get.phrasetable(ngram(text_vector,2))[['prop']] + + # return Shannon entropy + return(-sum(p*log(p))) + +} + +get_action_frequencies <- function(s,n){ + + # guard against insufficient data... + if (length(s) < n) return(0) + + # convert s into text vector + text_vector = concatenate(s) + + # get the 2-grams in the sequence s. 'freq' is the frequency + return( get.phrasetable(ngram(text_vector, n )) ) + +} + + diff --git a/Data Prep/ACHR_Batch_PreProcess.R b/Data Prep/ACHR_Batch_PreProcess.R new file mode 100644 index 0000000..96800d3 --- /dev/null +++ b/Data Prep/ACHR_Batch_PreProcess.R @@ -0,0 +1,246 @@ +########################################################################################################## +# THREADNET: Batch processing for larger data sets +# +# ACHR_Batch_PreProcess +# +# (c) 2018 Michigan State University. This software may be used according to the terms provided in the +# GNU General Public License (GPL-3.0) https://opensource.org/licenses/GPL-3.0? +# Brian Pentland +# Absolutely no warranty! 
+# +# Separate out the two basic functions +# 1) reading/cleaning the occurrences +# 2) Threading the data by adding ThreadNum and SeqNum to the threads +# +########################################################################################################## +library(tidyr) +library(data.table) +library(dplyr) +library(ThreadNet) +library(ngram) +library(lubridate) + +redo_ACHR_data_from_scratch <- function(fname){ + + setwd("~/Documents/@NSF Healthcare Routines/Deidentified_Derm_data_Nov_2018") + + fname = 'audit_111818' + + # Pick you point of view, or multiple POV... + THREAD_CF = c('Visit_ID') + # EVENT_CF = c('Action','Role','Workstation') + EVENT_CF = c('Action') + ALL_CF = c('Action','Role','Workstation') + + # first read the data + o = read_ACHR_data( fname ) # nrow(o) = 7728927 + + # fix the role IDs -- needs file called RoleChangeTable.csv in the local directory + o = fix_derm_role_ID(o) + + # Thread occurrences adds threadNum and seqNum to each thread, as defined by the the thread_CF. + # Threads are always WITHIN VISITS in this code -- so they can be whole visits or chunks of visits (e.g., visit_ID_Role) + # make two versions -- one for whole visits and one for visit_ID_Role + ot = thread_occurrences( o, 'Visit_ID' ,fname ) # nrow(ot)= 57835 + # otr = thread_occurrences( o, c('Visit_ID','Role') ,fname ) # nrow(otr) = 527666 + otrw = thread_occurrences( o, c('Visit_ID','Role','Workstation') ,fname ) # nrow(otrw) > 527666 + + + # Now aggregate by clinic-day for each POV + clinic_days = ACHR_batch_clinic_days(occ, THREAD_CF, EVENT_CF, fname) + + # Aggregate by visit for each POV + # Whole visits + visits = ACHR_batch_visits(ot, THREAD_CF, EVENT_CF, fname) + + # And then by role + VRThreads <- ACHR_batch_visit_role_threads(otr, c('Action'), c('Action','Role','Workstation') , visits) + + + # compute trajectories to see difference from a reference graph + traj = graph_trajectory( ot, 'Clinic_ymd', EVENT_CF, 2, 1, 0, 'Clinic_trajectory') + +} + +# this function reads the raw data from URMC +# It returns the data frame and also saves it as an Rdata file with the same name as the csv +# Tested Nov 16. 
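+# Typical call (fname is the csv name without the extension, matching the driver above):
+#   d <- read_ACHR_data('audit_111818')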
+read_ACHR_data <- function(fname){ + + + + + # This code is tailored for reading in new data from URMC, October 2018 + # Assumes there is a column called 'Timestamps" in column 13 and "Visit_ID" + # read the file into data.table + d <- fread( paste0(fname, '.csv') ) + + # Sort by visit and timestamps (will do this later with setkey) + d <- arrange(d,desc(Visit_ID,asc(Timestamps))) + + # This stuff is URMC specific + # + # move and rename the columns, as needed + # Mainly need to get the Timestamps into the first column and rename them tStamp + cn=colnames(d) + cnx = cn[c(13,1:12,14:15)] + d=setcolorder(d,cnx) + setnames(d,'Timestamps','tStamp') + setnames(d,'V1','seqn') + + # Fix the name of the Highland clinic + d$Clinic = gsub('HH POB','HHPOB',d$Clinic) + + + # This converts numbers to char and replaces spaces with underscore + d = cleanOccBatch(d) + + ### This is URMC specific: Add the clinic and clinic_day and fix the code for the highland clinic + + # get the date only from the timestamp + d$ymd <- format( as.POSIXct(d$tStamp, format="%Y-%m-%d"), "%Y-%m-%d" ) + + # make new columns for clinic + day and events + d = unite(d, 'Clinic_ymd', c('Clinic','ymd'),sep='_',remove = 'false') + + + # Save the result + save_file_name = paste0(fname, '.rData') + save(d, file=save_file_name) + print(paste("Saved ",nrow(d), " occurrences in ",save_file_name)) + + return(d) + +} + +# clean up the raw occurrence data +# Remove blanks for n-gram functionality +cleanOccBatch <- function(fileRows){ + + # extract tStamp + tStamp <- fileRows$tStamp + + # First get rid of any underscores pre-existing in the data, then + # confirm all spaces are converted to underscores in non tStamp columns; set as factors + cleanedCF <- data.frame(lapply(fileRows[2:ncol(fileRows)], function(x){ gsub(" ","_",x) }) ) + + # bind tStamp back to cleaned data + complete <- as.data.table(cbind(tStamp,cleanedCF)) + + # Old code for forcing tStamp into a "YMD_HMS" format -- not needed for URMC EMR data + # complete$tStamp <- as.character(complete$tStamp) + # complete$tStamp <- parse_date_time(complete$tStamp, c("dmy HMS", "dmY HMS", "ymd HMS","dmy HM", "dmY HM", "ymd HM")) + + # add weekday and month + # complete$weekday <- as.factor(weekdays(as.Date(complete$tStamp))) + # complete$month <- as.factor(months(as.Date(complete$tStamp))) + + return(complete) +} + +####################################################################################### +# In the derm data, the role "technician" was used for people who were not technicians +# Pass in the table of occurrences, fix it, and return it +# read in the list of changes +fix_derm_role_ID <- function(o){ + + # read in the new role ID + rc = read.csv('RoleChangeTable.csv', stringsAsFactors = FALSE) + + + # occurrences are a data.table so use := + for (r in 1:nrow(rc)) { + + old_rid = rc$OLD_Role_ID[r] + new_rid = rc$NEW_Role_ID[r] + new_role = rc$NEW_Role[r] + + # pick out rows with old Role_ID + # old_rid_vector = o$Role_ID==old_rid + # + # o[old_rid_vector, Role_ID := new_rid] + # o[old_rid_vector, Role := new_role] + + # crazy data table syntax... do it all in one shot + o[o$Role_ID==old_rid, ':=' (Role = new_role, Role_ID = new_rid ) ] + + } + + return(o) +} + + + + +######################################################################################### +# This function adds columns for the the thread, as requested ( Tested Nov 16 ) +# Then it sorts by thread and tStamp and adds thread/sequence numbers to each thread. 
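+# Thread boundaries are found where the combined thread column changes from one row to the
+# next (via dplyr::lag), so the occurrences must already be sorted by Visit_ID and tStamp.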
+# +# Typical column names from the UMRC file +# ALL_CF = c('Action','Workstation','Role','Clinic','Diagnosis_Group') +# thread_CF = c('Visit_ID', 'Role', 'Workstation') +# event_CF = c('Action','Role','Workstation') +# It returns the threaded set of occurrences and also saves it as Rdata. +thread_occurrences <- function(occ, THREAD_CF, fname='emr'){ + + print(paste('Number of occurrences: ', nrow(occ))) + + # these will be new column names + new_thread_col = newColName(THREAD_CF) + # new_event_col = newColName(EVENT_CF) + + print(paste('new_thread_col: ',new_thread_col)) + # print(paste('new_event_col: ', new_event_col)) + + # Add names for the new columns, if necessary + if (!(new_thread_col %in% colnames(occ))) { occ = combineContextFactors(occ,THREAD_CF,new_thread_col) } + # if (!(new_event_col %in% colnames(occ))) { occ = combineContextFactors(occ,EVENT_CF,new_event_col) } + + # ThreadNet code assumes these columns will be there, so we need to add them + occ$threadNum = integer(nrow(occ)) + occ$seqNum = integer(nrow(occ)) + + # Assume the data are sorted by visit_ID and tStamp -- the natural flow of the visit. + idx_list = which(occ[[new_thread_col]] !=dplyr::lag(occ[[new_thread_col]])) + idx_list = c(idx_list, nrow(occ)) + + # creates two lists of indices so they match up correctly. + start_idx = c(1, head(idx_list,-1)) # add 1 to the front and drop the last + end_idx = idx_list - 1 # shift back by one + + print(paste('Number of threads in this POV: ', length(idx_list))) + + # sapply(seq(length(idx_list)), function(x) + for (x in seq(length(idx_list))) + { if (x %% 1000 ==0) print(paste0('Thread count = ',x)) + occ[start_idx[x]:end_idx[x],'threadNum' := x ] + occ[start_idx[x]:end_idx[x],'seqNum':= c(1:(end_idx[x]-start_idx[x]+1)) ] + } + + # Now save the result for later and return it, as well. 
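+  # (the output file name is '<fname>+<thread column>.rData', e.g. 'emr+Visit_ID.rData'
+  # for the default fname with Visit_ID threads)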
+ # save(occ, file=paste0(paste(fname,new_thread_col,new_event_col,sep='+'), '.Rdata')) + + save_file_name = paste0(paste0(paste(fname,new_thread_col,sep='+'), '.rData')) + save(occ, file=save_file_name) + + print(paste('Saved threaded occurrences: ', nrow(occ), " in ",save_file_name)) + + return(occ) +} + + + + ############ extra stuff not used ############# + # split occ data frame by threadNum to find earliest time value for that thread + # # then substract that from initiated relativeTime from above + # occ_split = lapply(split(occ, occ$threadNum), + # function(x) {x$relativeTime = difftime(x$relativeTime, min(lubridate::ymd_hms(x$tStamp)), units=timescale ); x}) + # + # # # row bind data frame back together + # occ= data.frame(do.call(rbind, occ_split)) + + + + + + diff --git a/Data Prep/ACHR_Batch_V2.R b/Data Prep/ACHR_Batch_V2.R index 730f0b5..818ef08 100644 --- a/Data Prep/ACHR_Batch_V2.R +++ b/Data Prep/ACHR_Batch_V2.R @@ -35,7 +35,7 @@ # this function reads the raw data from URMC and creates files for clinic_day and visits -read_ACHR_data <- function(){ +read_ACHR_data_old <- function(){ library(tidyr) library(data.table) @@ -48,7 +48,9 @@ read_ACHR_data <- function(){ # This code is tailored for reading in new data from URMC, October 2018 # read the file into data frame - d <<- fread('auditfinal_10022018.csv') + d <<- fread('auditfinal_10022018.csv') + + # d <<- fread('10test.csv') # Sort and convert to data.table vt <<-data.table(arrange(d,desc(Visit_ID,asc(Timestamps)))) @@ -58,7 +60,7 @@ read_ACHR_data <- function(){ cnx = cn[c(13,1:12,14:15)] vt=setcolorder(vt,cnx) setnames(vt,'Timestamps','tStamp') - setnames(vt,'X','seqn') + # setnames(vt,'X','seqn') # make a dataframe copy just for fun vdf <<- as.data.frame(vt) diff --git a/Data Prep/ACHR_Batch_V3.R b/Data Prep/ACHR_Batch_V3.R index a404eec..bb80ee7 100644 --- a/Data Prep/ACHR_Batch_V3.R +++ b/Data Prep/ACHR_Batch_V3.R @@ -271,19 +271,23 @@ ACHR_batch_clinic_days <- function(occ,TN, CFs) { ACHR[b,A_Nodes := nrow(n$nodeDF) ] ACHR[b,A_Edges := nrow(n$edgeDF) ] - n = threads_to_network_original(df,TN, DV2) - ACHR[b,AR_NetComplexity := estimate_network_complexity( n )] - ACHR[b,AR_Nodes := nrow(n$nodeDF) ] - ACHR[b,AR_Edges := nrow(n$edgeDF) ] + # n = threads_to_network_original(df,TN, DV2) + # ACHR[b,AR_NetComplexity := estimate_network_complexity( n )] + # ACHR[b,AR_Nodes := nrow(n$nodeDF) ] + # ACHR[b,AR_Edges := nrow(n$edgeDF) ] + # + # n = threads_to_network_original(df,TN, DV3) + # ACHR[b,ARW_NetComplexity := estimate_network_complexity( n )] + # ACHR[b,ARW_Nodes := nrow(n$nodeDF) ] + # ACHR[b,ARW_Edges := nrow(n$edgeDF) ] - n = threads_to_network_original(df,TN, DV3) - ACHR[b,ARW_NetComplexity := estimate_network_complexity( n )] - ACHR[b,ARW_Nodes := nrow(n$nodeDF) ] - ACHR[b,ARW_Edges := nrow(n$edgeDF) ] + # # get the entropy for AR and ARW + # ACHR[b, AR_Entropy := compute_entropy(table(df[[DV2]])[table(df[[DV2]])>0]) ] + # ACHR[b, ARW_Entropy := compute_entropy(table(df[[DV3]])[table(df[[DV3]])>0]) ] # get the entropy for AR and ARW - ACHR[b, AR_Entropy := compute_entropy(table(df[[DV2]])[table(df[[DV2]])>0]) ] - ACHR[b, ARW_Entropy := compute_entropy(table(df[[DV3]])[table(df[[DV3]])>0]) ] + ACHR[b, AR_Entropy := compute_graph_entropy_TEST(df[[DV2]]) ] + ACHR[b, ARW_Entropy := compute_graph_entropy_TEST(df[[DV3]]) ] # compute stuff on each context factor @@ -296,7 +300,10 @@ ACHR_batch_clinic_days <- function(occ,TN, CFs) { ACHR[b, paste0(cf,"_compression") := compression_index(df,cf) ] # get the entropy - ACHR[b, 
paste0(cf,"_entropy") := compute_entropy(table(df[[cf]])[table(df[[cf]])>0]) ] + # ACHR[b, paste0(cf,"_entropy") := compute_entropy(table(df[[cf]])[table(df[[cf]])>0]) ] + + # get the graph entropy + ACHR[b, paste0(cf,"_entropy") := compute_graph_entropy_TEST(df[[cf]]) ] } } # kf nrows > 2 @@ -327,8 +334,10 @@ ACHR_batch_clinic_days <- function(occ,TN, CFs) { return(ACHR) } -################################################################### -################################################################### +################################################################## +################################################################## +################################################################## +################################################################## @@ -373,6 +382,7 @@ ACHR = data.table(bucket=integer(N), A_Entropy = double(N), AR_Entropy = double(N), ARW_Entropy = double(N), + CF_Alignment = double(N), Visit_ID = character(N), Subject_ID = character(N), Clinic = character(N), @@ -420,8 +430,8 @@ for (i in 1:N){ # compressibility of DV ACHR[b,A_CompressRatio := compression_index(df,DV1)] - ACHR[b,AR_CompressRatio := compression_index(df,DV2)] - ACHR[b,ARW_CompressRatio := compression_index(df,DV3)] + # ACHR[b,AR_CompressRatio := compression_index(df,DV2)] + # ACHR[b,ARW_CompressRatio := compression_index(df,DV3)] # NetComplexity of DV # First get the network @@ -432,20 +442,23 @@ for (i in 1:N){ ACHR[b,A_Nodes := nrow(n$nodeDF) ] ACHR[b,A_Edges := nrow(n$edgeDF) ] - n = threads_to_network_original(df,TN, DV2) - ACHR[b,AR_NetComplexity := estimate_network_complexity( n )] - ACHR[b,AR_Nodes := nrow(n$nodeDF) ] - ACHR[b,AR_Edges := nrow(n$edgeDF) ] - - n = threads_to_network_original(df,TN, DV3) - ACHR[b,ARW_NetComplexity := estimate_network_complexity( n )] - ACHR[b,ARW_Nodes := nrow(n$nodeDF) ] - ACHR[b,ARW_Edges := nrow(n$edgeDF) ] + # n = threads_to_network_original(df,TN, DV2) + # ACHR[b,AR_NetComplexity := estimate_network_complexity( n )] + # ACHR[b,AR_Nodes := nrow(n$nodeDF) ] + # ACHR[b,AR_Edges := nrow(n$edgeDF) ] + # + n = threads_to_network_original(df,TN, DV3) + ACHR[b,ARW_NetComplexity := estimate_network_complexity( n )] + ACHR[b,ARW_Nodes := nrow(n$nodeDF) ] + ACHR[b,ARW_Edges := nrow(n$edgeDF) ] - # get the entropy for AR and ARW - ACHR[b, AR_Entropy := compute_entropy(table(df[[DV2]])[table(df[[DV2]])>0]) ] - ACHR[b, ARW_Entropy := compute_entropy(table(df[[DV3]])[table(df[[DV3]])>0]) ] + # # get the entropy for AR and ARW + # ACHR[b, AR_Entropy := compute_entropy(table(df[[DV2]])[table(df[[DV2]])>0]) ] + # ACHR[b, ARW_Entropy := compute_entropy(table(df[[DV3]])[table(df[[DV3]])>0]) ] + # ACHR[b, AR_Entropy := compute_graph_entropy_TEST( df[[DV2]] ) ] + ACHR[b, ARW_Entropy := compute_graph_entropy_TEST( df[[DV3]] ) ] + # # compute stuff on each context factor for (cf in CFs){ @@ -457,8 +470,9 @@ for (i in 1:N){ ACHR[b, paste0(cf,"_compression") := compression_index(df,cf) ] # get the entropy - ACHR[b, paste0(cf,"_entropy") := compute_entropy(table(df[[cf]])[table(df[[cf]])>0]) ] - + # ACHR[b, paste0(cf,"_entropy") := compute_entropy(table(df[[cf]])[table(df[[cf]])>0]) ] + ACHR[b, paste0(cf,"_entropy") := compute_graph_entropy_TEST( df[[cf]]) ] + } } # kf nrows > 2 @@ -480,6 +494,8 @@ for (i in 1:N){ ACHR[b,'Weekday' := df[1,'Weekday']] ACHR[b,'Month' := df[1,'Month']] + # Compute the alignment of the context factors + ACHR[b, 'CF_Alignment' := ACHR[b,Action_count] / ACHR[b,ARW_Nodes] ] } # loop thru buckets @@ -491,6 +507,168 @@ ACHR[,'A_Entropy' 
:= ACHR[,'Action_entropy'] ] return(ACHR) } +##################### ############################################################ +##################### ############################################################ +##################### ############################################################ + + +# ACHR stands for Antecedents of Complexity in Healthcare Routines. +# This is function is set up to compute process parameters on thousands of patient visits. +# This version assumes that CFs = c('Action','Role','Workstation') +ACHR_batch_visits_all_CFs <- function(occ,TN, CFs) { + + # Get list of columns we want to use/create + dv1=newColName(CFs[1]) + dv2=newColName(CFs[2]) + dv3=newColName(CFs[3]) + dv4=newColName(CFs[2:3]) + dv5=newColName(CFs[1:2]) + dv6=newColName(CFs[1:3]) + + # need to make new columns + occ = combineContextFactors(occ,CFs[2:3], newColName(CFs[2:3])) + occ = combineContextFactors(occ,CFs[1:2], newColName(CFs[1:2])) + occ = combineContextFactors(occ,CFs[1:3], newColName(CFs[1:3])) + + # We will loop thru this list below + dv_list = c(dv1, dv2, dv3, dv4, dv5, dv6) + print(dv_list) + + # set key on the data.table for the threadNum to speed retrival + occ=as.data.table(occ) + setkeyv(occ, TN) + + # pick subsets -- one visit at a time in this version, but could be more + bucket_list <- make_buckets_1(occ, 'threadNum') + + # get the size (number of buckets) + N = length(bucket_list) + # print(bucket_list) + + + # pre-allocate the data.table. Tables are supposed to be faster. + ACHR = data.table(bucket=integer(N), + NEvents = integer(N), + VisitStart= character(N), + VisitDuration=double(N), + CF_Alignment = double(N), + Visit_ID = character(N), + Subject_ID = character(N), + Clinic = character(N), + LOS_CPT = character(N), + NumProcedures = double(N), + NumDiagnoses = double(N), + Procedure = character(N), + Diagnosis = character(N), # diag in the raw data + Diagnosis_group = character(N), + Physician = character(N), + Weekday = character(N), + Month = character(N) + ) + + # Now add columns for each dv. + for (dv in dv_list){ + + ACHR[, paste0(dv,"_nodes"):= double(N)] + ACHR[, paste0(dv,"_edges"):= double(N)] + ACHR[, paste0(dv,"_complexity"):= double(N)] + ACHR[, paste0(dv,"_entropy"):= double(N)] + + } + + # loop through the buckets. 
Result will be data frame with one row per bucket + for (i in seq(N,1,-1)){ + + b = i # as.integer(bucket_list[i]) + + # bucket number + ACHR[b,bucket := b] + + # print once every 100 visits + # print(b) + # print (bucket_list[i]) + + # save data every 1000 visits + if (b%%1000==0) { + print(b) + ACHR_partial = ACHR[ACHR$NEvents>1,] + save(ACHR_partial, file=paste0("visits_ARW_",nrow(ACHR_partial),'.Rdata'))} + + # select a subset + df= subset(occ, threadNum==bucket_list[i] ) + + # only do the computations if there are more than two occurrences + if (nrow(df) > 2) { + + # make sure it is sorted by timestamp + df=df[order(df$tStamp),] + + + # length of the thread (number of rows) + ACHR[b,NEvents := nrow(df)] + # print( nrow(df) ) + + + # compute the duration of the visit in hours + ACHR[b,VisitDuration := difftime(max(lubridate::ymd_hms(df$tStamp)), min(lubridate::ymd_hms(df$tStamp)), units='hours') ] + + # compute stuff on each DV in the list + for (dv in dv_list){ + + n = threads_to_network_local(df,TN, dv) + + ACHR[b,paste0(dv,"_complexity") := estimate_network_complexity( n )] + ACHR[b,paste0(dv,"_nodes") := nrow(n$nodeDF) ] + ACHR[b,paste0(dv,"_edges") := nrow(n$edgeDF) ] + ACHR[b, paste0(dv,"_entropy") := compute_graph_entropy( df[[dv]] ) ] + + # print (dv) + # print( nrow(n$nodeDF) ) + # print( length(unique(df[[dv]]))) + + } + } # df nrows > 3 + + # Now copy in the rest of data + # this only works because one visit is one bucket + ACHR[b,'VisitStart' := as.character(df[1,'tStamp']) ] + + ACHR[b,Visit_ID := df[1,'Visit_ID']] + ACHR[b,Subject_ID := df[1,'Subject_ID']] + ACHR[b,Clinic := df[1,'Clinic']] + ACHR[b,LOS_CPT := df[1,'LOS_CPT']] + ACHR[b,Procedure := df[1,'Proc']] + ACHR[b,Diagnosis := df[1,'Diag']] + ACHR[b,NumProcedures := count_procedures(df$Proc[1]) ] + ACHR[b,NumDiagnoses := count_diagnoses(df$Diag[1]) ] + ACHR[b,Diagnosis_group := df[1,'Diagnosis_Group']] + ACHR[b,Physician := df[1,'Physician']] + ACHR[b,Weekday := df[1,'Weekday']] + ACHR[b,Month := df[1,'Month']] + + # Compute the alignment of the context factors + # This assumes CFs = c('Action','Role','Workstation') + # print( ACHR[b,Action_nodes] ) + # print( ACHR[b,Action_Role_Workstation_nodes] ) + ACHR[b, 'CF_Alignment' := ACHR[b,Action_nodes] / ACHR[b,Action_Role_Workstation_nodes] ] + + } # loop thru buckets + + # save the data one last time + save(ACHR, file=paste0("visits_ARW_",b,'.Rdata')) + + # return the table + return(ACHR) +} + + +##################### ############################################################ +##################### ############################################################ +##################### ############################################################ +##################### ############################################################ +##################### ############################################################ +##################### ############################################################ + # Each bucket is a list of thread numbers that can be used to subset the list of occurrences make_buckets_1 <- function(o, criteria){ @@ -539,3 +717,100 @@ cleanOccBatch <- function(fileRows){ get_timeScale <- function(){'hr'} +# create a function to compute graph entropy +# It will use the standard Shannon entropy, but it will apply it to the frequencies of edges in the +# adjacency matrix. To do this, we just need to count the 2-grams in the sequence and divide by the total +# number of 2-grams to get the probabilities. You should be able to pass in any sequence. 
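+# A worked toy example (illustrative sequence, not real data): the sequence A B A B A C
+# has 2-grams A_B, B_A, A_B, B_A, A_C, i.e. proportions 0.4, 0.4, 0.2, so the graph
+# entropy is -(0.4*log(0.4) + 0.4*log(0.4) + 0.2*log(0.2)) ~= 1.05 (natural log).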
+# NOTE: This version will work on one sequence at a time (eg. one patient visit) +# Needs to be revised to work on groups of visits (e.g,. clinic-days) +compute_graph_entropy_TEST <- function(s){ + + # first convert s into text vector + text_vector = long_enough( concatenate(s) , 2, ' ') + # text_vector = concatenate(s) + + # if (length(text_vector)<2) {return(0)} + + # get the 2-grams in the sequence s. ng$prop is the proportion of each edge. It sums to 1. + p = get.phrasetable(ngram(text_vector,2))[['prop']] + + # return Shannon entropy + return(-sum(p*log(p))) + +} +# to avoid errors in count_ngrams, make sure the length of each thread in the text_vector tv is longer than the n-gram size, n +# this gets used in various places so need to pass in the delimiter +long_enough = function(tv,n,delimiter){ + + return(tv[ unlist(lapply(1:length(tv), function(i) {length(unlist(strsplit(tv[[i]],delimiter)))>=n})) ]) + +} + +threads_to_network_local <- function(et,TN,CF,grp='threadNum'){ + + # print(head(et)) + # + # print(paste('CF=', CF)) + # print(paste('grp=', grp)) + + # First get the node names & remove the spaces + node_label = levels(factor(et[[CF]])) # unique(et[[CF]]) + node_label=str_replace_all(node_label," ","_") + nNodes = length(node_label) + + # print("node_label") + # print(node_label) + # print(paste('nNodes=', nNodes)) + + node_group=character() + for (n in 1:nNodes){ + # hardcoded threadNum for data.table syntax + node_group = c(node_group, as.character(unlist( et[which(et[[CF]]==node_label[n]),threadNum][1]) ) ) + } + + # set up the data frames we need to draw the network + nodes = data.frame( + id = 1:length(node_label), + label = node_label, + Group = node_group, + title=node_label) + + # get the 2 grams for the edges + ngdf = count_ngrams(et,TN, CF, 2) + + # Adjust the frequency of the edges to 0-1 range + ngdf$freq = round(ngdf$freq/max(ngdf$freq),3) + + # need to split 2-grams into from and to + from_to_str = str_split(str_trim(ngdf$ngrams), " ", n=2) + + # need to find a better way to do this... + nEdges = length(from_to_str) + # from_labels=matrix(data="", nrow=nEdges,ncol=1) + # to_labels =matrix(data="", nrow=nEdges,ncol=1) + # from=integer(nEdges) + # to=integer(nEdges) + # for (i in 1:length(from_to_str)){ + # + # # Get from and to by spliting the 2-gram + # from_labels[i] = str_split(from_to_str[[i]]," ")[1] + # to_labels[i] = str_split(from_to_str[[i]]," ")[2] + # + # # use match to lookup the nodeID from the label... + # from[i] = match(from_labels[i], nodes$label) + # to[i] = match(to_labels[i], nodes$label) + # } + # + # Stopped filtering out selfies July 20, 2019 for Kerstin Sailer bug report + edges = data.frame( + # from, + # to, + label = ngdf$freq, + Value =ngdf$freq) # %>% filter(!from==to) + + # print(paste("T2N nodes:",nodes)) + # print(paste("ngdf = :",ngdf)) + # print(paste("edges= :",edges)) + + return(list(nodeDF = nodes, edgeDF = edges)) +} diff --git a/Data Prep/ACHR_Clinic_Trajectories_v3.R b/Data Prep/ACHR_Clinic_Trajectories_v3.R index 48525c5..56376e1 100644 --- a/Data Prep/ACHR_Clinic_Trajectories_v3.R +++ b/Data Prep/ACHR_Clinic_Trajectories_v3.R @@ -8,6 +8,11 @@ # these functions will take clinic data and produce trajectories (distance of the graph from a reference graph) +# need to rethink how this selects occurrences in buckets. +# Should buckets always be by clinic_day?? This allows us to track what happens in each clinic over time. + +# We want to track roles over time in each clinic. So it's Clinic_day and also visit_role. 
We have threads by visit_role + # start with auditfinal_10022018.csv --> use ACHR_Batch_v2 to make new_occ. # The code here starts with new_occ (~ 7.72m occurrences that we break down into 1857 clinic_days and 57835 visits) @@ -22,20 +27,11 @@ library(philentropy) library(stringr) library(zoo) -# need function to count the number of procedures and number of diagnoses in the data. -# some visits have dozens of them -# need to pass in data frame that just has the column with the procedures or diagnoses -count_procs <- function(o){ - print(nrow(o)) - -unlist(lapply(1:nrow(o), function(i){ str_count(o[i], pattern='#@#') })) - -} # Use this to put the trajectory data with the clinic_day data merge_clinic_days_and_trajectories <- function(){ - clinic_days$Clinic_date = gsub('HH_POB','HHPOB',clinic_days$Clinic_date) + # clinic_days$Clinic_date = gsub('HH_POB','HHPOB',clinic_days$Clinic_date) cds=merge(x=test0,y=clinic_days,by.x='Clinic_ymd',by.y ='Clinic_date' ,all=TRUE) } @@ -58,31 +54,158 @@ get_bucket <- function(o, b ){ # @param e data frame for POV -# @param blist is the bucket list. Each bucket is a "window" +# @param bucket_CFs is the column used to define the bucket list, or the combination of columns needed to create the bucket. Each bucket is a "window" # @param cf is the column that defines events (e.g. 'Action_Role') -# @param filter list contains the threshold levels for filtering out edges with low frequency: c(0, 0.01, 0.02, 0.03, 0.04, 0.05, 0.1) -graph_trajectory_filtered <- function(e, cf, reference_day=1, filter_list,save_file_name) { +# @param filter list contains the threshold level for filtering out edges with low frequency. +# +# MAKE SURE YOU ARE READING IN A SET OF OCCURRENCES THAT HAS THE CORRECT THREADS... +# +graph_trajectory <- function(e, bucket_CFs, cf, n_gram_size=2, reference_day=1, filter_threshold=0,keep_ngram_vectors=FALSE, save_file_name='deleteme') { - # get total size of possible matrix - Max_Order = length(unique(e[[cf]]))^2 + # add column for bucket_CF if needed + bucket_col = newColName(bucket_CFs) + if (!(bucket_col %in% colnames(e))) { e = combineContextFactors(e,bucket_CFs,bucket_col) } + # here is the sorted list of buckets + blist = sort(unique(e[[bucket_col]])) + + # now many buckets? + nb = length(blist) + # print(blist) + print(paste('Number of buckets =',nb)) # make data frame for results - vt=data.frame( ngrams=character(), freq=integer(), bid=integer() ) + vt=data.frame( ngrams=character(), freq=integer(), bid=integer(),bname=character() ) - # here is the list of buckets - blist = unique(e[['Clinic_ymd']]) + bcount=0 + # scan through the data + for (b in blist){ + bcount= bcount +1 + # print(paste('b =',b)) + + # THIS CHECK NEEDS TO COVER WHOLE CHUNK OF CODE... + # get text vector for the whole data set. Bucket needs at least 2 threads + # n_gram_size = 1 for 1-grams, n_gram_size = 2 for pairs. 
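+      # (each bucket's n-gram frequency vector is later compared to the reference day's
+      # vector, and to the next bucket's, using cosine distance)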
+ th = e[ e[[bucket_col]] ==b , ] + + if (nrow(th)>3) + { + ngdf = count_ngrams(th, 'threadNum', cf, n_gram_size)[1:2] + + + nodes = length(unique(th[[cf]])) + + + # print(paste('nrow ngdf =',nrow(ngdf))) + + # add the bucket number and name + ngdf$bid = bcount + ngdf$bname = b + + # append the columns to the end + # vt is the whole set of all ngrams in all the windows + vt=rbind(vt,ngdf) + } + } + + + # convert to factor + vt$ngrams = factor(vt$ngrams) + + # nWindows = length(unique(vt$bid)) + nWindows = nb + #print(paste('nWindows =',nWindows)) + + # get the set of unique ngrams for the whole data set + vt_unique = data.frame(ngrams=unique(vt$ngrams)) + #print(vt_unique) + + # put the results here + windowFreqMatrix = matrix(0,nrow=nWindows, ncol=nrow(vt_unique)) + + for (i in 1:nWindows){ + + # get the merged listunqi + vtmerge = merge(x=vt_unique, y=vt[vt$bid==i,], by='ngrams', all.x = TRUE) + + # use the bid.y to get the whole vector, but replace the NA with zeros + bb=vtmerge[vtmerge$bid==i,'freq'] + bb[is.na(bb)] <- 0 + + windowFreqMatrix[i,]=bb + + print( paste(i, 'num non-zero=', sum(windowFreqMatrix[i,] > 0))) + + } + + # #Make a matrix to filter by where all the entries are 0 < f < 1 + # fm = windowFreqMatrix/max(windowFreqMatrix) + # + # # if the number if below the threshold, set it to zero + # windowFreqMatrix[fm<=filter_threshold] = 0 - # take the underscore out so we can use it to strsplit below... and sort it - blist=gsub('HH_POB','HHPOB',blist) - blist = sort(blist) + df = data.table( + Clinic_ymd = character(nWindows), + Clinic = character(nWindows), + # Role_ymd = character(nWindows), + # Role = character(nWindows), + ymd = character(nWindows), + pct_retained = double(nWindows), + complexity = double(nWindows), + Dist_from_reference = double(nWindows), + Dist_from_next = double(nWindows) + ) + for (i in 1:(nWindows-1) ) { + + df[i, Clinic_ymd := blist[i] ] + df[i, Clinic := unlist(strsplit(blist[i],'_',fixed=TRUE))[1] ] + df[i, ymd := unlist(strsplit(blist[i],'_',fixed=TRUE))[2] ] + + # df[i, Role_ymd := as.character(blist[i]) ] + # df[i, Role := unlist(strsplit(as.character(blist[i]),'+',fixed=TRUE))[1] ] + # df[i, ymd := unlist(strsplit(as.character(blist[i]),'+',fixed=TRUE))[2] ] + + df[i, pct_retained := sum((windowFreqMatrix[i,]>0)+(windowFreqMatrix[i+1,]>0)==2)/sum(windowFreqMatrix[i,]>0) ] + df[i, complexity := estimate_task_complexity_index( nodes ,sum(windowFreqMatrix[i,] > 0) ) ] + df[i, Dist_from_reference := distance(rbind(windowFreqMatrix[i,],windowFreqMatrix[reference_day,]), method='cosine' ) ] + df[i, Dist_from_next := distance(rbind(windowFreqMatrix[i,],windowFreqMatrix[i+1,]), method='cosine' ) ] + } + + + # get the ngram data and labels + if (keep_ngram_vectors) { + b_df=as.data.frame(windowFreqMatrix[1:(nWindows-1),]) + colnames(b_df)=vt_unique$ngrams + + # stick the ngram frequencies on the end for good measure + df = cbind(df,b_df) + } + + # save the result and return it, too + save(df, file=paste0(save_file_name,'.rData') ) + + return(df) + +} +################################################################################## + +graph_trajectory_ROLES <- function(e, bucket_CFs, cf, n_gram_size=2, reference_day=1, filter_threshold=0,keep_ngram_vectors=FALSE, save_file_name='deleteme') { + + # add column for bucket_CF if needed + bucket_col = newColName(bucket_CFs) + if (!(bucket_col %in% colnames(e))) { e = combineContextFactors(occ,bucket_CFs,bucket_col) } + + # here is the sorted list of buckets + blist = sort(unique(e[[bucket_col]])) + # now 
many buckets? nb = length(blist) - print(blist) - print(paste('nb =',nb)) + # print(blist) + print(paste('Number of buckets =',nb)) - complexity_idx=list(nb) + # make data frame for results + vt=data.frame( ngrams=character(), freq=integer(), bid=integer(),bname=character() ) bcount=0 # scan through the data @@ -90,9 +213,13 @@ graph_trajectory_filtered <- function(e, cf, reference_day=1, filter_list,save_ bcount= bcount +1 # print(paste('b =',b)) - # get text vector for the whole data set. Bucket needs at least 2 threads - th = get_bucket(e, b) - if (nrow(th)>2) { ngdf = count_ngrams(th, 'threadNum', cf, 2)[1:2] } + # THIS CHECK NEEDS TO COVER WHOLE CHUNK OF CODE... + # get text vector for the whole data set. Bucket needs at least 2 threads + # n_gram_size = 1 for 1-grams, n_gram_size = 2 for pairs. + th = e[ e[[bucket_col]] ==b , ] + + if (nrow(th)>3) { ngdf = count_ngrams(th, 'threadNum', cf, n_gram_size)[1:2] } + nodes = length(unique(th[[cf]])) @@ -101,13 +228,12 @@ graph_trajectory_filtered <- function(e, cf, reference_day=1, filter_list,save_ # add the bucket number and name ngdf$bid = bcount - # ngdf$bname = b + ngdf$bname = b # append the columns to the end # vt is the whole set of all ngrams in all the windows vt=rbind(vt,ngdf) } - print(head(complexity_idx)) # convert to factor @@ -126,7 +252,7 @@ graph_trajectory_filtered <- function(e, cf, reference_day=1, filter_list,save_ for (i in 1:nWindows){ - # get the merged list + # get the merged listunqi vtmerge = merge(x=vt_unique, y=vt[vt$bid==i,], by='ngrams', all.x = TRUE) # use the bid.y to get the whole vector, but replace the NA with zeros @@ -139,53 +265,272 @@ graph_trajectory_filtered <- function(e, cf, reference_day=1, filter_list,save_ } - #Make a matrix to filter by where all the entries are 0 < f < 1 - fm = windowFreqMatrix/max(windowFreqMatrix) - - # Now filter out the edges - for (f in filter_list) { - - # if the number if below the threshold, set it to zero - windowFreqMatrix[fm<=f] = 0 - - - # correlate each row with the first one stick it in a dataframe - df =data.frame(window=1:(nWindows-1), - Clinic_ymd = blist[1:(nWindows-1)], - Clinic= unlist(lapply(1:(nWindows-1), - function(i){ unlist(strsplit(blist[i],'_'))[1] })), - ymd=unlist(lapply(1:(nWindows-1), - function(i){ unlist(strsplit(blist[i],'_'))[2] })), - pct_retained = unlist(lapply(1:(nWindows-1), - function(i){sum((windowFreqMatrix[i,]>0)+(windowFreqMatrix[i+1,]>0)==2)/ - sum(windowFreqMatrix[i,]>0) - })) , - pct_possible = unlist(lapply(1:(nWindows-1), - function(i){sum(windowFreqMatrix[i,] > 0)/Max_Order })) , - complexity = unlist(lapply(1:(nWindows-1), - function(i){ estimate_task_complexity_index( nodes ,sum(windowFreqMatrix[i,] > 0) ) })) , - Dist_from_reference= unlist(lapply(1:(nWindows-1), - function(i){distance(rbind(windowFreqMatrix[i,],windowFreqMatrix[reference_day,]), - method='cosine' ) })), - Dist_from_next= unlist(lapply(1:(nWindows-1), - function(i){distance(rbind(windowFreqMatrix[i,],windowFreqMatrix[i+1,]), - method='cosine' ) })) - ) - - save(df, file=paste0(save_file_name,f,'.rData') ) - - } # f in filter list - - return(df) + # #Make a matrix to filter by where all the entries are 0 < f < 1 + # fm = windowFreqMatrix/max(windowFreqMatrix) + # + # # if the number if below the threshold, set it to zero + # windowFreqMatrix[fm<=filter_threshold] = 0 - # get the ngram data and labels - # b_df=as.data.frame(windowFreqMatrix[1:(nWindows-1),]) - # colnames(b_df)=vt_unique$ngrams - # - # # stick the ngram frequencies on the end for good 
measure - # return(cbind(df,b_df)) + df = data.table( + # Clinic_ymd = character(nWindows), + # Clinic = character(nWindows), + Role_ymd = character(nWindows), + Role = character(nWindows), + ymd = character(nWindows), + pct_retained = double(nWindows), + complexity = double(nWindows), + Dist_from_reference = double(nWindows), + Dist_from_next = double(nWindows) + ) + + for (i in 1:(nWindows-1) ) { + + # df[i, Clinic_ymd := blist[i] ] + # df[i, Clinic := unlist(strsplit(blist[i],'_'))[1] ] + # df[i, ymd := unlist(strsplit(blist[i],'_'))[2] ] + + df[i, Role_ymd := as.character(blist[i]) ] + df[i, Role := unlist(strsplit(as.character(blist[i]),'+',fixed=TRUE))[1] ] + df[i, ymd := unlist(strsplit(as.character(blist[i]),'+',fixed=TRUE))[2] ] + + + df[i, pct_retained := sum((windowFreqMatrix[i,]>0)+(windowFreqMatrix[i+1,]>0)==2)/sum(windowFreqMatrix[i,]>0) ] + df[i, complexity := estimate_task_complexity_index( nodes ,sum(windowFreqMatrix[i,] > 0) ) ] + df[i, Dist_from_reference := distance(rbind(windowFreqMatrix[i,],windowFreqMatrix[reference_day,]), method='cosine' ) ] + df[i, Dist_from_next := distance(rbind(windowFreqMatrix[i,],windowFreqMatrix[i+1,]), method='cosine' ) ] + } + + + # get the ngram data and labels + if (keep_ngram_vectors) { + b_df=as.data.frame(windowFreqMatrix[1:(nWindows-1),]) + colnames(b_df)=vt_unique$ngrams + + # stick the ngram frequencies on the end for good measure + df = cbind(df,b_df) + } + + # save the result and return it, too + save(df, file=paste0(save_file_name,'.rData') ) + + return(df) } +################################################################################## +# e = otr, needs to be set up to include threads that are visit_ID_Role +# there must be a 'threadNum" assigned in each row +# +# cf is the context factor we want to follow, such as 'Action' + + +get_ngrams_for_buckets <- function(e, cf, n_gram_size=2, filter_threshold=2, save_file_name='deleteme') { + + + + # here is the sorted list of buckets + blist = sort(unique(e$threadNum)) + + + # now many buckets? + nb = length(blist) + # print(blist) + print(paste('Number of buckets =',nb)) + + # make data frame for results + vt=data.frame( ngrams=character(), freq=integer(), bid=integer(),bname=character() ) + + bcount=0 + # scan through the data + for (b in blist){ + bcount= bcount +1 + # print(paste('b =',b)) + + + # n_gram_size = 1 for 1-grams, n_gram_size = 2 for pairs. 
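+  # Note: here filter_threshold is the minimum number of occurrence rows a thread
+  # must have in order to be counted (nrow(th) > filter_threshold), not the
+  # edge-frequency cutoff used by the graph_trajectory* functions above.
+  # From its use below, count_ngrams() is assumed to return a data frame whose
+  # first two columns are the n-gram label and its frequency; only those two
+  # columns are kept, e.g. (illustrative values only):
+  #
+  #   ngrams               freq
+  #   "Check_In Rooming"      3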
+ th = e[ e[['threadNum']] ==b , ] + + if (nrow(th)>filter_threshold) { ngdf = count_ngrams(th, 'threadNum', cf, n_gram_size)[1:2] + + # add the bucket number and name + ngdf$bid = bcount + ngdf$bname = b + + # append the columns to the end + # vt is the whole set of all ngrams in all the windows + vt=rbind(vt,ngdf) + } + } + + + # convert to factor + vt$ngrams = factor(vt$ngrams) + + # nWindows = length(unique(vt$bid)) + nWindows = nb + #print(paste('nWindows =',nWindows)) + + # get the set of unique ngrams for the whole data set + vt_unique = data.frame(ngrams=unique(vt$ngrams)) + #print(vt_unique) + + # put the results here + windowFreqMatrix = matrix(0,nrow=nWindows, ncol=nrow(vt_unique)) + + for (i in 1:nWindows){ + + # get the merged listunqi + vtmerge = merge(x=vt_unique, y=vt[vt$bid==i,], by='ngrams', all.x = TRUE) + + # use the bid.y to get the whole vector, but replace the NA with zeros + bb=vtmerge[vtmerge$bid==i,'freq'] + bb[is.na(bb)] <- 0 + + windowFreqMatrix[i,]=bb + + print( paste(i, 'num non-zero=', sum(windowFreqMatrix[i,] > 0))) + + } + + + # get the ngram data and labels + # convert matrix to data frame + # df=as.data.frame(windowFreqMatrix[1:(nWindows-1),]) + df=as.data.frame(windowFreqMatrix) + + # add the columnnames + colnames(df)=vt_unique$ngrams + + # add the bucket names + df = cbind(as.data.frame(blist), df) + colnames(df)[1] <- c("threadNum") + + + + # save the result and return it, too + #save(df, file=paste0(save_file_name,'.rData') ) + + return(df) + +} +################################################################################## +graph_trajectory_week <- function(e, bucket_CFs, cf, n_gram_size=2, reference_day=1, filter_threshold=0,keep_ngram_vectors=FALSE, save_file_name='deleteme') { + + # add column for bucket_CF if needed + bucket_col = newColName(bucket_CFs) + if (!(bucket_col %in% colnames(e))) { e = combineContextFactors(occ,bucket_CFs,bucket_col) } + + # here is the sorted list of buckets + blist = sort(unique(e[[bucket_col]])) + + # now many buckets? + nb = length(blist) + # print(blist) + print(paste('Number of buckets =',nb)) + + # make data frame for results + vt=data.frame( ngrams=character(), freq=integer(), bid=integer(),bname=character() ) + + bcount=0 + # scan through the data + for (b in blist){ + bcount= bcount +1 + # print(paste('b =',b)) + + # THIS CHECK NEEDS TO COVER WHOLE CHUNK OF CODE... + # get text vector for the whole data set. Bucket needs at least 2 threads + # n_gram_size = 1 for 1-grams, n_gram_size = 2 for pairs. 
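+      # As the note above says, the nrow(th) > 3 check below wraps only the
+      # count_ngrams() call; the lines that set ngdf$bid / ngdf$bname and rbind
+      # onto vt still run, so a bucket that is too small would re-append the
+      # previous bucket's ngdf. A minimal sketch of the guarded version,
+      # mirroring graph_trajectory() above and assuming nothing else changes:
+      #
+      #   th = e[ e[[bucket_col]] == b , ]
+      #   if (nrow(th) > 3) {
+      #     ngdf  = count_ngrams(th, 'threadNum', cf, n_gram_size)[1:2]
+      #     nodes = length(unique(th[[cf]]))
+      #     ngdf$bid   = bcount
+      #     ngdf$bname = b
+      #     vt = rbind(vt, ngdf)
+      #   }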
+ th = e[ e[[bucket_col]] ==b , ] + + if (nrow(th)>3) { ngdf = count_ngrams(th, 'threadNum', cf, n_gram_size)[1:2] } + + + nodes = length(unique(th[[cf]])) + + + # print(paste('nrow ngdf =',nrow(ngdf))) + + # add the bucket number and name + ngdf$bid = bcount + ngdf$bname = b + + # append the columns to the end + # vt is the whole set of all ngrams in all the windows + vt=rbind(vt,ngdf) + } + + + # convert to factor + vt$ngrams = factor(vt$ngrams) + + # nWindows = length(unique(vt$bid)) + nWindows = nb + #print(paste('nWindows =',nWindows)) + + # get the set of unique ngrams for the whole data set + vt_unique = data.frame(ngrams=unique(vt$ngrams)) + #print(vt_unique) + + # put the results here + windowFreqMatrix = matrix(0,nrow=nWindows, ncol=nrow(vt_unique)) + + for (i in 1:nWindows){ + + # get the merged list + vtmerge = merge(x=vt_unique, y=vt[vt$bid==i,], by='ngrams', all.x = TRUE) + + # use the bid.y to get the whole vector, but replace the NA with zeros + bb=vtmerge[vtmerge$bid==i,'freq'] + bb[is.na(bb)] <- 0 + + windowFreqMatrix[i,]=bb + + print( paste(i, 'num non-zero=', sum(windowFreqMatrix[i,] > 0))) + + } + + # #Make a matrix to filter by where all the entries are 0 < f < 1 + # fm = windowFreqMatrix/max(windowFreqMatrix) + # + # # if the number if below the threshold, set it to zero + # windowFreqMatrix[fm<=filter_threshold] = 0 + + df = data.table( + Clinic_week = character(nWindows), + Clinic = character(nWindows), + ymd = character(nWindows), + pct_retained = double(nWindows), + complexity = double(nWindows), + Dist_from_reference = double(nWindows), + Dist_from_next = double(nWindows) + ) + + for (i in 1:(nWindows-1) ) { + df[i, Clinic_week := blist[i] ] + # df[i, Clinic_ymd := blist[i] ] + # df[i, Clinic := unlist(strsplit(blist[i],'_'))[1] ] + # df[i, ymd := unlist(strsplit(blist[i],'_'))[2] ] + df[i, pct_retained := sum((windowFreqMatrix[i,]>0)+(windowFreqMatrix[i+1,]>0)==2)/sum(windowFreqMatrix[i,]>0) ] + df[i, complexity := estimate_task_complexity_index( nodes ,sum(windowFreqMatrix[i,] > 0) ) ] + df[i, Dist_from_reference := distance(rbind(windowFreqMatrix[i,],windowFreqMatrix[reference_day,]), method='cosine' ) ] + df[i, Dist_from_next := distance(rbind(windowFreqMatrix[i,],windowFreqMatrix[i+1,]), method='cosine' ) ] + } + + + # get the ngram data and labels + if (keep_ngram_vectors) { + b_df=as.data.frame(windowFreqMatrix[1:(nWindows-1),]) + colnames(b_df)=vt_unique$ngrams + + # stick the ngram frequencies on the end for good measure + df = cbind(df,b_df) + } + + # save the result and return it, too + save(df, file=paste0(save_file_name,'.rData') ) + + return(df) + +} \ No newline at end of file diff --git a/Data Prep/Comparison_functions.R b/Data Prep/Comparison_functions.R new file mode 100644 index 0000000..4c962f9 --- /dev/null +++ b/Data Prep/Comparison_functions.R @@ -0,0 +1,50 @@ +########################################################################################################## +# THREADNET: Batch processing for larger data sets + +# (c) 2017 Michigan State University. This software may be used according to the terms provided in the +# GNU General Public License (GPL-3.0) https://opensource.org/licenses/GPL-3.0? +# Absolutely no warranty! 
+########################################################################################################## + + +library(dplyr) + + + + +# o is the list of occurrences +# m is the metric we want to see +routine_metric <- function(o, cf, m){ + + switch(m, + len = nrow(o), + lex_size = length(unique(o[[cf]])), + first_order = table(o[[cf]]), + first_order_sort = sort( table(o[[cf]]), decreasing = T) + + ) + +} + +get_range <- function(o, start_date, end_date){ + + o %>% filter(as.Date(tStamp) > start_date & as.Date(tStamp) < end_date) + +} + + +# get start dates -- just a place to stick some code +# April 2019 + +get_start_dates <- function(){ + + # first get the roles that are relativeluy frequent + role_freq2 = as.data.frame(table(ot$Role_ID)[table(ot$Role_ID)>1000]) + + # then get their start dates +role_freq2$start_date = sapply(role_freq2$Var1, function(x) {min(as.character(ot$tStamp[ot$Role_ID==as.character(x)]))}) + + # then trim it so it's more readable. +role_freq2$start_date = substr(role_freq2$start_date,1,10) + +} \ No newline at end of file diff --git a/ParallelTest/ptest.R b/ParallelTest/ptest.R new file mode 100644 index 0000000..3957931 --- /dev/null +++ b/ParallelTest/ptest.R @@ -0,0 +1,26 @@ +########################################################################################################## +# THREADNET: Batch processing for larger data sets + +# (c) 2017 Michigan State University. This software may be used according to the terms provided in the +# GNU General Public License (GPL-3.0) https://opensource.org/licenses/GPL-3.0? +# Absolutely no warranty! +########################################################################################################## + +library(parallel) + +ptest <-function(o) { + x<<-0 + oo<<-o + + l=1:nrow(o) + # return( sapply(l,f1 ) ) + + new_o = cbind(o, sapply(l,f1 ) ) + return(new_o) +} + +f1 <- function(r){ + x<<-x+1 + return(as.character(oo[r,5])) +} + diff --git a/Stats/precedence_functions.R b/Stats/precedence_functions.R new file mode 100644 index 0000000..888851e --- /dev/null +++ b/Stats/precedence_functions.R @@ -0,0 +1,38 @@ + +# need to test if this is working correctly +# It should be counting the number of times that each element in the lexicon precedes each other element. + +# make some data +S=lapply(1:10,function(x) {c('a','b','a','c',sample(letters[5:6],1+round(runif(1)*10),replace=TRUE)) }) + +# take list of sequences, S +precedes <- function(S){ + + # Get lexicon + lex = sort(unique(unlist(S))) + + # for each element in the lexicon... + result = matrix(data = sapply(lex, function(a){ + + # Find its position(s) in each sequence. 
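+  # pos below is built with grep(a, s), which treats a as a regular expression,
+  # so with longer labels one element could match inside another (hypothetical
+  # example: "Room" would also match "Rooming"); which(s == a) gives exact
+  # matches. str_detect() further down comes from stringr, which this file never
+  # loads, so library(stringr) at the top (or stringr::str_detect) is assumed.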
pos is a list of vectors + pos=sapply(S, function(s) {grep(a,s)}) + + # now for the last position in each string, find the lexical elements that precede it + # get the substrings + subS = mapply(function(p,s){if (max(p)>1) s[1:max(p)]},pos, S) + + # Now find all of the other lexical elements that precede a in each subS + colSums(sapply(lex,function(x){str_detect(subS,x)})) } + ) , + + ncol=length(lex)) + + rownames(result) = lex + colnames(result) = lex + + # normalize by the number of sequences,so "always" = 1 + result = result/length(S) + + return(result) + +} \ No newline at end of file diff --git a/Tables and Figures/Clinic_day_plots.R b/Tables and Figures/Clinic_day_plots.R index e0fb1ac..b105277 100644 --- a/Tables and Figures/Clinic_day_plots.R +++ b/Tables and Figures/Clinic_day_plots.R @@ -34,11 +34,234 @@ plots_for_papers <- function(){ plot(cds$Clinic.x,cds$A_NetComplexity) # Clinic change over time, with and without smoothing - ggplot(data = cds, aes(x = ymd, y = corr_with_first, group=Clinic)) + geom_line(aes(color=Clinic)) - ggplot(data = cds, aes(x = ymd, y = rollmean(corr_with_first,5,na.pad=TRUE), group=Clinic)) + geom_line(aes(color=Clinic)) + ggplot(data = cds, aes(x = ymd, y = Dist_from_reference, group=Clinic)) + geom_line(aes(color=Clinic)) + ggplot(data = cds, aes(x = ymd, y = rollmean(Dist_from_reference,5,na.pad=TRUE), group=Clinic)) + geom_line(aes(color=Clinic)) + # Dec 6th version + ggplot(data = cdt, aes(x = ymd.x, y = Dist_from_reference, group=Clinic.x)) + + geom_line(aes(color=Clinic.x)) + theme(axis.text.x=element_blank(), axis.text.y=element_blank()) + + + ggplot(data = cds_435_1 %>% filter(Phase=='one'), aes(x = ymd.x, y = rollmean( Dist_from_reference, 5,na.pad=TRUE), group=Clinic.x)) + + geom_line(aes(color=Clinic.x)) + theme(axis.text.x=element_blank()) + + ggplot(data = cds_435_1 %>% filter(Phase=='two'), aes(x = ymd.x, y = rollmean( Dist_from_reference, 5,na.pad=TRUE), group=Clinic.x)) + + geom_line(aes(color=Clinic.x)) + theme(axis.text.x=element_blank()) + + ggplot(data = cds_435_1 %>% filter(Phase=='three'), aes(x = ymd.x, y = rollmean( Dist_from_reference, 5,na.pad=TRUE), group=Clinic.x)) + + geom_line(aes(color=Clinic.x)) + theme(axis.text.x=element_blank()) + + ggplot(data = cds_435_1 %>% filter(Phase=='four'), aes(x = ymd.x, y = rollmean( Dist_from_reference, 5,na.pad=TRUE), group=Clinic.x)) + + geom_line(aes(color=Clinic.x)) + theme(axis.text.x=element_blank()) + + ggplot(data = cds_435_1 %>% filter(Phase=='five'), aes(x = ymd.x, y = rollmean( Dist_from_reference, 5,na.pad=TRUE), group=Clinic.x)) + + geom_line(aes(color=Clinic.x)) + theme(axis.text.x=element_blank()) + + # Here are the pairs + ggplot(data = cds_435 %>% filter(Phase=='one'), aes(x = ymd.x, y = rollmean( Dist_from_reference, 5,na.pad=TRUE), group=Clinic.x)) + + geom_line(aes(color=Clinic.x)) + theme(axis.text.x=element_blank()) + + ggplot(data = cds_435 %>% filter(Phase=='two'), aes(x = ymd.x, y = rollmean( Dist_from_reference, 5,na.pad=TRUE), group=Clinic.x)) + + geom_line(aes(color=Clinic.x)) + theme(axis.text.x=element_blank()) + + ggplot(data = cds_435 %>% filter(Phase=='three'), aes(x = ymd.x, y = rollmean( Dist_from_reference, 5,na.pad=TRUE), group=Clinic.x)) + + geom_line(aes(color=Clinic.x)) + theme(axis.text.x=element_blank()) + + ggplot(data = cds_435 %>% filter(Phase=='four'), aes(x = ymd.x, y = rollmean( Dist_from_reference, 5,na.pad=TRUE), group=Clinic.x)) + + geom_line(aes(color=Clinic.x)) + theme(axis.text.x=element_blank()) + + ggplot(data = cds_435 %>% 
filter(Phase=='five'), aes(x = ymd.x, y = rollmean( Dist_from_reference, 5,na.pad=TRUE), group=Clinic.x)) + + geom_line(aes(color=Clinic.x)) + theme(axis.text.x=element_blank()) + + # these are based on sub-sets of the data by phase + ggplot(data = cdp2, aes(x = ymd, y = rollmean( Dist_from_reference, 5,na.pad=TRUE), group=Clinic)) + + geom_line(aes(color=Clinic)) + theme(axis.text.x=element_blank(), axis.text.y=element_blank()) + + + + # ROle change over time + ggplot(data = cds, aes(x = ymd, y = Dist_from_reference, group=Role)) + geom_line(aes(color=Role)) + + # get the wait times and visit duration for each clinic + # convert to minutes. + a= visits %>% + group_by(Clinic) %>% + summarize(n = n(), + NEvents = mean(NEvents, na.rm = TRUE), + wait1 = mean(wait1*60, na.rm = TRUE), + sdwait1 = sd(wait1*60, na.rm = TRUE), + wait2 = mean(wait2*60, na.rm = TRUE), + sdwait2 = sd(wait2*60, na.rm = TRUE), + Duration = mean(VisitDuration, na.rm = TRUE) + ) + + # For some reason, the sd function was not working correctly above... do it again here. + visits %>% + group_by(Clinic) %>% + summarize(n = n(), + sdwait1 = sd(wait1*60, na.rm = TRUE), + sdwait2 = sd(wait2*60, na.rm = TRUE) + ) + + # get the precentage of actions by role... based on occurrences + Clinic_role_pct = otr %>% + group_by(Clinic, Role) %>% + summarize(n=n()) %>% + mutate(RolePct = n/sum(n)) %>% + select(Clinic, Role, RolePct) %>% + spread(Role, RolePct) + + # get LOS by clinic based on visits + Clinic_LOS_pct = visits %>% + group_by(Clinic, LOS_CPT) %>% + summarize(n=n()) %>% + mutate(LOSPct = n/sum(n)) %>% + spread(Clinic, Role) + + # Make contingency table Clinic x LOS + C_LOS = visits %>% + group_by(Clinic, LOS_CPT) %>% + summarize(n=n()) %>% + spread( LOS_CPT,n) + + # Convert NA to zero and then convert to table. Carefully remove rows/cols that make no sense + C_LOS= as.data.frame(C_LOS) + C_LOS[is.na(C_LOS)] =0 + C_LOS = matrix(unlist(C_LOS), nrow=6, ncol=20)[1:5,2:19] + chisq.test(C_LOS) + + +# Make contingency table Clinic x Role +C_R = otr %>% + group_by(Clinic, Role) %>% + summarize(n=n()) %>% + spread( Role, n) + +# Convert NA to zero +C_R = as.data.frame(C_R) +C_R[is.na(C_R)] =0 +C_R = matrix(unlist(C_R), nrow=5, ncol=10)[1:5,2:10] +chisq.test(C_R) + + Clinic_staffing = otr %>% + group_by(Clinic,Role) %>% + summarize(total_staff=n_distinct(Role_ID)) %>% + spread( Role, total_staff) + + + # Get the handoffs per visit. 
Use the distinct chunks in the Visit-Role threads (VRThrds) + Role_Handoffs = VRThreads %>% + group_by(Clinic,Visit_ID) %>% + summarize(ho = n_distinct(threadNumVR)) + + Role_Handoffs_by_clinic = Role_Handoffs %>% + group_by(Clinic) %>% + summarize(avg_ho = mean(ho), + median_ho = median(ho), + max_ho = max(ho), + sd_ho = sd(ho)) + + # get the average staffing levels per day + Clinic_daily_staffing = otr %>% + group_by(Clinic,ymd,Role) %>% + summarize(total_staff=n_distinct(Role_ID)) + + Clinic_AVG_daily_staffing = Clinic_daily_staffing %>% + group_by(Clinic,Role) %>% + summarize(AVG_staff=mean(total_staff, na.rm = TRUE)) %>% + spread( Role, AVG_staff) + + Clinic_AVG_daily_staffing = Clinic_daily_staffing %>% + group_by(Clinic,Role) %>% + summarize(AVG_staff=median(total_staff, na.rm = TRUE)) %>% + spread( Role, AVG_staff) + + # let's look at the actions by clinic_role, using the visit_role threads +actions_by_role = VRThreads %>% + group_by(Clinic,Role_VR) %>% + summarize(avgActions = mean(Action_countVR)) %>% + spread( Role_VR, avgActions) + +# Look at alignment by clinic +visits %>% + group_by(Clinic,Phase) %>% + summarize(w=max(ThreadDuration,na.rm = TRUE) ) %>% + spread( Phase, w) + + +# Get LOS by clinic-day +LOS_by_clinic_day = visits %>% + group_by(Clinic_ymd) %>% + summarize(los=mean(LOS,na.rm = TRUE) ) + + +# See if diagnoses differ by time period +Diag_by_phase = visits %>% + group_by(Phase,Diagnosis_Group) %>% + summarize(n=n() ) %>% + spread( Diagnosis_Group, n) + +# See if actions differ by time period +# cds_435_1 is a dataframe with one for for each clinic_day and ~300 columns for the actions +Action_by_phase = cds_435_1 %>% + group_by(Phase) %>% + summarize_at(8:307,median ) + } + +remove_outliers <- function(x, na.rm = TRUE, ...) { + qnt <- quantile(x, probs=c(.25, .75), na.rm = na.rm, ...) + H <- 1.5 * IQR(x, na.rm = na.rm) + y <- x + y[x < (qnt[1] - H)] <- NA + y[x > (qnt[2] + H)] <- NA + y +} + +# converts visit_ID into character string with factor levels for actions +make_visit_string <- function(occ, vid){ + + return(paste(as.character(as.numeric(occ$Action[occ$Visit_ID==as.character(vid)])),collapse = '-')) +} + +# Converts list of actions into regex of factor levels +make_process_pattern <- function(lvls, p){ + + return( paste( sapply(p, function(x) {as.character(which(lvls==x))}) , collapse = '.*') ) +} + +# Find process patterns +find_process_pattern <- function(occ, p, vid){ + + x = make_process_pattern( levels(ot$Action), p) + print(x) + + m = sapply(vid, function(v) { + (grepl( x, make_visit_string(occ, v))) } ) + # + + +# m = grep( x, make_visit_string(occ, vid)) + + return(m) + +} + +dualWindowCosine <- function(df, clinic_col, colrange, windowsize){ + + + rdf = data.frame(clinic = df[windowsize:(nrow(df)-windowsize),..clinic_col], + ymd = df[windowsize:(nrow(df)-windowsize),ymd], + corr = sapply(windowsize:(nrow(df)-windowsize), + function(x) { + distance(rbind( colSums(df[(x-windowsize+1):x,..colrange]),colSums(df[(x+1):(x+windowsize), ..colrange ]) ), + method='cosine' ) }) + ) + return(rdf) +} + + # # make_box_plots <- function(){ # ggboxplot(ACHR_test[NEvents>100 & Clinic=='DRH'], x = "VisitMonth", y = "NetComplexity",