jalammar · SumanthRH · Jan 22, 2024 · Jan 22, 2024 · Jan 22, 2024 · Jan 22, 2024
diff --git a/requirements.txt b/requirements.txt
@@ -1,11 +1,11 @@
-matplotlib~=3.3.1
-numpy~=1.19.1
-ipython~=7.16.1
-scikit-learn~=0.24.2
-seaborn~=0.11.0
-transformers~=4.6.1
-pytest~=6.1.2
-setuptools~=49.6.0
-torch~=1.9.0
-PyYAML==5.4.1
-captum==0.4.1
+matplotlib==3.8.2
+numpy==1.26.2
+ipython==8.18.1
+scikit-learn==1.3.2
+seaborn==0.13.0
+transformers==4.36.2
+pytest==7.4.3
+setuptools==68.2.2
+torch==2.1.1
+PyYAML==6.0.1
+captum==0.6.0
diff --git a/setup.py b/setup.py
@@ -65,11 +65,11 @@ def read(*names, **kwargs):
     ],
     python_requires='!=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*',
     install_requires=[
-        "transformers ~= 4.2",
-        "seaborn ~= 0.11",
-        "scikit-learn~=0.23",
-        "PyYAML~=5.4",
-        "captum ~= 0.4"
+        "transformers ~= 4.36",
+        "seaborn ~= 0.13",
+        "scikit-learn ~= 1.3",
+        "PyYAML ~= 6.0",
+        "captum ~= 0.6"
     ],
     extras_require={
         "dev": [

diff --git a/src/ecco/__init__.py b/src/ecco/__init__.py
@@ -26,7 +26,8 @@ def from_pretrained(hf_model_id: str,
                     hidden_states: Optional[bool] = True,
                     activations_layer_nums: Optional[List[int]] = None,
                     verbose: Optional[bool] = True,
-                    gpu: Optional[bool] = True
+                    gpu: Optional[bool] = True,
+                    **model_kwargs: Dict[str, Any]
                     ):
     """
     Constructs a [LM][ecco.lm.LM] object based on a string identifier from HuggingFace Transformers. This is
@@ -80,7 +81,7 @@ def from_pretrained(hf_model_id: str,
     else:
         model_cls = AutoModel
 
-    model = model_cls.from_pretrained(hf_model_id, output_hidden_states=hidden_states, output_attentions=attention)
+    model = model_cls.from_pretrained(hf_model_id, output_hidden_states=hidden_states, output_attentions=attention, **model_kwargs)
 
     lm_kwargs = {
         'model_name': hf_model_id,

diff --git a/src/ecco/lm.py b/src/ecco/lm.py
@@ -67,10 +67,6 @@ def __init__(self,
         if torch.cuda.is_available() and gpu:
             self.model = model.to('cuda')
 
-        self.device = 'cuda' if torch.cuda.is_available() \
-                                and self.model.device.type == 'cuda' \
-            else 'cpu'
-
         self.tokenizer = tokenizer
         self.verbose = verbose
         self._path = os.path.dirname(ecco.__file__)
@@ -104,6 +100,10 @@ def __init__(self,
         # we're running it before every d.HTML cell
         # d.display(d.HTML(filename=os.path.join(self._path, "html", "setup.html")))
 
+    @property
+    def device(self):
+        return self.model.device
+
     def _reset(self):
         self._all_activations_dict = defaultdict(dict)
         self.activations = defaultdict(dict)
@@ -114,9 +114,7 @@ def _reset(self):
         self._hooks = {}
 
     def to(self, tensor: Union[torch.Tensor, BatchEncoding]):
-        if self.device == 'cuda':
-            return tensor.to('cuda')
-        return tensor
+        return tensor.to(self.device)
 
     def _analyze_token(self,
                        encoder_input_embeds: torch.Tensor,
@@ -143,7 +141,7 @@ def _analyze_token(self,
                         'decoder_inputs_embeds': decoder_input_embeds
                     },
                     prediction_id=prediction_id
-                ).cpu().detach().numpy()
+                ).float().cpu().detach().numpy() # cast to float32 first: NumPy has no bfloat16 dtype, so .numpy() fails on bf16 tensors
             )
 
     def generate(self, input_str: str,
@@ -521,7 +519,7 @@ def _get_embeddings(self, input_ids) -> Tuple[torch.FloatTensor, torch.FloatTens
 
         vocab_size = embedding_matrix.shape[0]
 
-        one_hot_tensor = self.to(_one_hot_batched(input_ids, vocab_size))
+        one_hot_tensor = self.to(_one_hot_batched(input_ids, vocab_size)).to(self.model.dtype)
         token_ids_tensor_one_hot = one_hot_tensor.clone().requires_grad_(True)
 
         inputs_embeds = torch.matmul(token_ids_tensor_one_hot, embedding_matrix)
@@ -576,7 +574,7 @@ def _get_activations_hook(self, name: str, input_):
         # (?<=\.) means look for a period before the int
         # \d+ means look for one or multiple digits
         # (?=\.) means look for a period after the int
-        layer_number = re.search("(?<=\.)\d+(?=\.)", name).group(0)
+        layer_number = re.search(r"(?<=\.)\d+(?=\.)", name).group(0)
         layer_type = 'encoder' if name.startswith('encoder.') else 'decoder'
         # print("layer number: ", layer_number)
 
@@ -593,15 +591,15 @@ def _get_activations_hook(self, name: str, input_):
             # overwrite the previous step activations. This collects all activations in the last step
             # Assuming all input tokens are presented as input, no "past"
             # The inputs to c_proj already pass through the gelu activation function
-            self._all_activations_dict[layer_type][layer_number] = input_[0].detach().cpu().numpy()
+            self._all_activations_dict[layer_type][layer_number] = input_[0].detach().float().cpu().numpy()
 
     def _inhibit_neurons_hook(self, name: str, input_tensor):
         """
         After being attached as a pre-forward hook, it sets to zero the activation value
         of the neurons indicated in self.neurons_to_inhibit
         """
 
-        layer_number = re.search("(?<=\.)\d+(?=\.)", name).group(0)
+        layer_number = re.search(r"(?<=\.)\d+(?=\.)", name).group(0)
         if layer_number in self.neurons_to_inhibit.keys():
             # print('layer_number', layer_number, input_tensor[0].shape)
 
@@ -729,10 +727,11 @@ def sample_output_token(scores, do_sample, temperature, top_k, top_p):
         if temperature != 1.0:
             scores = scores / temperature
         # Top-p/top-k filtering
-        next_token_logscores = transformers.generation_utils. \
-            top_k_top_p_filtering(scores,
-                                  top_k=top_k,
-                                  top_p=top_p)
+        next_token_logscores = transformers.top_k_top_p_filtering(
+                scores,
+                top_k=top_k,
+                top_p=top_p
+        )
         # Sample
         probs = F.softmax(next_token_logscores, dim=-1)
 

diff --git a/src/ecco/model-config.yaml b/src/ecco/model-config.yaml
@@ -342,3 +342,26 @@ EleutherAI/gpt-neo-2.7B:
     - 'mlp\.c_proj'
     token_prefix: ' '
     partial_token_prefix: ''
+
+# Llama
+openlm-research/open_llama_3b:
+    embedding: "model.embed_tokens"
+    type: 'causal'
+    activations: 
+    - 'mlp\.up_proj' #This is a regex
+    token_prefix: '▁'
+    partial_token_prefix: ''
+meta-llama/Llama-2-7b:
+    embedding: "model.embed_tokens"
+    type: 'causal'
+    activations: 
+    - 'mlp\.up_proj' #This is a regex
+    token_prefix: '▁'
+    partial_token_prefix: ''
+meta-llama/Llama-2-13b:
+    embedding: "model.embed_tokens"
+    type: 'causal'
+    activations: 
+     - 'mlp\.up_proj' #This is a regex
+    token_prefix: '▁'
+    partial_token_prefix: ''
diff --git a/src/ecco/output.py b/src/ecco/output.py
@@ -112,9 +112,7 @@ def __str__(self):
         return "<LMOutput '{}' # of lm outputs: {}>".format(self.output_text, len(self._get_hidden_states()[1][-1]))
 
     def to(self, tensor: torch.Tensor):
-        if self.device == 'cuda':
-            return tensor.to('cuda')
-        return tensor
+        return tensor.to(self.device)
 
     def explorable(self, printJson: Optional[bool] = False):
 
@@ -394,7 +392,7 @@ def layer_predictions(self, position: int = 1, topk: Optional[int] = 10, layer:
 
             layer_top_tokens = [self.tokenizer.decode(t) for t in sorted_softmax[-k:]][::-1]
             top_tokens.append(layer_top_tokens)
-            layer_probs = softmax[sorted_softmax[-k:]].cpu().detach().numpy()[::-1]
+            layer_probs = softmax[sorted_softmax[-k:]].float().cpu().detach().numpy()[::-1]
             probs.append(layer_probs.tolist())
 
             # Package in output format

diff --git a/tests/lm_test.py b/tests/lm_test.py
@@ -2,7 +2,7 @@
 import ecco
 import torch
 import numpy as np
-from transformers import PreTrainedModel 
+from transformers import PreTrainedModel
 
 
 class TestLM:
@@ -58,6 +58,10 @@ def test_call_dummy_bert(self):
         # If we do require padding, this CUDA compains with this model for some reason.
         assert output.activations['encoder'].shape == (2, 1, 40, 3)
 
+    def test_half_prec(self):
+        # Extra keyword arguments (here torch_dtype) should be forwarded to the underlying model's from_pretrained call
+        lm = ecco.from_pretrained('sshleifer/tiny-gpt2', activations=True, torch_dtype=torch.bfloat16)
+        assert lm.model.dtype == torch.bfloat16, f"Model dtype should be Bfloat16, got {lm.model.dtype}"
 
     # TODO: Test LM Generate with Activation. Tweak to support batch dimension.
     # def test_generate_token_no_attribution(self, mocker):