-
Notifications
You must be signed in to change notification settings - Fork 15k
[IR2Vec] Refactor vocabulary to use section-based storage #158376
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
svkeerthy
wants to merge
1
commit into
users/svkeerthy/09-03-support_predicates
Choose a base branch
from
users/svkeerthy/09-12-vocabstorage
base: users/svkeerthy/09-03-support_predicates
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
+571
−122
Open
Changes from all commits
Commits
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -45,6 +45,7 @@ | |
#include "llvm/Support/JSON.h" | ||
#include <array> | ||
#include <map> | ||
#include <optional> | ||
|
||
namespace llvm { | ||
|
||
|
@@ -144,6 +145,73 @@ struct Embedding { | |
using InstEmbeddingsMap = DenseMap<const Instruction *, Embedding>; | ||
using BBEmbeddingsMap = DenseMap<const BasicBlock *, Embedding>; | ||
|
||
/// Generic storage class for section-based vocabularies. | ||
/// VocabStorage provides a generic foundation for storing and accessing | ||
/// embeddings organized into sections. | ||
class VocabStorage { | ||
private: | ||
/// Section-based storage | ||
std::vector<std::vector<Embedding>> Sections; | ||
|
||
size_t TotalSize = 0; | ||
unsigned Dimension = 0; | ||
|
||
public: | ||
/// Default constructor creates empty storage (invalid state) | ||
VocabStorage() : Sections(), TotalSize(0), Dimension(0) {} | ||
|
||
/// Create a VocabStorage with pre-organized section data | ||
VocabStorage(std::vector<std::vector<Embedding>> &&SectionData); | ||
|
||
VocabStorage(VocabStorage &&) = default; | ||
VocabStorage &operator=(VocabStorage &&Other); | ||
|
||
VocabStorage(const VocabStorage &) = delete; | ||
VocabStorage &operator=(const VocabStorage &) = delete; | ||
|
||
/// Get total number of entries across all sections | ||
size_t size() const { return TotalSize; } | ||
|
||
/// Get number of sections | ||
unsigned getNumSections() const { | ||
return static_cast<unsigned>(Sections.size()); | ||
} | ||
|
||
/// Section-based access: Storage[sectionId][localIndex] | ||
const std::vector<Embedding> &operator[](unsigned SectionId) const { | ||
assert(SectionId < Sections.size() && "Invalid section ID"); | ||
return Sections[SectionId]; | ||
} | ||
|
||
/// Get vocabulary dimension | ||
unsigned getDimension() const { return Dimension; } | ||
|
||
/// Check if vocabulary is valid (has data) | ||
bool isValid() const { return TotalSize > 0; } | ||
|
||
/// Iterator support for section-based access | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. do you need this? |
||
class const_iterator { | ||
const VocabStorage *Storage; | ||
unsigned SectionId; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. initialize SectionId and LocalIndex at declaration. |
||
size_t LocalIndex; | ||
|
||
public: | ||
const_iterator(const VocabStorage *Storage, unsigned SectionId, | ||
size_t LocalIndex) | ||
: Storage(Storage), SectionId(SectionId), LocalIndex(LocalIndex) {} | ||
|
||
LLVM_ABI const Embedding &operator*() const; | ||
LLVM_ABI const_iterator &operator++(); | ||
LLVM_ABI bool operator==(const const_iterator &Other) const; | ||
LLVM_ABI bool operator!=(const const_iterator &Other) const; | ||
}; | ||
|
||
const_iterator begin() const { return const_iterator(this, 0, 0); } | ||
const_iterator end() const { | ||
return const_iterator(this, getNumSections(), 0); | ||
} | ||
}; | ||
|
||
/// Class for storing and accessing the IR2Vec vocabulary. | ||
/// The Vocabulary class manages seed embeddings for LLVM IR entities. The | ||
/// seed embeddings are the initial learned representations of the entities | ||
|
@@ -164,7 +232,7 @@ using BBEmbeddingsMap = DenseMap<const BasicBlock *, Embedding>; | |
class Vocabulary { | ||
friend class llvm::IR2VecVocabAnalysis; | ||
|
||
// Vocabulary Slot Layout: | ||
// Vocabulary Layout: | ||
// +----------------+------------------------------------------------------+ | ||
// | Entity Type | Index Range | | ||
// +----------------+------------------------------------------------------+ | ||
|
@@ -175,8 +243,16 @@ class Vocabulary { | |
// Note: "Similar" LLVM Types are grouped/canonicalized together. | ||
// Operands include Comparison predicates (ICmp/FCmp). | ||
// This can be extended to include other specializations in future. | ||
using VocabVector = std::vector<ir2vec::Embedding>; | ||
VocabVector Vocab; | ||
/// Identifiers for the vocabulary sections. Values are explicit and
/// contiguous from 0; MaxSections directly follows the last real section and
/// therefore equals the number of sections.
enum class Section : unsigned {
  Opcodes = 0,
  CanonicalTypes = 1,
  Operands = 2,
  Predicates = 3,
  MaxSections
};
|
||
// Use section-based storage for better organization and efficiency | ||
VocabStorage Storage; | ||
|
||
static constexpr unsigned NumICmpPredicates = | ||
static_cast<unsigned>(CmpInst::LAST_ICMP_PREDICATE) - | ||
|
@@ -228,9 +304,18 @@ class Vocabulary { | |
NumICmpPredicates + NumFCmpPredicates; | ||
|
||
Vocabulary() = default; | ||
LLVM_ABI Vocabulary(VocabVector &&Vocab) : Vocab(std::move(Vocab)) {} | ||
LLVM_ABI Vocabulary(VocabStorage &&Storage) : Storage(std::move(Storage)) {} | ||
|
||
Vocabulary(const Vocabulary &) = delete; | ||
Vocabulary &operator=(const Vocabulary &) = delete; | ||
|
||
Vocabulary(Vocabulary &&) = default; | ||
Vocabulary &operator=(Vocabulary &&Other); | ||
|
||
LLVM_ABI bool isValid() const { | ||
return Storage.size() == NumCanonicalEntries; | ||
} | ||
|
||
LLVM_ABI bool isValid() const { return Vocab.size() == NumCanonicalEntries; }; | ||
LLVM_ABI unsigned getDimension() const; | ||
/// Total number of entries (opcodes + canonicalized types + operand kinds + | ||
/// predicates) | ||
|
@@ -251,12 +336,11 @@ class Vocabulary { | |
/// Function to get vocabulary key for a given predicate | ||
LLVM_ABI static StringRef getVocabKeyForPredicate(CmpInst::Predicate P); | ||
|
||
/// Functions to return the slot index or position of a given Opcode, TypeID, | ||
/// or OperandKind in the vocabulary. | ||
LLVM_ABI static unsigned getSlotIndex(unsigned Opcode); | ||
LLVM_ABI static unsigned getSlotIndex(Type::TypeID TypeID); | ||
LLVM_ABI static unsigned getSlotIndex(const Value &Op); | ||
LLVM_ABI static unsigned getSlotIndex(CmpInst::Predicate P); | ||
/// Functions to return flat index | ||
LLVM_ABI static unsigned getIndex(unsigned Opcode); | ||
LLVM_ABI static unsigned getIndex(Type::TypeID TypeID); | ||
LLVM_ABI static unsigned getIndex(const Value &Op); | ||
LLVM_ABI static unsigned getIndex(CmpInst::Predicate P); | ||
|
||
/// Accessors to get the embedding for a given entity. | ||
LLVM_ABI const ir2vec::Embedding &operator[](unsigned Opcode) const; | ||
|
@@ -265,34 +349,29 @@ class Vocabulary { | |
LLVM_ABI const ir2vec::Embedding &operator[](CmpInst::Predicate P) const; | ||
|
||
/// Const Iterator type aliases | ||
using const_iterator = VocabVector::const_iterator; | ||
using const_iterator = VocabStorage::const_iterator; | ||
|
||
const_iterator begin() const { | ||
assert(isValid() && "IR2Vec Vocabulary is invalid"); | ||
return Vocab.begin(); | ||
return Storage.begin(); | ||
} | ||
|
||
const_iterator cbegin() const { | ||
assert(isValid() && "IR2Vec Vocabulary is invalid"); | ||
return Vocab.cbegin(); | ||
} | ||
const_iterator cbegin() const { return begin(); } | ||
|
||
const_iterator end() const { | ||
assert(isValid() && "IR2Vec Vocabulary is invalid"); | ||
return Vocab.end(); | ||
return Storage.end(); | ||
} | ||
|
||
const_iterator cend() const { | ||
assert(isValid() && "IR2Vec Vocabulary is invalid"); | ||
return Vocab.cend(); | ||
} | ||
const_iterator cend() const { return end(); } | ||
|
||
/// Returns the string key for a given index position in the vocabulary. | ||
/// This is useful for debugging or printing the vocabulary. Do not use this | ||
/// for embedding generation as string based lookups are inefficient. | ||
LLVM_ABI static StringRef getStringKey(unsigned Pos); | ||
|
||
/// Create a dummy vocabulary for testing purposes. | ||
LLVM_ABI static VocabVector createDummyVocabForTest(unsigned Dim = 1); | ||
LLVM_ABI static VocabStorage createDummyVocabForTest(unsigned Dim = 1); | ||
|
||
LLVM_ABI bool invalidate(Module &M, const PreservedAnalyses &PA, | ||
ModuleAnalysisManager::Invalidator &Inv) const; | ||
|
@@ -301,12 +380,16 @@ class Vocabulary { | |
constexpr static unsigned NumCanonicalEntries = | ||
MaxOpcodes + MaxCanonicalTypeIDs + MaxOperandKinds + MaxPredicateKinds; | ||
|
||
// Base offsets for slot layout to simplify index computation | ||
// Base offsets for flat index computation | ||
constexpr static unsigned OperandBaseOffset = | ||
MaxOpcodes + MaxCanonicalTypeIDs; | ||
constexpr static unsigned PredicateBaseOffset = | ||
OperandBaseOffset + MaxOperandKinds; | ||
|
||
/// Functions for predicate index calculations | ||
static unsigned getPredicateLocalIndex(CmpInst::Predicate P); | ||
static CmpInst::Predicate getPredicateFromLocalIndex(unsigned LocalIndex); | ||
|
||
/// String mappings for CanonicalTypeID values | ||
static constexpr StringLiteral CanonicalTypeNames[] = { | ||
"FloatTy", "VoidTy", "LabelTy", "MetadataTy", | ||
|
@@ -452,22 +535,22 @@ class LLVM_ABI FlowAwareEmbedder : public Embedder { | |
/// mapping between an entity of the IR (like opcode, type, argument, etc.) and | ||
/// its corresponding embedding. | ||
class IR2VecVocabAnalysis : public AnalysisInfoMixin<IR2VecVocabAnalysis> { | ||
using VocabVector = std::vector<ir2vec::Embedding>; | ||
using VocabMap = std::map<std::string, ir2vec::Embedding>; | ||
VocabMap OpcVocab, TypeVocab, ArgVocab; | ||
VocabVector Vocab; | ||
std::optional<ir2vec::VocabStorage> Vocab; | ||
|
||
Error readVocabulary(); | ||
Error readVocabulary(VocabMap &OpcVocab, VocabMap &TypeVocab, | ||
VocabMap &ArgVocab); | ||
Error parseVocabSection(StringRef Key, const json::Value &ParsedVocabValue, | ||
VocabMap &TargetVocab, unsigned &Dim); | ||
void generateNumMappedVocab(); | ||
void generateVocabStorage(VocabMap &OpcVocab, VocabMap &TypeVocab, | ||
VocabMap &ArgVocab); | ||
void emitError(Error Err, LLVMContext &Ctx); | ||
|
||
public: | ||
LLVM_ABI static AnalysisKey Key; | ||
IR2VecVocabAnalysis() = default; | ||
LLVM_ABI explicit IR2VecVocabAnalysis(const VocabVector &Vocab); | ||
LLVM_ABI explicit IR2VecVocabAnalysis(VocabVector &&Vocab); | ||
LLVM_ABI explicit IR2VecVocabAnalysis(ir2vec::VocabStorage &&Vocab) | ||
: Vocab(std::move(Vocab)) {} | ||
using Result = ir2vec::Vocabulary; | ||
LLVM_ABI Result run(Module &M, ModuleAnalysisManager &MAM); | ||
}; | ||
|
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
if you don't need operator=, delete it. that also lets you have internals const-ed.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
same for the move ctor.