Thanks to visit codestin.com
Credit goes to github.com

Skip to content

Improvements to tree parsing speed #3508

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Dec 1, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
65 changes: 50 additions & 15 deletions src/tree.c
Original file line number Diff line number Diff line change
Expand Up @@ -81,14 +81,26 @@ int git_tree_entry_icmp(const git_tree_entry *e1, const git_tree_entry *e2)
git__strncasecmp);
}

static git_tree_entry *alloc_entry(const char *filename)
/**
* Allocate either from the pool or from the system allocator
*/
static git_tree_entry *alloc_entry_base(git_pool *pool, const char *filename, size_t filename_len)
{
git_tree_entry *entry = NULL;
size_t filename_len = strlen(filename), tree_len;
size_t tree_len;

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we add a check here to make sure that filename_len fits in a uint16_t? I don't see any obvious vulnerabilities if one were to somehow build a tree that had a longer filename in it, but I'm also not the most creative person in the owrld.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Absolutely, I've pushed up a commit which does this and de-duplicates the size and overflow checks.

if (filename_len > UINT16_MAX) {
giterr_set(GITERR_INVALID, "tree entry is over UINT16_MAX in length");
return NULL;
}

if (GIT_ADD_SIZET_OVERFLOW(&tree_len, sizeof(git_tree_entry), filename_len) ||
GIT_ADD_SIZET_OVERFLOW(&tree_len, tree_len, 1) ||
!(entry = git__malloc(tree_len)))
GIT_ADD_SIZET_OVERFLOW(&tree_len, tree_len, 1))
return NULL;

entry = pool ? git_pool_malloc(pool, tree_len) :
git__malloc(tree_len);
if (!entry)
return NULL;

memset(entry, 0x0, sizeof(git_tree_entry));
Expand All @@ -99,9 +111,31 @@ static git_tree_entry *alloc_entry(const char *filename)
return entry;
}

/**
* Allocate a tree entry, using the poolin the tree which owns
* it. This is useful when reading trees, so we don't allocate a ton
* of small strings but can use the pool.
*/
static git_tree_entry *alloc_entry_pooled(git_pool *pool, const char *filename, size_t filename_len)
{
git_tree_entry *entry = NULL;

if (!(entry = alloc_entry_base(pool, filename, filename_len)))
return NULL;

entry->pooled = true;

return entry;
}

static git_tree_entry *alloc_entry(const char *filename)
{
return alloc_entry_base(NULL, filename, strlen(filename));
}

struct tree_key_search {
const char *filename;
size_t filename_len;
uint16_t filename_len;
};

static int homing_search_cmp(const void *key, const void *array_member)
Expand Down Expand Up @@ -198,7 +232,7 @@ static int tree_key_search(

void git_tree_entry_free(git_tree_entry *entry)
{
if (entry == NULL)
if (entry == NULL || entry->pooled)
return;

git__free(entry);
Expand Down Expand Up @@ -233,6 +267,7 @@ void git_tree__free(void *_tree)
git_tree_entry_free(e);

git_vector_free(&tree->entries);
git_pool_clear(&tree->pool);
git__free(tree);
}

Expand Down Expand Up @@ -385,11 +420,14 @@ int git_tree__parse(void *_tree, git_odb_object *odb_obj)
const char *buffer = git_odb_object_data(odb_obj);
const char *buffer_end = buffer + git_odb_object_size(odb_obj);

git_pool_init(&tree->pool, 1);
if (git_vector_init(&tree->entries, DEFAULT_TREE_SIZE, entry_sort_cmp) < 0)
return -1;

while (buffer < buffer_end) {
git_tree_entry *entry;
size_t filename_len;
const char *nul;
int attr;

if (git__strtol32(&attr, buffer, &buffer, 8) < 0 || !buffer)
Expand All @@ -398,26 +436,23 @@ int git_tree__parse(void *_tree, git_odb_object *odb_obj)
if (*buffer++ != ' ')
return tree_error("Failed to parse tree. Object is corrupted", NULL);

if (memchr(buffer, 0, buffer_end - buffer) == NULL)
if ((nul = memchr(buffer, 0, buffer_end - buffer)) == NULL)
return tree_error("Failed to parse tree. Object is corrupted", NULL);

filename_len = nul - buffer;
/** Allocate the entry and store it in the entries vector */
{
entry = alloc_entry(buffer);
entry = alloc_entry_pooled(&tree->pool, buffer, filename_len);
GITERR_CHECK_ALLOC(entry);

if (git_vector_insert(&tree->entries, entry) < 0) {
git__free(entry);
if (git_vector_insert(&tree->entries, entry) < 0)
return -1;
}

entry->attr = attr;
}

while (buffer < buffer_end && *buffer != 0)
buffer++;

buffer++;
/* Advance to the ID just after the path */
buffer += filename_len + 1;

git_oid_fromraw(&entry->oid, (const unsigned char *)buffer);
buffer += GIT_OID_RAWSZ;
Expand Down
7 changes: 5 additions & 2 deletions src/tree.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,17 +12,20 @@
#include "odb.h"
#include "vector.h"
#include "strmap.h"
#include "pool.h"

struct git_tree_entry {
uint16_t attr;
uint16_t filename_len;
git_oid oid;
size_t filename_len;
char filename[1];
bool pooled;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This mis-aligns the struct packing, which is not neat for pool allocations. I think we could very easily change filename_len to uint16_t and get a couple extra bytes to play with.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It does indeed bring sadness to alignment, I forgot to double-check after testing it out. It's probably enough to have a uint16_t as length for anything we would actually care to support, tbh (and lol if you actually need the 64 bits for your path length).

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I changed to 16-bit and avoided any padding (except for the final one due to the zero-length array). This brings the size down from 32 to 26 bytes. It doesn't seem to make a difference in speed one way or another.

char filename[GIT_FLEX_ARRAY];
};

struct git_tree {
git_object object;
git_vector entries;
git_pool pool;
};

struct git_treebuilder {
Expand Down