"""inference_qa_qwen.py: multiple-choice video QA inference with Qwen2.5-VL."""
import torch
from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info
import json
import os
from analyze_qa import analyze
# Base directory that holds locally downloaded checkpoints; the script joins
# it with the checkpoint folder name to form the model path.
CKPT_HOME = "."

# Hugging Face repo ids of Qwen2.5-VL instruct checkpoints.
# NOTE(review): not referenced by the code visible in this file; presumably a
# reference list of the variants the script is meant to be run with.
MODELS = [
    "Qwen/Qwen2.5-VL-3B-Instruct",
    "Qwen/Qwen2.5-VL-72B-Instruct",
    "Qwen/Qwen2.5-VL-7B-Instruct",
]
def build_prompt(question):
    """Return the instruction prompt for one multiple-choice question.

    Args:
        question: Mapping with a ``'question'`` string and an ``'options'``
            collection; both are interpolated verbatim into the prompt.

    Returns:
        The full text prompt sent to the model alongside the video.
    """
    # The wording (including "response with") is kept exactly as-is: editing
    # the prompt would change model outputs and break comparability with
    # results that were already written.
    return (
        "Carefully watch the video and pay attention to temporal dynamics in this video, "
        "focusing on the camera motions, actions, activities, and interactions. "
        "Based on your observations, select the best option that accurately addresses the question.\n"
        f"{question['question']}\n"
        f"You can only response with the answer among {question['options']}"
    )


def judge_answer(output_text, correct_answer, options):
    """Decide whether the model's free-form response selects the correct option.

    Matching is a case-insensitive substring test. A response is accepted when
    it contains the correct answer and does not contain any *other* option
    that itself contains the correct answer (e.g. correct "pan left" vs. the
    distractor "pan left and tilt up").

    Args:
        output_text: Decoded model response.
        correct_answer: Text of the correct option.
        options: All option strings offered for the question.

    Returns:
        True if the response is judged correct, otherwise False.
    """
    response = output_text.lower()
    target = correct_answer.lower()
    if target not in response:
        return False
    # Superset options are detected on lower-cased text so that this check
    # agrees with the case-insensitive matching against the response above.
    supersets = [
        lowered
        for lowered in (opt.lower() for opt in options)
        if lowered != target and target in lowered
    ]
    return not any(opt in response for opt in supersets)


def load_completed(output_filename):
    """Return the set of result keys already stored in a JSONL results file.

    Each non-blank line is parsed as a JSON object and its keys are collected.
    A missing file yields an empty set.

    Args:
        output_filename: Path of the JSONL results file.

    Returns:
        Set of keys (video basenames) found in the file.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    completed = set()
    if not os.path.exists(output_filename):
        return completed
    # Read with the same encoding the results are written with.
    with open(output_filename, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            completed.update(json.loads(line))
    return completed


def main():
    """Run video QA inference for every video and append judged results.

    Loads the checkpoint, reads ``video_perspective.json``, skips videos that
    already have a line in the output JSONL file, asks the model every
    question of each remaining video, scores the responses with
    ``judge_answer`` and finally calls ``analyze`` on the output file.
    """
    input_directory = "./test_videos"
    output_directory = "./output_qa"

    model_name = "Qwen/Qwen2.5-VL-7B-Instruct"
    checkpoint_name = model_name.split('/')[1]
    model_path = f"{CKPT_HOME}/{checkpoint_name}"
    os.makedirs(output_directory, exist_ok=True)
    output_filename = f"{output_directory}/{checkpoint_name}.jsonl"
    print("=======")
    print(f"model_path: {model_path}")

    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        model_path,
        torch_dtype=torch.bfloat16,
        attn_implementation="flash_attention_2",
        device_map="auto",
    )
    processor = AutoProcessor.from_pretrained(model_path)

    with open('video_perspective.json', 'rb') as f:
        data = json.load(f)

    completed = load_completed(output_filename)

    for item in data:
        video_name = item["video_name"]
        video_path = os.path.join(input_directory, video_name + '.mp4')
        # Results are keyed by the file's basename, so the resume check must
        # use the same key (it differs from video_name when the name contains
        # a directory component).
        basename, _ = os.path.splitext(os.path.basename(video_path))
        if basename in completed:
            continue

        value_list = []
        for question in item['questions']:
            task_type = question['task_type']
            correct_answer = question['correct_answer']
            options = question['options']
            prompt = build_prompt(question)
            messages = [
                {
                    "role": "user",
                    "content": [
                        {"type": "video", "video": video_path, "max_pixels": 360 * 420, "fps": 1.0},
                        {"type": "text", "text": prompt},
                    ],
                }
            ]
            text = processor.apply_chat_template(
                messages, tokenize=False, add_generation_prompt=True
            )
            image_inputs, video_inputs = process_vision_info(messages)
            inputs = processor(
                text=[text],
                images=image_inputs,
                videos=video_inputs,
                padding=True,
                return_tensors="pt",
            )
            inputs = inputs.to("cuda")

            generated_ids = model.generate(**inputs, max_new_tokens=256)
            # Drop the prompt tokens so only the newly generated part is decoded.
            generated_ids_trimmed = [
                out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
            ]
            output_text = processor.batch_decode(
                generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
            )[0]

            value_list.append({
                'task_type': task_type,
                'correct_answer': correct_answer,
                'output': output_text,
                'judge': judge_answer(output_text, correct_answer, options),
            })

        print("=======")
        print(basename)
        print(value_list)

        # Append one line per video right away so an interrupted run can resume.
        with open(output_filename, 'a', encoding='utf-8') as output_file:
            json.dump({basename: value_list}, output_file, ensure_ascii=False)
            output_file.write('\n')
        # Avoid re-running a video that is listed more than once in the input.
        completed.add(basename)

    analyze(output_filename)


if __name__ == "__main__":
    main()