1Blip2ForConditionalGeneration(
2 (vision_model): Blip2VisionModel(
3 (embeddings): Blip2VisionEmbeddings(
4 (patch_embedding): Conv2d(3, 1408, kernel_size=(14, 14), stride=(14, 14))
5 )
6 (encoder): Blip2Encoder(
7 (layers): ModuleList(
8 (0-38): 39 x Blip2EncoderLayer(
9 (self_attn): Blip2Attention(
10 (qkv): Linear(in_features=1408, out_features=4224, bias=True)
11 (projection): Linear(in_features=1408, out_features=1408, bias=True)
12 )
13 (layer_norm1): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)
14 (mlp): Blip2MLP(
15 (activation_fn): GELUActivation()
16 (fc1): Linear(in_features=1408, out_features=6144, bias=True)
17 (fc2): Linear(in_features=6144, out_features=1408, bias=True)
18 )
19 (layer_norm2): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)
20 )
21 )
22 )
23 (post_layernorm): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)
24 )
25 (qformer): Blip2QFormerModel(
26 (layernorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
27 (dropout): Dropout(p=0.1, inplace=False)
28 (encoder): Blip2QFormerEncoder(
29 (layer): ModuleList(
30 (0): Blip2QFormerLayer(
31 (attention): Blip2QFormerAttention(
32 (attention): Blip2QFormerMultiHeadAttention(
33 (query): Linear(in_features=768, out_features=768, bias=True)
34 (key): Linear(in_features=768, out_features=768, bias=True)
35 (value): Linear(in_features=768, out_features=768, bias=True)
36 (dropout): Dropout(p=0.1, inplace=False)
37 )
38 (output): Blip2QFormerSelfOutput(
39 (dense): Linear(in_features=768, out_features=768, bias=True)
40 (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
41 (dropout): Dropout(p=0.1, inplace=False)
42 )
43 )
44 (crossattention): Blip2QFormerAttention(
45 (attention): Blip2QFormerMultiHeadAttention(
46 (query): Linear(in_features=768, out_features=768, bias=True)
47 (key): Linear(in_features=1408, out_features=768, bias=True)
48 (value): Linear(in_features=1408, out_features=768, bias=True)
49 (dropout): Dropout(p=0.1, inplace=False)
50 )
51 (output): Blip2QFormerSelfOutput(
52 (dense): Linear(in_features=768, out_features=768, bias=True)
53 (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
54 (dropout): Dropout(p=0.1, inplace=False)
55 )
56 )
57 (intermediate_query): Blip2QFormerIntermediate(
58 (dense): Linear(in_features=768, out_features=3072, bias=True)
59 (intermediate_act_fn): GELUActivation()
60 )
61 (output_query): Blip2QFormerOutput(
62 (dense): Linear(in_features=3072, out_features=768, bias=True)
63 (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
64 (dropout): Dropout(p=0.1, inplace=False)
65 )
66 )
67 (1): Blip2QFormerLayer(
68 (attention): Blip2QFormerAttention(
69 (attention): Blip2QFormerMultiHeadAttention(
70 (query): Linear(in_features=768, out_features=768, bias=True)
71 (key): Linear(in_features=768, out_features=768, bias=True)
72 (value): Linear(in_features=768, out_features=768, bias=True)
73 (dropout): Dropout(p=0.1, inplace=False)
74 )
75 (output): Blip2QFormerSelfOutput(
76 (dense): Linear(in_features=768, out_features=768, bias=True)
77 (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
78 (dropout): Dropout(p=0.1, inplace=False)
79 )
80 )
81 (intermediate_query): Blip2QFormerIntermediate(
82 (dense): Linear(in_features=768, out_features=3072, bias=True)
83 (intermediate_act_fn): GELUActivation()
84 )
85 (output_query): Blip2QFormerOutput(
86 (dense): Linear(in_features=3072, out_features=768, bias=True)
87 (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
88 (dropout): Dropout(p=0.1, inplace=False)
89 )
90 )
91 (2): Blip2QFormerLayer(
92 (attention): Blip2QFormerAttention(
93 (attention): Blip2QFormerMultiHeadAttention(
94 (query): Linear(in_features=768, out_features=768, bias=True)
95 (key): Linear(in_features=768, out_features=768, bias=True)
96 (value): Linear(in_features=768, out_features=768, bias=True)
97 (dropout): Dropout(p=0.1, inplace=False)
98 )
99 (output): Blip2QFormerSelfOutput(
100 (dense): Linear(in_features=768, out_features=768, bias=True)
101 (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
102 (dropout): Dropout(p=0.1, inplace=False)
103 )
104 )
105 (crossattention): Blip2QFormerAttention(
106 (attention): Blip2QFormerMultiHeadAttention(
107 (query): Linear(in_features=768, out_features=768, bias=True)
108 (key): Linear(in_features=1408, out_features=768, bias=True)
109 (value): Linear(in_features=1408, out_features=768, bias=True)
110 (dropout): Dropout(p=0.1, inplace=False)
111 )
112 (output): Blip2QFormerSelfOutput(
113 (dense): Linear(in_features=768, out_features=768, bias=True)
114 (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
115 (dropout): Dropout(p=0.1, inplace=False)
116 )
117 )
118 (intermediate_query): Blip2QFormerIntermediate(
119 (dense): Linear(in_features=768, out_features=3072, bias=True)
120 (intermediate_act_fn): GELUActivation()
121 )
122 (output_query): Blip2QFormerOutput(
123 (dense): Linear(in_features=3072, out_features=768, bias=True)
124 (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
125 (dropout): Dropout(p=0.1, inplace=False)
126 )
127 )
128 (3): Blip2QFormerLayer(
129 (attention): Blip2QFormerAttention(
130 (attention): Blip2QFormerMultiHeadAttention(
131 (query): Linear(in_features=768, out_features=768, bias=True)
132 (key): Linear(in_features=768, out_features=768, bias=True)
133 (value): Linear(in_features=768, out_features=768, bias=True)
134 (dropout): Dropout(p=0.1, inplace=False)
135 )
136 (output): Blip2QFormerSelfOutput(
137 (dense): Linear(in_features=768, out_features=768, bias=True)
138 (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
139 (dropout): Dropout(p=0.1, inplace=False)
140 )
141 )
142 (intermediate_query): Blip2QFormerIntermediate(
143 (dense): Linear(in_features=768, out_features=3072, bias=True)
144 (intermediate_act_fn): GELUActivation()
145 )
146 (output_query): Blip2QFormerOutput(
147 (dense): Linear(in_features=3072, out_features=768, bias=True)
148 (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
149 (dropout): Dropout(p=0.1, inplace=False)
150 )
151 )
152 (4): Blip2QFormerLayer(
153 (attention): Blip2QFormerAttention(
154 (attention): Blip2QFormerMultiHeadAttention(
155 (query): Linear(in_features=768, out_features=768, bias=True)
156 (key): Linear(in_features=768, out_features=768, bias=True)
157 (value): Linear(in_features=768, out_features=768, bias=True)
158 (dropout): Dropout(p=0.1, inplace=False)
159 )
160 (output): Blip2QFormerSelfOutput(
161 (dense): Linear(in_features=768, out_features=768, bias=True)
162 (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
163 (dropout): Dropout(p=0.1, inplace=False)
164 )
165 )
166 (crossattention): Blip2QFormerAttention(
167 (attention): Blip2QFormerMultiHeadAttention(
168 (query): Linear(in_features=768, out_features=768, bias=True)
169 (key): Linear(in_features=1408, out_features=768, bias=True)
170 (value): Linear(in_features=1408, out_features=768, bias=True)
171 (dropout): Dropout(p=0.1, inplace=False)
172 )
173 (output): Blip2QFormerSelfOutput(
174 (dense): Linear(in_features=768, out_features=768, bias=True)
175 (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
176 (dropout): Dropout(p=0.1, inplace=False)
177 )
178 )
179 (intermediate_query): Blip2QFormerIntermediate(
180 (dense): Linear(in_features=768, out_features=3072, bias=True)
181 (intermediate_act_fn): GELUActivation()
182 )
183 (output_query): Blip2QFormerOutput(
184 (dense): Linear(in_features=3072, out_features=768, bias=True)
185 (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
186 (dropout): Dropout(p=0.1, inplace=False)
187 )
188 )
189 (5): Blip2QFormerLayer(
190 (attention): Blip2QFormerAttention(
191 (attention): Blip2QFormerMultiHeadAttention(
192 (query): Linear(in_features=768, out_features=768, bias=True)
193 (key): Linear(in_features=768, out_features=768, bias=True)
194 (value): Linear(in_features=768, out_features=768, bias=True)
195 (dropout): Dropout(p=0.1, inplace=False)
196 )
197 (output): Blip2QFormerSelfOutput(
198 (dense): Linear(in_features=768, out_features=768, bias=True)
199 (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
200 (dropout): Dropout(p=0.1, inplace=False)
201 )
202 )
203 (intermediate_query): Blip2QFormerIntermediate(
204 (dense): Linear(in_features=768, out_features=3072, bias=True)
205 (intermediate_act_fn): GELUActivation()
206 )
207 (output_query): Blip2QFormerOutput(
208 (dense): Linear(in_features=3072, out_features=768, bias=True)
209 (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
210 (dropout): Dropout(p=0.1, inplace=False)
211 )
212 )
213 (6): Blip2QFormerLayer(
214 (attention): Blip2QFormerAttention(
215 (attention): Blip2QFormerMultiHeadAttention(
216 (query): Linear(in_features=768, out_features=768, bias=True)
217 (key): Linear(in_features=768, out_features=768, bias=True)
218 (value): Linear(in_features=768, out_features=768, bias=True)
219 (dropout): Dropout(p=0.1, inplace=False)
220 )
221 (output): Blip2QFormerSelfOutput(
222 (dense): Linear(in_features=768, out_features=768, bias=True)
223 (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
224 (dropout): Dropout(p=0.1, inplace=False)
225 )
226 )
227 (crossattention): Blip2QFormerAttention(
228 (attention): Blip2QFormerMultiHeadAttention(
229 (query): Linear(in_features=768, out_features=768, bias=True)
230 (key): Linear(in_features=1408, out_features=768, bias=True)
231 (value): Linear(in_features=1408, out_features=768, bias=True)
232 (dropout): Dropout(p=0.1, inplace=False)
233 )
234 (output): Blip2QFormerSelfOutput(
235 (dense): Linear(in_features=768, out_features=768, bias=True)
236 (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
237 (dropout): Dropout(p=0.1, inplace=False)
238 )
239 )
240 (intermediate_query): Blip2QFormerIntermediate(
241 (dense): Linear(in_features=768, out_features=3072, bias=True)
242 (intermediate_act_fn): GELUActivation()
243 )
244 (output_query): Blip2QFormerOutput(
245 (dense): Linear(in_features=3072, out_features=768, bias=True)
246 (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
247 (dropout): Dropout(p=0.1, inplace=False)
248 )
249 )
250 (7): Blip2QFormerLayer(
251 (attention): Blip2QFormerAttention(
252 (attention): Blip2QFormerMultiHeadAttention(
253 (query): Linear(in_features=768, out_features=768, bias=True)
254 (key): Linear(in_features=768, out_features=768, bias=True)
255 (value): Linear(in_features=768, out_features=768, bias=True)
256 (dropout): Dropout(p=0.1, inplace=False)
257 )
258 (output): Blip2QFormerSelfOutput(
259 (dense): Linear(in_features=768, out_features=768, bias=True)
260 (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
261 (dropout): Dropout(p=0.1, inplace=False)
262 )
263 )
264 (intermediate_query): Blip2QFormerIntermediate(
265 (dense): Linear(in_features=768, out_features=3072, bias=True)
266 (intermediate_act_fn): GELUActivation()
267 )
268 (output_query): Blip2QFormerOutput(
269 (dense): Linear(in_features=3072, out_features=768, bias=True)
270 (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
271 (dropout): Dropout(p=0.1, inplace=False)
272 )
273 )
274 (8): Blip2QFormerLayer(
275 (attention): Blip2QFormerAttention(
276 (attention): Blip2QFormerMultiHeadAttention(
277 (query): Linear(in_features=768, out_features=768, bias=True)
278 (key): Linear(in_features=768, out_features=768, bias=True)
279 (value): Linear(in_features=768, out_features=768, bias=True)
280 (dropout): Dropout(p=0.1, inplace=False)
281 )
282 (output): Blip2QFormerSelfOutput(
283 (dense): Linear(in_features=768, out_features=768, bias=True)
284 (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
285 (dropout): Dropout(p=0.1, inplace=False)
286 )
287 )
288 (crossattention): Blip2QFormerAttention(
289 (attention): Blip2QFormerMultiHeadAttention(
290 (query): Linear(in_features=768, out_features=768, bias=True)
291 (key): Linear(in_features=1408, out_features=768, bias=True)
292 (value): Linear(in_features=1408, out_features=768, bias=True)
293 (dropout): Dropout(p=0.1, inplace=False)
294 )
295 (output): Blip2QFormerSelfOutput(
296 (dense): Linear(in_features=768, out_features=768, bias=True)
297 (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
298 (dropout): Dropout(p=0.1, inplace=False)
299 )
300 )
301 (intermediate_query): Blip2QFormerIntermediate(
302 (dense): Linear(in_features=768, out_features=3072, bias=True)
303 (intermediate_act_fn): GELUActivation()
304 )
305 (output_query): Blip2QFormerOutput(
306 (dense): Linear(in_features=3072, out_features=768, bias=True)
307 (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
308 (dropout): Dropout(p=0.1, inplace=False)
309 )
310 )
311 (9): Blip2QFormerLayer(
312 (attention): Blip2QFormerAttention(
313 (attention): Blip2QFormerMultiHeadAttention(
314 (query): Linear(in_features=768, out_features=768, bias=True)
315 (key): Linear(in_features=768, out_features=768, bias=True)
316 (value): Linear(in_features=768, out_features=768, bias=True)
317 (dropout): Dropout(p=0.1, inplace=False)
318 )
319 (output): Blip2QFormerSelfOutput(
320 (dense): Linear(in_features=768, out_features=768, bias=True)
321 (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
322 (dropout): Dropout(p=0.1, inplace=False)
323 )
324 )
325 (intermediate_query): Blip2QFormerIntermediate(
326 (dense): Linear(in_features=768, out_features=3072, bias=True)
327 (intermediate_act_fn): GELUActivation()
328 )
329 (output_query): Blip2QFormerOutput(
330 (dense): Linear(in_features=3072, out_features=768, bias=True)
331 (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
332 (dropout): Dropout(p=0.1, inplace=False)
333 )
334 )
335 (10): Blip2QFormerLayer(
336 (attention): Blip2QFormerAttention(
337 (attention): Blip2QFormerMultiHeadAttention(
338 (query): Linear(in_features=768, out_features=768, bias=True)
339 (key): Linear(in_features=768, out_features=768, bias=True)
340 (value): Linear(in_features=768, out_features=768, bias=True)
341 (dropout): Dropout(p=0.1, inplace=False)
342 )
343 (output): Blip2QFormerSelfOutput(
344 (dense): Linear(in_features=768, out_features=768, bias=True)
345 (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
346 (dropout): Dropout(p=0.1, inplace=False)
347 )
348 )
349 (crossattention): Blip2QFormerAttention(
350 (attention): Blip2QFormerMultiHeadAttention(
351 (query): Linear(in_features=768, out_features=768, bias=True)
352 (key): Linear(in_features=1408, out_features=768, bias=True)
353 (value): Linear(in_features=1408, out_features=768, bias=True)
354 (dropout): Dropout(p=0.1, inplace=False)
355 )
356 (output): Blip2QFormerSelfOutput(
357 (dense): Linear(in_features=768, out_features=768, bias=True)
358 (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
359 (dropout): Dropout(p=0.1, inplace=False)
360 )
361 )
362 (intermediate_query): Blip2QFormerIntermediate(
363 (dense): Linear(in_features=768, out_features=3072, bias=True)
364 (intermediate_act_fn): GELUActivation()
365 )
366 (output_query): Blip2QFormerOutput(
367 (dense): Linear(in_features=3072, out_features=768, bias=True)
368 (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
369 (dropout): Dropout(p=0.1, inplace=False)
370 )
371 )
372 (11): Blip2QFormerLayer(
373 (attention): Blip2QFormerAttention(
374 (attention): Blip2QFormerMultiHeadAttention(
375 (query): Linear(in_features=768, out_features=768, bias=True)
376 (key): Linear(in_features=768, out_features=768, bias=True)
377 (value): Linear(in_features=768, out_features=768, bias=True)
378 (dropout): Dropout(p=0.1, inplace=False)
379 )
380 (output): Blip2QFormerSelfOutput(
381 (dense): Linear(in_features=768, out_features=768, bias=True)
382 (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
383 (dropout): Dropout(p=0.1, inplace=False)
384 )
385 )
386 (intermediate_query): Blip2QFormerIntermediate(
387 (dense): Linear(in_features=768, out_features=3072, bias=True)
388 (intermediate_act_fn): GELUActivation()
389 )
390 (output_query): Blip2QFormerOutput(
391 (dense): Linear(in_features=3072, out_features=768, bias=True)
392 (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
393 (dropout): Dropout(p=0.1, inplace=False)
394 )
395 )
396 )
397 )
398 )
399 (language_projection): Linear(in_features=768, out_features=2560, bias=True)
400 (language_model): OPTForCausalLM(
401 (model): OPTModel(
402 (decoder): OPTDecoder(
403 (embed_tokens): Embedding(50304, 2560, padding_idx=1)
404 (embed_positions): OPTLearnedPositionalEmbedding(2050, 2560)
405 (final_layer_norm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
406 (layers): ModuleList(
407 (0-31): 32 x OPTDecoderLayer(
408 (self_attn): OPTAttention(
409 (k_proj): Linear(in_features=2560, out_features=2560, bias=True)
410 (v_proj): Linear(in_features=2560, out_features=2560, bias=True)
411 (q_proj): Linear(in_features=2560, out_features=2560, bias=True)
412 (out_proj): Linear(in_features=2560, out_features=2560, bias=True)
413 )
414 (activation_fn): ReLU()
415 (self_attn_layer_norm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
416 (fc1): Linear(in_features=2560, out_features=10240, bias=True)
417 (fc2): Linear(in_features=10240, out_features=2560, bias=True)
418 (final_layer_norm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
419 )
420 )
421 )
422 )
423 (lm_head): Linear(in_features=2560, out_features=50304, bias=False)
424 )
425)