Spaces:

hehe9801209
/

wenchuang_new

Build error

wenchuang_new / lora_util.py

hehe

all

1fbfa6f 7 months ago

19.2 kB

	'''
	Borrowed and modified from sd-scripts, publicly available at
	https://github.com/kohya-ss/sd-scripts/blob/main/library/model_util.py
	'''

	from diffusers import UNet2DConditionModel

	# Model parameters of Stable Diffusion, expressed for Diffusers.

	# Noise-schedule values; no function in the visible part of this file reads them.
	NUM_TRAIN_TIMESTEPS = 1000
	BETA_START = 0.00085
	BETA_END = 0.0120

	# UNet parameters; all of the active ones are read by create_unet_diffusers_config().
	# MODEL_CHANNELS is multiplied by each CHANNEL_MULT entry to get block_out_channels.
	UNET_PARAMS_MODEL_CHANNELS = 320
	UNET_PARAMS_CHANNEL_MULT = [1, 2, 4, 4]
	# A level uses a cross-attention block when its running resolution factor is in this list.
	UNET_PARAMS_ATTENTION_RESOLUTIONS = [4, 2, 1]
	UNET_PARAMS_IMAGE_SIZE = 64 # fixed from old invalid value `32`
	UNET_PARAMS_IN_CHANNELS = 4
	UNET_PARAMS_OUT_CHANNELS = 4
	UNET_PARAMS_NUM_RES_BLOCKS = 2
	UNET_PARAMS_CONTEXT_DIM = 768
	# Passed as `attention_head_dim` in the non-v2 config.
	UNET_PARAMS_NUM_HEADS = 8
	# UNET_PARAMS_USE_LINEAR_PROJECTION = False

	# VAE parameters; no function in the visible part of this file reads them.
	VAE_PARAMS_Z_CHANNELS = 4
	VAE_PARAMS_RESOLUTION = 256
	VAE_PARAMS_IN_CHANNELS = 3
	VAE_PARAMS_OUT_CH = 3
	VAE_PARAMS_CH = 128
	VAE_PARAMS_CH_MULT = [1, 2, 4, 4]
	VAE_PARAMS_NUM_RES_BLOCKS = 2

	# V2: values that replace the v1 `attention_head_dim` / `cross_attention_dim`
	# in create_unet_diffusers_config() when `v2` is truthy.
	V2_UNET_PARAMS_ATTENTION_HEAD_DIM = [5, 10, 20, 20]
	V2_UNET_PARAMS_CONTEXT_DIM = 1024
	# V2_UNET_PARAMS_USE_LINEAR_PROJECTION = True


	def shave_segments(path, n_shave_prefix_segments=1):
	    """
	    Drop dot-separated segments from one end of ``path``.

	    A non-negative ``n_shave_prefix_segments`` discards that many leading
	    segments; a negative value discards that many trailing segments.
	    The remaining segments are re-joined with ".".
	    """
	    segments = path.split(".")
	    if n_shave_prefix_segments < 0:
	        kept = segments[:n_shave_prefix_segments]
	    else:
	        kept = segments[n_shave_prefix_segments:]
	    return ".".join(kept)


	def renew_resnet_paths(old_list, n_shave_prefix_segments=0):
	    """
	    Updates paths inside resnets to the new naming scheme (local renaming).

	    Each entry of ``old_list`` has its LDM resnet sub-module names replaced,
	    then ``n_shave_prefix_segments`` segments are removed via
	    ``shave_segments``. Returns a list of ``{"old": ..., "new": ...}`` dicts
	    in input order.
	    """
	    # (LDM name, new name); applied in this order to every path.
	    renames = (
	        ("in_layers.0", "norm1"),
	        ("in_layers.2", "conv1"),
	        ("out_layers.0", "norm2"),
	        ("out_layers.3", "conv2"),
	        ("emb_layers.1", "time_emb_proj"),
	        ("skip_connection", "conv_shortcut"),
	    )

	    mapping = []
	    for old_item in old_list:
	        renamed = old_item
	        for ldm_name, new_name in renames:
	            renamed = renamed.replace(ldm_name, new_name)
	        renamed = shave_segments(renamed, n_shave_prefix_segments=n_shave_prefix_segments)
	        mapping.append({"old": old_item, "new": renamed})
	    return mapping


	def renew_attention_paths(old_list, n_shave_prefix_segments=0):
	    """
	    Updates paths inside attentions to the new naming scheme (local renaming).

	    No local renaming is applied: every path maps to itself.
	    ``n_shave_prefix_segments`` is accepted but not used.
	    """
	    return [{"old": item, "new": item} for item in old_list]


	def assign_to_checkpoint(
	    paths, checkpoint, old_checkpoint, attention_paths_to_split=None, additional_replacements=None, config=None
	):
	    """
	    Final conversion step: copy locally renamed weights from ``old_checkpoint``
	    into ``checkpoint`` under their globally renamed keys.

	    ``paths`` is a list of ``{"old": ..., "new": ...}`` dicts. When
	    ``attention_paths_to_split`` is given, it maps a key of ``old_checkpoint``
	    to a dict holding the destination keys "query", "key" and "value"; that
	    fused tensor is split into three parts first, and ``config`` must then
	    provide "num_head_channels". ``additional_replacements`` is an optional
	    list of ``{"old": ..., "new": ...}`` substring replacements applied after
	    the built-in ``middle_block`` renames.

	    ``checkpoint`` is modified in place; nothing is returned.
	    """
	    assert isinstance(paths, list), "Paths should be a list of dicts containing 'old' and 'new' keys."

	    split_requested = attention_paths_to_split is not None

	    # Splits the attention layers into three variables.
	    if split_requested:
	        for fused_key, qkv_keys in attention_paths_to_split.items():
	            fused = old_checkpoint[fused_key]
	            channels = fused.shape[0] // 3

	            if len(fused.shape) == 3:
	                target_shape = (-1, channels)
	            else:
	                target_shape = -1

	            num_heads = fused.shape[0] // config["num_head_channels"] // 3

	            per_head = fused.reshape((num_heads, 3 * channels // num_heads) + fused.shape[1:])
	            query, key, value = per_head.split(channels // num_heads, dim=1)

	            checkpoint[qkv_keys["query"]] = query.reshape(target_shape)
	            checkpoint[qkv_keys["key"]] = key.reshape(target_shape)
	            checkpoint[qkv_keys["value"]] = value.reshape(target_shape)

	    # Global renaming of the middle block, applied in this order.
	    middle_renames = (
	        ("middle_block.0", "mid_block.resnets.0"),
	        ("middle_block.1", "mid_block.attentions.0"),
	        ("middle_block.2", "mid_block.resnets.1"),
	    )
	    extra_renames = additional_replacements if additional_replacements is not None else ()

	    for entry in paths:
	        target = entry["new"]

	        # These have already been assigned by the split above.
	        if split_requested and target in attention_paths_to_split:
	            continue

	        for before, after in middle_renames:
	            target = target.replace(before, after)
	        for replacement in extra_renames:
	            target = target.replace(replacement["old"], replacement["new"])

	        weight = old_checkpoint[entry["old"]]
	        # proj_attn.weight has to be converted from conv 1D to linear
	        if "proj_attn.weight" in target:
	            weight = weight[:, :, 0]
	        checkpoint[target] = weight


	def linear_transformer_to_conv(checkpoint):
	    """
	    In place, append two singleton dimensions (``unsqueeze(2)`` twice) to every
	    2-D entry of ``checkpoint`` whose key ends in the two segments
	    ``proj_in.weight`` or ``proj_out.weight``. Other entries are untouched.
	    """
	    projection_names = ("proj_in.weight", "proj_out.weight")
	    for name in list(checkpoint):
	        last_two_segments = ".".join(name.rsplit(".", 2)[-2:])
	        if last_two_segments not in projection_names:
	            continue
	        weight = checkpoint[name]
	        if weight.ndim == 2:
	            checkpoint[name] = weight.unsqueeze(2).unsqueeze(2)


	def conv_transformer_to_linear(checkpoint):
	    """
	    In place, reduce every entry of ``checkpoint`` whose key ends in the two
	    segments ``proj_in.weight`` or ``proj_out.weight`` and that has more than
	    two dimensions to ``weight[:, :, 0, 0]``. Other entries are untouched.
	    """
	    projection_names = ("proj_in.weight", "proj_out.weight")
	    for name in list(checkpoint):
	        last_two_segments = ".".join(name.rsplit(".", 2)[-2:])
	        if last_two_segments not in projection_names:
	            continue
	        weight = checkpoint[name]
	        if weight.ndim > 2:
	            checkpoint[name] = weight[:, :, 0, 0]


	def create_unet_diffusers_config(v2, use_linear_projection_in_v2=False):
	    """
	    Build the keyword-argument dict for a Diffusers UNet from the module-level
	    ``UNET_PARAMS_*`` / ``V2_UNET_PARAMS_*`` constants.

	    ``v2`` selects the V2 cross-attention dim and attention head dims.
	    ``use_linear_projection`` is added (as True) only when both ``v2`` and
	    ``use_linear_projection_in_v2`` are truthy.
	    """
	    block_out_channels = tuple(UNET_PARAMS_MODEL_CHANNELS * mult for mult in UNET_PARAMS_CHANNEL_MULT)
	    num_levels = len(block_out_channels)

	    # Walk down: the resolution factor doubles after every level but the last.
	    resolution = 1
	    down_block_types = []
	    for level in range(num_levels):
	        with_attention = resolution in UNET_PARAMS_ATTENTION_RESOLUTIONS
	        down_block_types.append("CrossAttnDownBlock2D" if with_attention else "DownBlock2D")
	        if level < num_levels - 1:
	            resolution *= 2

	    # Walk back up from the deepest factor, halving after every level.
	    up_block_types = []
	    for _ in range(num_levels):
	        with_attention = resolution in UNET_PARAMS_ATTENTION_RESOLUTIONS
	        up_block_types.append("CrossAttnUpBlock2D" if with_attention else "UpBlock2D")
	        resolution //= 2

	    if v2:
	        cross_attention_dim = V2_UNET_PARAMS_CONTEXT_DIM
	        attention_head_dim = V2_UNET_PARAMS_ATTENTION_HEAD_DIM
	    else:
	        cross_attention_dim = UNET_PARAMS_CONTEXT_DIM
	        attention_head_dim = UNET_PARAMS_NUM_HEADS

	    config = {
	        "sample_size": UNET_PARAMS_IMAGE_SIZE,
	        "in_channels": UNET_PARAMS_IN_CHANNELS,
	        "out_channels": UNET_PARAMS_OUT_CHANNELS,
	        "down_block_types": tuple(down_block_types),
	        "up_block_types": tuple(up_block_types),
	        "block_out_channels": block_out_channels,
	        "layers_per_block": UNET_PARAMS_NUM_RES_BLOCKS,
	        "cross_attention_dim": cross_attention_dim,
	        "attention_head_dim": attention_head_dim,
	    }
	    if v2 and use_linear_projection_in_v2:
	        config["use_linear_projection"] = True

	    return config


	def convert_ldm_unet_checkpoint(v2, checkpoint, config):
	    """
	    Takes a state dict and a config, and returns a converted checkpoint.

	    Every entry of ``checkpoint`` whose key starts with
	    ``model.diffusion_model.`` is popped out of ``checkpoint`` (the caller's
	    dict is mutated) and stored in a new dict under Diffusers UNet key names.

	    Args:
	        v2: when truthy and ``config`` does not enable
	            ``use_linear_projection``, the result is passed through
	            ``linear_transformer_to_conv`` before being returned.
	        checkpoint: mapping of LDM parameter names to weights.
	        config: mapping providing "layers_per_block" and, optionally,
	            "use_linear_projection".

	    Returns:
	        A new dict holding the re-keyed UNet weights.

	    Raises:
	        KeyError: if an LDM key this function looks up is missing.
	    """

	    # extract state_dict for UNet
	    unet_key = "model.diffusion_model."
	    unet_state_dict = {}
	    for key in list(checkpoint.keys()):
	        if key.startswith(unet_key):
	            # Strip only the leading prefix; str.replace would also delete any
	            # later occurrence of the same text inside the key.
	            unet_state_dict[key[len(unet_key):]] = checkpoint.pop(key)

	    new_checkpoint = {}

	    # Keys that are renamed as a whole: (LDM name, Diffusers name).
	    fixed_renames = (
	        ("time_embed.0.weight", "time_embedding.linear_1.weight"),
	        ("time_embed.0.bias", "time_embedding.linear_1.bias"),
	        ("time_embed.2.weight", "time_embedding.linear_2.weight"),
	        ("time_embed.2.bias", "time_embedding.linear_2.bias"),
	        ("input_blocks.0.0.weight", "conv_in.weight"),
	        ("input_blocks.0.0.bias", "conv_in.bias"),
	        ("out.0.weight", "conv_norm_out.weight"),
	        ("out.0.bias", "conv_norm_out.bias"),
	        ("out.2.weight", "conv_out.weight"),
	        ("out.2.bias", "conv_out.bias"),
	    )
	    for ldm_name, diffusers_name in fixed_renames:
	        new_checkpoint[diffusers_name] = unet_state_dict[ldm_name]

	    # Retrieves the keys for the input blocks only
	    num_input_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "input_blocks" in layer})
	    input_blocks = {
	        layer_id: [key for key in unet_state_dict if f"input_blocks.{layer_id}." in key]
	        for layer_id in range(num_input_blocks)
	    }

	    # Retrieves the keys for the middle blocks only
	    num_middle_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "middle_block" in layer})
	    middle_blocks = {
	        layer_id: [key for key in unet_state_dict if f"middle_block.{layer_id}." in key]
	        for layer_id in range(num_middle_blocks)
	    }

	    # Retrieves the keys for the output blocks only
	    num_output_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "output_blocks" in layer})
	    output_blocks = {
	        layer_id: [key for key in unet_state_dict if f"output_blocks.{layer_id}." in key]
	        for layer_id in range(num_output_blocks)
	    }

	    # Number of LDM blocks that fold into one Diffusers down/up block.
	    blocks_per_level = config["layers_per_block"] + 1

	    # input_blocks.0 is conv_in (handled above), so start at 1.
	    for i in range(1, num_input_blocks):
	        block_id = (i - 1) // blocks_per_level
	        layer_in_block_id = (i - 1) % blocks_per_level

	        resnets = [key for key in input_blocks[i] if f"input_blocks.{i}.0" in key and f"input_blocks.{i}.0.op" not in key]
	        attentions = [key for key in input_blocks[i] if f"input_blocks.{i}.1" in key]

	        if f"input_blocks.{i}.0.op.weight" in unet_state_dict:
	            new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.weight"] = unet_state_dict.pop(
	                f"input_blocks.{i}.0.op.weight"
	            )
	            new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.bias"] = unet_state_dict.pop(
	                f"input_blocks.{i}.0.op.bias"
	            )

	        paths = renew_resnet_paths(resnets)
	        meta_path = {"old": f"input_blocks.{i}.0", "new": f"down_blocks.{block_id}.resnets.{layer_in_block_id}"}
	        assign_to_checkpoint(paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config)

	        if attentions:
	            paths = renew_attention_paths(attentions)
	            meta_path = {"old": f"input_blocks.{i}.1", "new": f"down_blocks.{block_id}.attentions.{layer_in_block_id}"}
	            assign_to_checkpoint(paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config)

	    resnet_0 = middle_blocks[0]
	    attentions = middle_blocks[1]
	    resnet_1 = middle_blocks[2]

	    resnet_0_paths = renew_resnet_paths(resnet_0)
	    assign_to_checkpoint(resnet_0_paths, new_checkpoint, unet_state_dict, config=config)

	    resnet_1_paths = renew_resnet_paths(resnet_1)
	    assign_to_checkpoint(resnet_1_paths, new_checkpoint, unet_state_dict, config=config)

	    attentions_paths = renew_attention_paths(attentions)
	    meta_path = {"old": "middle_block.1", "new": "mid_block.attentions.0"}
	    assign_to_checkpoint(attentions_paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config)

	    upsampler_names = ["conv.bias", "conv.weight"]

	    for i in range(num_output_blocks):
	        block_id = i // blocks_per_level
	        layer_in_block_id = i % blocks_per_level
	        output_block_layers = [shave_segments(name, 2) for name in output_blocks[i]]

	        # Group the remaining name of every key by its sub-module id (first segment).
	        output_block_list = {}
	        for layer in output_block_layers:
	            layer_id, layer_name = layer.split(".")[0], shave_segments(layer, 1)
	            output_block_list.setdefault(layer_id, []).append(layer_name)

	        if len(output_block_list) > 1:
	            resnets = [key for key in output_blocks[i] if f"output_blocks.{i}.0" in key]
	            attentions = [key for key in output_blocks[i] if f"output_blocks.{i}.1" in key]

	            paths = renew_resnet_paths(resnets)

	            meta_path = {"old": f"output_blocks.{i}.0", "new": f"up_blocks.{block_id}.resnets.{layer_in_block_id}"}
	            assign_to_checkpoint(paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config)

	            # Original:
	            # if ["conv.weight", "conv.bias"] in output_block_list.values():
	            #     index = list(output_block_list.values()).index(["conv.weight", "conv.bias"])

	            # Sort so the check does not depend on the order of bias and weight.
	            for names in output_block_list.values():
	                names.sort()

	            # Use the sub-module id parsed from the key rather than the position of
	            # the group in the dict: the position equals the id only when the
	            # checkpoint keys arrive ordered by sub-module id.
	            upsampler_id = next(
	                (layer_id for layer_id, names in output_block_list.items() if names == upsampler_names),
	                None,
	            )
	            if upsampler_id is not None:
	                new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.bias"] = unet_state_dict[
	                    f"output_blocks.{i}.{upsampler_id}.conv.bias"
	                ]
	                new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = unet_state_dict[
	                    f"output_blocks.{i}.{upsampler_id}.conv.weight"
	                ]

	                # Clear attentions as they have been attributed above.
	                if len(attentions) == 2:
	                    attentions = []

	            if attentions:
	                paths = renew_attention_paths(attentions)
	                meta_path = {
	                    "old": f"output_blocks.{i}.1",
	                    "new": f"up_blocks.{block_id}.attentions.{layer_in_block_id}",
	                }
	                assign_to_checkpoint(paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config)
	        else:
	            # Single sub-module: a resnet only.
	            resnet_0_paths = renew_resnet_paths(output_block_layers, n_shave_prefix_segments=1)
	            for path in resnet_0_paths:
	                old_path = ".".join(["output_blocks", str(i), path["old"]])
	                new_path = ".".join(["up_blocks", str(block_id), "resnets", str(layer_in_block_id), path["new"]])

	                new_checkpoint[new_path] = unet_state_dict[old_path]

	    # In SD v2 the 1x1 conv2d was changed to linear.
	    # The Diffusers side was mistakenly left as conv2d, so a conversion is needed.
	    if v2 and not config.get('use_linear_projection', False):
	        linear_transformer_to_conv(new_checkpoint)

	    return new_checkpoint


	def convert_unet_state_dict_to_sd(unet_state_dict, v2=False):
	    """
	    Re-key a Diffusers UNet state dict to Stable Diffusion (LDM) names.

	    Returns a new dict; ``unet_state_dict`` itself is not modified. The ten
	    whole-key Diffusers names listed in ``unet_conversion_map`` are always
	    looked up, so a state dict missing any of them raises ``KeyError``.
	    When ``v2`` is truthy the result is additionally passed through
	    ``conv_transformer_to_linear``.
	    """
	    # (stable-diffusion, HF Diffusers): keys renamed as a whole
	    unet_conversion_map = [
	        ("time_embed.0.weight", "time_embedding.linear_1.weight"),
	        ("time_embed.0.bias", "time_embedding.linear_1.bias"),
	        ("time_embed.2.weight", "time_embedding.linear_2.weight"),
	        ("time_embed.2.bias", "time_embedding.linear_2.bias"),
	        ("input_blocks.0.0.weight", "conv_in.weight"),
	        ("input_blocks.0.0.bias", "conv_in.bias"),
	        ("out.0.weight", "conv_norm_out.weight"),
	        ("out.0.bias", "conv_norm_out.bias"),
	        ("out.2.weight", "conv_out.weight"),
	        ("out.2.bias", "conv_out.bias"),
	    ]

	    # (stable-diffusion, HF Diffusers): substrings rewritten in keys containing "resnets"
	    unet_conversion_map_resnet = [
	        ("in_layers.0", "norm1"),
	        ("in_layers.2", "conv1"),
	        ("out_layers.0", "norm2"),
	        ("out_layers.3", "conv2"),
	        ("emb_layers.1", "time_emb_proj"),
	        ("skip_connection", "conv_shortcut"),
	    ]

	    # (stable-diffusion, HF Diffusers): block prefixes, generated in a fixed order
	    unet_conversion_map_layer = []
	    add_layer = unet_conversion_map_layer.append
	    for block in range(4):
	        # resnets/attentions of the down blocks
	        for sub in range(2):
	            sd_index = 3 * block + sub + 1
	            add_layer((f"input_blocks.{sd_index}.0.", f"down_blocks.{block}.resnets.{sub}."))
	            if block < 3:
	                # no attention layers in down_blocks.3
	                add_layer((f"input_blocks.{sd_index}.1.", f"down_blocks.{block}.attentions.{sub}."))

	        # resnets/attentions of the up blocks
	        for sub in range(3):
	            sd_index = 3 * block + sub
	            add_layer((f"output_blocks.{sd_index}.0.", f"up_blocks.{block}.resnets.{sub}."))
	            if block > 0:
	                # no attention layers in up_blocks.0
	                add_layer((f"output_blocks.{sd_index}.1.", f"up_blocks.{block}.attentions.{sub}."))

	        if block < 3:
	            # no downsample in down_blocks.3
	            add_layer((f"input_blocks.{3 * (block + 1)}.0.op.", f"down_blocks.{block}.downsamplers.0.conv."))
	            # no upsample in up_blocks.3
	            upsample_slot = 1 if block == 0 else 2
	            add_layer((f"output_blocks.{3 * block + 2}.{upsample_slot}.", f"up_blocks.{block}.upsamplers.0."))

	    add_layer(("middle_block.1.", "mid_block.attentions.0."))
	    for sub in range(2):
	        add_layer((f"middle_block.{2 * sub}.", f"mid_block.resnets.{sub}."))

	    # buyer beware: this is a brittle function,
	    # and correct output requires that the replacements below are applied
	    # in exactly this order: whole keys, then resnet parts, then prefixes.
	    mapping = {name: name for name in unet_state_dict}
	    mapping.update((hf_name, sd_name) for sd_name, hf_name in unet_conversion_map)

	    new_state_dict = {}
	    for hf_key, sd_key in mapping.items():
	        if "resnets" in hf_key:
	            for sd_part, hf_part in unet_conversion_map_resnet:
	                sd_key = sd_key.replace(hf_part, sd_part)
	        for sd_part, hf_part in unet_conversion_map_layer:
	            sd_key = sd_key.replace(hf_part, sd_part)
	        new_state_dict[sd_key] = unet_state_dict[hf_key]

	    if v2:
	        conv_transformer_to_linear(new_state_dict)

	    return new_state_dict


	def get_diffusers_unet(unet=None, state_dict=None, v2=False):
	    """
	    Return a Diffusers UNet, optionally loaded with converted LDM weights.

	    When ``unet`` is None a new ``UNet2DConditionModel`` is built from
	    ``create_unet_diffusers_config`` and moved to "cpu". When ``state_dict``
	    is truthy it is converted with ``convert_ldm_unet_checkpoint`` and loaded
	    into the model, and the result of ``load_state_dict`` is printed.

	    NOTE(review): the original indentation of this file was lost; this
	    assumes the weight-loading step applies whether or not ``unet`` was
	    supplied by the caller. Confirm against the upstream source.
	    """
	    unet_config = create_unet_diffusers_config(v2, use_linear_projection_in_v2=False)

	    model = UNet2DConditionModel(**unet_config).to("cpu") if unet is None else unet
	    if not state_dict:
	        return model

	    diffusers_weights = convert_ldm_unet_checkpoint(v2, state_dict, unet_config)
	    load_report = model.load_state_dict(diffusers_weights)
	    print("loading diffusers u-net:", load_report)
	    return model