SDN VLAN VNets no longer work after an update from 9.1.9 to 9.2.2

Cookiefamily

Renowned Member
Jan 29, 2020
153
42
68
Germany
I was using a VLAN Zone with two VNets and it seemed to work well.

Yesterday evening I updated our staging environment to 9.2.2, and since then I can no longer get connectivity through those VNets. When I use the vmbr0 bridge directly and set the VLAN tag on the VM's network interface, it works fine.

This is the config:
Code:
root@node01:~# cat /etc/network/interfaces
# network interface settings; autogenerated
# Please do NOT modify this file directly, unless you know what
# you're doing.
#
# If you want to manage parts of the network configuration manually,
# please utilize the 'source' or 'source-directory' directives to do
# so.
# PVE will preserve these directives, but will NOT read its network
# configuration from sourced files, so do not attempt to move any of
# the PVE managed interfaces into external files!

auto lo
iface lo inet loopback

auto nic3
iface nic3 inet manual
#mgmt-B

auto nic8
iface nic8 inet manual
#frontend-A

auto nic9
iface nic9 inet manual
#mgmt-A

auto nic10
iface nic10 inet static
    address 172.16.91.11/24
#corosync-A ring0

auto nic0
iface nic0 inet manual
#ceph-A

auto nic1
iface nic1 inet manual
#ceph-B

auto nic2
iface nic2 inet manual
#frontend-B

auto nic4
iface nic4 inet static
    address 172.16.93.11/24
#corosync-B ring1

auto nic5
iface nic5 inet static
    address 10.10.10.141/25
    mtu 9000
#iscsi-B-1

iface nic6 inet manual

auto nic7
iface nic7 inet manual

iface idrac inet manual

auto nic11
iface nic11 inet static
    address 10.10.10.11/25
    mtu 9000
#iscsi-A-0

auto bond0
iface bond0 inet static
    address 172.16.90.11/24
    gateway 172.16.90.1
    bond-slaves nic3 nic9
    bond-miimon 100
    bond-mode active-backup
    bond-primary nic3
#Management

auto bond1
iface bond1 inet manual
    ovs_bonds nic2 nic8
    ovs_type OVSBond
    ovs_bridge vmbr0
    ovs_options lacp=active bond_mode=balance-tcp
#LACP Bond for Frontend

auto bond2
iface bond2 inet static
    address 172.16.92.11/24
    bond-slaves nic0 nic1
    bond-miimon 100
    bond-mode 802.3ad
    bond-xmit-hash-policy layer2+3
    mtu 9000
#LACP Bond for Ceph

auto bond0.60
iface bond0.60 inet static
    address 172.16.71.21/24
#For ESXi Migrations

auto vmbr0
iface vmbr0 inet manual
    ovs_type OVSBridge
    ovs_ports bond1
#Openvswitch Frontend

source /etc/network/interfaces.d/*

Code:
root@node01:~# cat /etc/network/interfaces.d/sdn
#version:16

auto ln_v0074
iface ln_v0074
    ovs_type OVSIntPort
    ovs_bridge vmbr0
    ovs_mtu 1500
    ovs_options tag=74

auto ln_v0083
iface ln_v0083
    ovs_type OVSIntPort
    ovs_bridge vmbr0
    ovs_mtu 1500
    ovs_options tag=83

auto ln_v0086
iface ln_v0086
    ovs_type OVSIntPort
    ovs_bridge vmbr0
    ovs_mtu 1500
    ovs_options tag=86

auto ln_v0087
iface ln_v0087
    ovs_type OVSIntPort
    ovs_bridge vmbr0
    ovs_mtu 1500
    ovs_options tag=87

auto ln_v0089
iface ln_v0089
    ovs_type OVSIntPort
    ovs_bridge vmbr0
    ovs_mtu 1500
    ovs_options tag=89

auto v0074
iface v0074
    bridge_ports ln_v0074
    bridge_stp off
    bridge_fd 0
    mtu 1500
    alias VLAN 74

auto v0083
iface v0083
    bridge_ports ln_v0083
    bridge_stp off
    bridge_fd 0
    mtu 1500
    alias VLAN 83

auto v0086
iface v0086
    bridge_ports ln_v0086
    bridge_stp off
    bridge_fd 0
    mtu 1500
    alias VLAN 86

auto v0087
iface v0087
    bridge_ports ln_v0087
    bridge_stp off
    bridge_fd 0
    mtu 1500
    alias VLAN 87

auto v0089
iface v0089
    bridge_ports ln_v0089
    bridge_stp off
    bridge_fd 0
    mtu 1500
    alias VLAN 89

auto vmbr0
iface vmbr0
    ovs_ports ln_v0074
    ovs_ports ln_v0083
    ovs_ports ln_v0086
    ovs_ports ln_v0087
    ovs_ports ln_v0089

Config of a VM that worked fine previously:
Code:
agent: 1
boot: order=scsi0;ide2;net0
cores: 4
cpu: x86-64-v2-AES
ide2: shared_iso:iso/proxmox-datacenter-manager_1.0-2.iso,media=cdrom,size=1445316K
memory: 4096
meta: creation-qemu=10.1.2,ctime=1778599956
name: pdm02
net0: virtio=BC:24:11:B8:B0:3A,bridge=v0074,mtu=1500
numa: 0
ostype: l26
scsi0: ceph01_vm:vm-131-disk-0,cache=writeback,discard=on,iothread=1,size=50G,ssd=1
scsihw: virtio-scsi-single
smbios1: uuid=9c774ce7-60cb-4555-b6cc-824d4cd5fa0f
sockets: 1
vmgenid: cbcf7176-5b08-470b-8270-65a4d77d5d9b

I already tried downgrading to an older Kernel since this was also the upgrade to Kernel 7.0 for me, that did not help.
Code:
# pveversion --verbose
proxmox-ve: 9.2.0 (running kernel: 7.0.2-6-pve)
pve-manager: 9.2.2 (running version: 9.2.2/b9984c6d90a4bd80)
proxmox-kernel-helper: 9.2.0
proxmox-kernel-7.0: 7.0.2-6
proxmox-kernel-7.0.2-6-pve-signed: 7.0.2-6
proxmox-kernel-6.17: 6.17.13-11
proxmox-kernel-6.17.13-11-pve-signed: 6.17.13-11
proxmox-kernel-6.17.13-2-pve-signed: 6.17.13-2
proxmox-kernel-6.17.2-1-pve-signed: 6.17.2-1
amd64-microcode: 3.20251202.1~bpo13+1
ceph: 19.2.3-pve4
ceph-fuse: 19.2.3-pve4
corosync: 3.1.10-pve2
criu: 4.1.1-1
frr-pythontools: 10.6.1-1+pve2
ifupdown2: 3.3.0-1+pmx12
intel-microcode: 3.20251111.1~deb13u1
ksm-control-daemon: 1.5-1
libjs-extjs: 7.0.0-5
libproxmox-acme-perl: 1.7.1
libproxmox-backup-qemu0: 2.0.2
libproxmox-rs-perl: 0.4.1
libpve-access-control: 9.1.1
libpve-apiclient-perl: 3.4.2
libpve-cluster-api-perl: 9.1.5
libpve-cluster-perl: 9.1.5
libpve-common-perl: 9.1.12
libpve-guest-common-perl: 6.0.3
libpve-http-server-perl: 6.0.5
libpve-network-perl: 1.6.6
libpve-notify-perl: 9.1.5
libpve-rs-perl: 0.15.3
libpve-storage-perl: 9.1.5
libspice-server1: 0.15.2-1+b1
lvm2: 2.03.31-2+pmx1
lxc-pve: 7.0.0-2
lxcfs: 7.0.0-pve1
novnc-pve: 1.7.0-1
openvswitch-switch: 3.5.0-1+b1
proxmox-backup-client: 4.2.0-1
proxmox-backup-file-restore: 4.2.0-1
proxmox-backup-restore-image: 1.0.0
proxmox-firewall: 1.2.3
proxmox-kernel-helper: 9.2.0
proxmox-mail-forward: 1.0.3
proxmox-mini-journalreader: 1.6
proxmox-offline-mirror-helper: 0.7.4
proxmox-widget-toolkit: 5.2.2
pve-cluster: 9.1.5
pve-container: 6.1.10
pve-docs: 9.2.1
pve-edk2-firmware: 4.2025.05-2
pve-esxi-import-tools: 1.0.1
pve-firewall: 6.0.4
pve-firmware: 3.18-3
pve-ha-manager: 5.2.4
pve-i18n: 3.7.4
pve-qemu-kvm: 11.0.0-3
pve-xtermjs: 6.0.0-1
qemu-server: 9.1.15
smartmontools: 7.5-pve2
spiceterm: 3.4.2
swtpm: 0.8.0+pve3
vncterm: 1.9.2
zfsutils-linux: 2.4.2-pve1

Does someone have an idea what happened here?
 
Can you post the output of the following commands?

Code:
ip a

Can you try reloading the network configuration via the following command and paste the full output?

Code:
ifreload -avd

Does anything odd pop up in the journal?

Code:
journalctl -u openvswitch-switch.service -b
 
Hi Stefan,

I attached the ip a and ifreload output as it is too verbose for a normal post.

The journal looks like this:
Code:
root@node02:~# journalctl -u openvswitch-switch.service -b
May 21 18:03:33 node02 systemd[1]: Starting openvswitch-switch.service - Open vSwitch...
May 21 18:03:33 node02 systemd[1]: Finished openvswitch-switch.service - Open vSwitch.
 

Attachments

Hey,

thanks for the output! I could not really find anything suspicious in the output, could you run tcpdump on the interfaces, then try ping:
- tap131i0
- ln_v0074
- bond1
(same on the receiving side, tap.. changes to the one of the target guest)

how far do the ICMP packets get? So, on which interfaces do they show up?
Also, could you try disabling LACP on the bond and changing the bond_mode? Is anything different with that?
 
Hey,

thanks for the output! I could not really find anything suspicious in the output, could you run tcpdump on the interfaces, then try ping:
- tap131i0
- ln_v0074
- bond1
(same on the receiving side, tap.. changes to the one of the target guest)

how far do the ICMP packets get? So, on which interfaces do they show up?
Also, could you try disabling LACP on the bond and changing the bond_mode? Is anything different with that?
Hi, sorry for the late reply, I was on a two-week vacation.

testing from inside the VM trying to ping its gateway:
tap131i0 sees it
v0074 sees it
ln_v0074 doesn't see it
vmbr0 doesn't see it
bond1 doesn't see it

Trying to ping another VM that doesn't use the SDN VLAN but just the vmbr0 bridge with tag 74 on the VM interface behaves the same; I also don't see any output on the receiving side's tap interface.

switching bond1 to active-backup didn't change anything.
 
Hey,

thanks for the info! I tried to reproduce the problem you are describing, but could not. Could you try setting up some dummy guests and rebuilding your network setup (still dummy, don't rebuild the actual thing :)) step by step, narrowing down where things stop working? This should also result in a smaller setup that reproduces the problem and make it easier to debug.
 
Last edited:
I needed to set up a new cluster anyway for some testing and I managed to reproduce the issue. This time I used normal Linux Bridges from the start and not OVS Bridges, same result. This is the network config:

Code:
# network interface settings; autogenerated
# Please do NOT modify this file directly, unless you know what
# you're doing.
#
# If you want to manage parts of the network configuration manually,
# please utilize the 'source' or 'source-directory' directives to do
# so.
# PVE will preserve these directives, but will NOT read its network
# configuration from sourced files, so do not attempt to move any of
# the PVE managed interfaces into external files!

auto lo
iface lo inet loopback

auto nic0
iface nic0 inet manual
#Management A

auto nic2
iface nic2 inet static
    address 172.16.91.21/24
#Corosync A

auto nic3
iface nic3 inet static
    address 172.16.93.21/24
#Corosync B

auto nic1
iface nic1 inet manual
#Management B

auto nic9
iface nic9 inet static
    address 10.10.10.151/25
    mtu 9000
#iSCSI B

auto nic8
iface nic8 inet static
    address 10.10.10.21/25
    mtu 9000
#iSCSI A

auto nic7
iface nic7 inet manual
#Ceph B

auto nic6
iface nic6 inet manual
#Ceph A

auto nic5
iface nic5 inet manual
#Frontend B

auto nic4
iface nic4 inet manual
#Frontend A

auto bond0
iface bond0 inet static
    address 172.16.90.21/24
    gateway 172.16.90.1
    bond-slaves nic0 nic1
    bond-miimon 100
    bond-mode active-backup
    bond-primary nic0
#Management Bond

auto bond1
iface bond1 inet manual
    bond-slaves nic4 nic5
    bond-miimon 100
    bond-mode active-backup
#Frontend Bond

auto bond2
iface bond2 inet static
    address 172.16.92.21/24
    bond-slaves nic6 nic7
    bond-miimon 100
    bond-mode 802.3ad
    bond-xmit-hash-policy layer2+3
    mtu 9000
#Ceph LACP Bond

auto vmbr0
iface vmbr0 inet manual
    bridge-ports bond1
    bridge-stp off
    bridge-fd 0
    bridge-vlan-aware yes
    bridge-vids 2-4094

source /etc/network/interfaces.d/*

And here the SDN config:
Code:
root@node01:~# cat /etc/pve/sdn/vnets.cfg
vnet: v0074
    zone zone1
    tag 74

root@node01:~# cat /etc/pve/sdn/zones.cfg
vlan: zone1
    bridge vmbr0
    ipam pve

root@node01:~# cat /etc/pve/sdn/.running-config
{"zones":{"ids":{"zone1":{"ipam":"pve","bridge":"vmbr0","type":"vlan"}}},"subnets":{"ids":{}},"vnets":{"ids":{"v0074":{"tag":74,"zone":"zone1","type":"vnet"}}},"fabrics":{"ids":{}},"controllers":{"ids":{}},"version":2,"prefix-lists":{"ids":{}},"route-maps":{"ids":{}}}root@node01:~#

I migrated a VM to the new cluster. When I attach it to v0074, the VM loses connectivity; when I attach it to vmbr0 with VLAN tag 74 set on the VM's network interface, it works.
 
Can you additionally add the output of the following commands?

Code:
cat /etc/network/interfaces.d/sdn
ip a
 
sure, here is the output:
Code:
root@node01:~# cat /etc/network/interfaces.d/sdn
#version:2

auto v0074
iface v0074
    bridge_ports vmbr0.74
    bridge_stp off
    bridge_fd 0
root@node01:~# ip a
1: lo: <LOOPBACK,UP,LOWER_UP> mtu 65536 qdisc noqueue state UNKNOWN group default qlen 1000
    link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
    inet 127.0.0.1/8 scope host lo
       valid_lft forever preferred_lft forever
    inet6 ::1/128 scope host noprefixroute 
       valid_lft forever preferred_lft forever
2: nic5: <BROADCAST,MULTICAST,SLAVE,UP,LOWER_UP> mtu 1500 qdisc mq master bond1 state UP group default qlen 1000
    link/ether 3c:fd:fe:b5:03:71 brd ff:ff:ff:ff:ff:ff permaddr 3c:fd:fe:b5:03:70
    altname enp24s0f0np0
    altname enx3cfdfeb50370
3: nic4: <BROADCAST,MULTICAST,SLAVE,UP,LOWER_UP> mtu 1500 qdisc mq master bond1 state UP group default qlen 1000
    link/ether 3c:fd:fe:b5:03:71 brd ff:ff:ff:ff:ff:ff
    altname enp24s0f1np1
    altname enx3cfdfeb50371
4: nic2: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc mq state UP group default qlen 1000
    link/ether 24:6e:96:76:23:20 brd ff:ff:ff:ff:ff:ff
    altname enp26s0f0np0
    altname enx246e96762320
    inet 172.16.91.21/24 scope global nic2
       valid_lft forever preferred_lft forever
    inet6 fe80::266e:96ff:fe76:2320/64 scope link proto kernel_ll 
       valid_lft forever preferred_lft forever
5: nic3: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc mq state UP group default qlen 1000
    link/ether 24:6e:96:76:23:22 brd ff:ff:ff:ff:ff:ff
    altname enp26s0f1np1
    altname enx246e96762322
    inet 172.16.93.21/24 scope global nic3
       valid_lft forever preferred_lft forever
    inet6 fe80::266e:96ff:fe76:2322/64 scope link proto kernel_ll 
       valid_lft forever preferred_lft forever
6: nic0: <BROADCAST,MULTICAST,SLAVE,UP,LOWER_UP> mtu 1500 qdisc mq master bond0 state UP group default qlen 1000
    link/ether 24:6e:96:76:23:24 brd ff:ff:ff:ff:ff:ff
    altname enp26s0f2np2
    altname enx246e96762324
7: nic1: <BROADCAST,MULTICAST,SLAVE,UP,LOWER_UP> mtu 1500 qdisc mq master bond0 state UP group default qlen 1000
    link/ether 24:6e:96:76:23:24 brd ff:ff:ff:ff:ff:ff permaddr 24:6e:96:76:23:26
    altname enp26s0f3np3
    altname enx246e96762326
8: nic7: <BROADCAST,MULTICAST,SLAVE,UP,LOWER_UP> mtu 9000 qdisc mq master bond2 state UP group default qlen 1000
    link/ether 40:a6:b7:0d:65:10 brd ff:ff:ff:ff:ff:ff
    altname enp134s0f0np0
    altname enx40a6b70d6510
9: nic6: <BROADCAST,MULTICAST,SLAVE,UP,LOWER_UP> mtu 9000 qdisc mq master bond2 state UP group default qlen 1000
    link/ether 40:a6:b7:0d:65:10 brd ff:ff:ff:ff:ff:ff permaddr 40:a6:b7:0d:65:11
    altname enp134s0f1np1
    altname enx40a6b70d6511
10: nic9: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 9000 qdisc mq state UP group default qlen 1000
    link/ether 40:a6:b7:0e:30:c0 brd ff:ff:ff:ff:ff:ff
    altname enp175s0f0np0
    altname enx40a6b70e30c0
    inet 10.10.10.151/25 scope global nic9
       valid_lft forever preferred_lft forever
    inet6 fe80::42a6:b7ff:fe0e:30c0/64 scope link proto kernel_ll 
       valid_lft forever preferred_lft forever
11: nic8: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 9000 qdisc mq state UP group default qlen 1000
    link/ether 40:a6:b7:0e:30:c1 brd ff:ff:ff:ff:ff:ff
    altname enp175s0f1np1
    altname enx40a6b70e30c1
    inet 10.10.10.21/25 scope global nic8
       valid_lft forever preferred_lft forever
    inet6 fe80::42a6:b7ff:fe0e:30c1/64 scope link proto kernel_ll 
       valid_lft forever preferred_lft forever
12: dtl0: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 qdisc noqueue state UNKNOWN group default qlen 1000
    link/ether ae:35:48:fe:92:54 brd ff:ff:ff:ff:ff:ff
    inet 169.254.1.1/32 scope link dtl0
       valid_lft forever preferred_lft forever
    inet6 fe80::ac35:48ff:fefe:9254/64 scope link proto kernel_ll 
       valid_lft forever preferred_lft forever
13: bond0: <BROADCAST,MULTICAST,MASTER,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default qlen 1000
    link/ether 24:6e:96:76:23:24 brd ff:ff:ff:ff:ff:ff
    inet 172.16.90.21/24 scope global bond0
       valid_lft forever preferred_lft forever
    inet6 fe80::266e:96ff:fe76:2324/64 scope link proto kernel_ll 
       valid_lft forever preferred_lft forever
14: bond2: <BROADCAST,MULTICAST,MASTER,UP,LOWER_UP> mtu 9000 qdisc noqueue state UP group default qlen 1000
    link/ether 40:a6:b7:0d:65:10 brd ff:ff:ff:ff:ff:ff
    inet 172.16.92.21/24 scope global bond2
       valid_lft forever preferred_lft forever
    inet6 fe80::42a6:b7ff:fe0d:6510/64 scope link proto kernel_ll 
       valid_lft forever preferred_lft forever
15: bond1: <BROADCAST,MULTICAST,MASTER,UP,LOWER_UP> mtu 1500 qdisc noqueue master vmbr0 state UP group default qlen 1000
    link/ether 3c:fd:fe:b5:03:71 brd ff:ff:ff:ff:ff:ff
16: vmbr0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default qlen 1000
    link/ether 3c:fd:fe:b5:03:71 brd ff:ff:ff:ff:ff:ff
    inet6 fe80::3efd:feff:feb5:371/64 scope link proto kernel_ll 
       valid_lft forever preferred_lft forever
17: vmbr0.74@vmbr0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue master v0074 state UP group default qlen 1000
    link/ether 3c:fd:fe:b5:03:71 brd ff:ff:ff:ff:ff:ff
18: v0074: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default qlen 1000
    link/ether 3c:fd:fe:b5:03:71 brd ff:ff:ff:ff:ff:ff
    inet6 fe80::3efd:feff:feb5:371/64 scope link proto kernel_ll 
       valid_lft forever preferred_lft forever
20: docker0: <NO-CARRIER,BROADCAST,MULTICAST,UP> mtu 1500 qdisc noqueue state DOWN group default 
    link/ether 8a:7f:ce:34:0f:85 brd ff:ff:ff:ff:ff:ff
    inet 10.208.0.1/24 brd 10.208.0.255 scope global docker0
       valid_lft forever preferred_lft forever
22: tap1000i0: <BROADCAST,MULTICAST,PROMISC,UP,LOWER_UP> mtu 1500 qdisc fq_codel master v0074 state UNKNOWN group default qlen 1000
    link/ether 72:9c:95:ba:4a:fa brd ff:ff:ff:ff:ff:ff
 
Does it work if you remove docker from the host?
It is known to interfere with networking on a PVE host, so I suspect that it might have something to do with the firewall rules it creates on the host.
 
Does it work if you remove docker from the host?
It is known to interfere with networking on a PVE host, so I suspect that it might have something to do with the firewall rules it creates on the host.
Yup, that did the trick... wow o_O That's... mildly inconvenient.
We usually need docker as some monitoring checks run inside a docker container. At least we now know where the issue lies, not sure yet how to fix that...