网卡将所有数据包都给内核处理吗(在自制操作系统上写网卡驱动)
为什么要在自制操作系统上写网卡驱动?请看这里:
如何在自制操作系统写网卡驱动程序(1)
那么今天,我们就开始第一步:看看其他操作系统上的网卡驱动是如何写的?
先看下linux操作系统中,是如何和网卡通信的。
在硬件加电初始化时,BIOS统一检查所有的pci设备,并为每个设备分配一个物理地址,该地址通过BIOS获得并写到设备的配置空间内,驱动程序就可以将网卡的普通控制寄存器映射到一段内存空间内,CPU通过访问映射后的虚拟地址来操控网卡的寄存器。
当操作系统初始化时,其为每个PCI设备分配一个pci_dev结构,并将前面分配的物理地址写到PCI_dev的resource字段中。
在网卡驱动程序中则可以通过读取pci_dev中的resource字段获得网卡的寄存器配置空间地址,其由函数pci_resource_start()和pci_resource_end()获得该空间的起始位置和结束位置,通过ioremap()将该段位置映射到主存中,以便CPU访问控制网卡的I/O和内存空间。
调用pci_resource_start
ioremap( pci_resource_start(pdev, BAR_1), pci_resource_len(pdev, BAR_1) );
pci_resource_start只是一个宏定义:
/*
 * These helpers provide future and backwards compatibility
 * for accessing popular PCI BAR info
 */
/* Start address of BAR `bar`, read from dev->resource[bar]. */
#define pci_resource_start(dev, bar)	((dev)->resource[(bar)].start)
/* End address of BAR `bar`; inclusive, since the length below is end - start + 1. */
#define pci_resource_end(dev, bar)	((dev)->resource[(bar)].end)
/* Flag bits of BAR `bar` (this only reads the field, it does not set it). */
#define pci_resource_flags(dev, bar)	((dev)->resource[(bar)].flags)
/*
 * Length of BAR `bar`:
 *   (start == 0 && end == start) ? 0 : end - start + 1
 * i.e. an all-zero resource has length 0, anything else spans
 * [start, end] inclusive.  `dev` and `bar` are evaluated more than once,
 * so do not pass expressions with side effects.
 */
#define pci_resource_len(dev,bar) \
	((pci_resource_start((dev), (bar)) == 0 &&	\
	  pci_resource_end((dev), (bar)) ==		\
	  pci_resource_start((dev), (bar))) ? 0 :	\
							\
	 (pci_resource_end((dev), (bar)) -		\
	  pci_resource_start((dev), (bar)) + 1))
这些宏定义的是读取resource结构体数组中某个resource的start,end,flags字段的动作(只是读取,并不赋值)。resource数组在pci_dev中的定义,以及resource结构体本身如下:
struct resource resource[DEVICE_COUNT_RESOURCE]; /* I/O and memory regions expansion ROMs */
/*
 * Resources are tree-like, allowing
 * nesting etc..
 */
struct resource {
/* First address of the region (what pci_resource_start() returns). */
resource_size_t start;
/* Last address of the region (what pci_resource_end() returns); presumably inclusive — confirm against pci_resource_len(). */
resource_size_t end;
/* Name string for the region. */
const char *name;
/* Flag bits; code in this file tests them against IORESOURCE_IO / IORESOURCE_MEM. */
unsigned long flags;
/* NOTE(review): meaning of desc is not visible in this excerpt. */
unsigned long desc;
/* Links to the parent, sibling and child nodes of the resource tree. */
struct resource *parent, *sibling, *child;
};
在linux使用e1000网卡时,初始化的函数为e1000_probe.
这个函数执行完,网卡以及相关协议的结构体都完成配置,中断函数也完成了绑定,操作系统就可以收到网卡的中断信息了,所以这个函数里的代码是可以参考的。
/**
 * e1000_probe - Device initialization Routine
 * @pdev: PCI device information struct
 * @ent: entry in e1000_pci_tbl
 *
 * Returns 0 on success, negative on failure
 *
 * e1000_probe initializes an adapter identified by a pci_dev structure.
 * The OS initialization, configuring of the adapter private structure,
 * and a hardware reset occur.
 *
 * Error handling uses a chain of labels at the end of the function; each
 * label undoes the steps that had completed before the failing one.
 **/
static int e1000_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
{
	struct net_device *netdev;
	struct e1000_adapter *adapter = NULL;
	struct e1000_hw *hw;

	/* number of adapters probed so far; used as the board number */
	static int cards_found;
	static int global_quad_port_a; /* global ksp3 port a indication */
	int i, err, pci_using_dac;
	u16 eeprom_data = 0;
	u16 tmp = 0;
	u16 eeprom_apme_mask = E1000_EEPROM_APME;
	int bars, need_ioport;
	bool disable_dev = false;

	/* do not allocate ioport bars when not needed */
	need_ioport = e1000_is_need_ioport(pdev);
	if (need_ioport) {
		bars = pci_select_bars(pdev, IORESOURCE_MEM | IORESOURCE_IO);
		err = pci_enable_device(pdev);
	} else {
		bars = pci_select_bars(pdev, IORESOURCE_MEM);
		err = pci_enable_device_mem(pdev);
	}
	if (err)
		return err;

	err = pci_request_selected_regions(pdev, bars, e1000_driver_name);
	if (err)
		goto err_pci_reg;

	pci_set_master(pdev);
	err = pci_save_state(pdev);
	if (err)
		goto err_alloc_etherdev;

	err = -ENOMEM;
	netdev = alloc_etherdev(sizeof(struct e1000_adapter));
	if (!netdev)
		goto err_alloc_etherdev;

	SET_NETDEV_DEV(netdev, &pdev->dev);

	pci_set_drvdata(pdev, netdev);
	adapter = netdev_priv(netdev);
	adapter->netdev = netdev;
	adapter->pdev = pdev;
	adapter->msg_enable = netif_msg_init(debug, DEFAULT_MSG_ENABLE);
	adapter->bars = bars;
	adapter->need_ioport = need_ioport;

	hw = &adapter->hw;
	hw->back = adapter;

	/* map the register window (BAR 0) so the registers can be accessed */
	err = -EIO;
	hw->hw_addr = pci_ioremap_bar(pdev, BAR_0);
	if (!hw->hw_addr)
		goto err_ioremap;

	if (adapter->need_ioport) {
		/* take the first non-empty I/O-port BAR after BAR 0 */
		for (i = BAR_1; i < PCI_STD_NUM_BARS; i++) {
			if (pci_resource_len(pdev, i) == 0)
				continue;
			if (pci_resource_flags(pdev, i) & IORESOURCE_IO) {
				hw->io_base = pci_resource_start(pdev, i);
				break;
			}
		}
	}

	/* make ready for any if (hw->...) below */
	err = e1000_init_hw_struct(adapter, hw);
	if (err)
		goto err_sw_init;

	/* there is a workaround being applied below that limits
	 * 64-bit DMA addresses to 64-bit hardware.  There are some
	 * 32-bit adapters that Tx hang when given 64-bit DMA addresses
	 */
	pci_using_dac = 0;
	if ((hw->bus_type == e1000_bus_type_pcix) &&
	    !dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64))) {
		pci_using_dac = 1;
	} else {
		err = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32));
		if (err) {
			pr_err("No usable DMA config, aborting\n");
			goto err_dma;
		}
	}

	netdev->netdev_ops = &e1000_netdev_ops;
	e1000_set_ethtool_ops(netdev);
	netdev->watchdog_timeo = 5 * HZ;
	netif_napi_add(netdev, &adapter->napi, e1000_clean, 64);

	strncpy(netdev->name, pci_name(pdev), sizeof(netdev->name) - 1);

	adapter->bd_number = cards_found;

	/* setup the private structure */

	err = e1000_sw_init(adapter);
	if (err)
		goto err_sw_init;

	err = -EIO;
	if (hw->mac_type == e1000_ce4100) {
		hw->ce4100_gbe_mdio_base_virt =
					ioremap(pci_resource_start(pdev, BAR_1),
						pci_resource_len(pdev, BAR_1));

		if (!hw->ce4100_gbe_mdio_base_virt)
			goto err_mdio_ioremap;
	}

	if (hw->mac_type >= e1000_82543) {
		netdev->hw_features = NETIF_F_SG |
				   NETIF_F_HW_CSUM |
				   NETIF_F_HW_VLAN_CTAG_RX;
		netdev->features = NETIF_F_HW_VLAN_CTAG_TX |
				   NETIF_F_HW_VLAN_CTAG_FILTER;
	}

	if ((hw->mac_type >= e1000_82544) &&
	   (hw->mac_type != e1000_82547))
		netdev->hw_features |= NETIF_F_TSO;

	netdev->priv_flags |= IFF_SUPP_NOFCS;

	netdev->features |= netdev->hw_features;
	netdev->hw_features |= (NETIF_F_RXCSUM |
				NETIF_F_RXALL |
				NETIF_F_RXFCS);

	if (pci_using_dac) {
		netdev->features |= NETIF_F_HIGHDMA;
		netdev->vlan_features |= NETIF_F_HIGHDMA;
	}

	netdev->vlan_features |= (NETIF_F_TSO |
				  NETIF_F_HW_CSUM |
				  NETIF_F_SG);

	/* Do not set IFF_UNICAST_FLT for VMWare's 82545EM */
	if (hw->device_id != E1000_DEV_ID_82545EM_COPPER ||
	    hw->subsystem_vendor_id != PCI_VENDOR_ID_VMWARE)
		netdev->priv_flags |= IFF_UNICAST_FLT;

	/* MTU range: 46 - 16110 */
	netdev->min_mtu = ETH_ZLEN - ETH_HLEN;
	netdev->max_mtu = MAX_JUMBO_FRAME_SIZE - (ETH_HLEN + ETH_FCS_LEN);

	adapter->en_mng_pt = e1000_enable_mng_pass_thru(hw);

	/* initialize eeprom parameters */
	if (e1000_init_eeprom_params(hw)) {
		e_err(probe, "EEPROM initialization failed\n");
		goto err_eeprom;
	}

	/* before reading the EEPROM, reset the controller to
	 * put the device in a known good starting state
	 */

	e1000_reset_hw(hw);

	/* make sure the EEPROM is good */
	if (e1000_validate_eeprom_checksum(hw) < 0) {
		e_err(probe, "The EEPROM Checksum Is Not Valid\n");
		e1000_dump_eeprom(adapter);
		/* set MAC address to all zeroes to invalidate and temporary
		 * disable this device for the user. This blocks regular
		 * traffic while still permitting ethtool ioctls from reaching
		 * the hardware as well as allowing the user to run the
		 * interface after manually setting a hw addr using
		 * `ip set address`
		 */
		memset(hw->mac_addr, 0, netdev->addr_len);
	} else {
		/* copy the MAC address out of the EEPROM */
		if (e1000_read_mac_addr(hw))
			e_err(probe, "EEPROM Read Error\n");
	}
	/* don't block initialization here due to bad MAC address */
	memcpy(netdev->dev_addr, hw->mac_addr, netdev->addr_len);

	if (!is_valid_ether_addr(netdev->dev_addr))
		e_err(probe, "Invalid MAC Address\n");


	INIT_DELAYED_WORK(&adapter->watchdog_task, e1000_watchdog);
	INIT_DELAYED_WORK(&adapter->fifo_stall_task,
			  e1000_82547_tx_fifo_stall_task);
	INIT_DELAYED_WORK(&adapter->phy_info_task, e1000_update_phy_info_task);
	INIT_WORK(&adapter->reset_task, e1000_reset_task);

	e1000_check_options(adapter);

	/* Initial Wake on LAN setting
	 * If APM wake is enabled in the EEPROM,
	 * enable the ACPI Magic Packet filter
	 */

	switch (hw->mac_type) {
	case e1000_82542_rev2_0:
	case e1000_82542_rev2_1:
	case e1000_82543:
		break;
	case e1000_82544:
		e1000_read_eeprom(hw,
			EEPROM_INIT_CONTROL2_REG, 1, &eeprom_data);
		eeprom_apme_mask = E1000_EEPROM_82544_APM;
		break;
	case e1000_82546:
	case e1000_82546_rev_3:
		if (er32(STATUS) & E1000_STATUS_FUNC_1) {
			e1000_read_eeprom(hw,
				EEPROM_INIT_CONTROL3_PORT_B, 1, &eeprom_data);
			break;
		}
		/* Fall Through */
	default:
		e1000_read_eeprom(hw,
			EEPROM_INIT_CONTROL3_PORT_A, 1, &eeprom_data);
		break;
	}
	if (eeprom_data & eeprom_apme_mask)
		adapter->eeprom_wol |= E1000_WUFC_MAG;

	/* now that we have the eeprom settings, apply the special cases
	 * where the eeprom may be wrong or the board simply won't support
	 * wake on lan on a particular port
	 */
	switch (pdev->device) {
	case E1000_DEV_ID_82546GB_PCIE:
		adapter->eeprom_wol = 0;
		break;
	case E1000_DEV_ID_82546EB_FIBER:
	case E1000_DEV_ID_82546GB_FIBER:
		/* Wake events only supported on port A for dual fiber
		 * regardless of eeprom setting
		 */
		if (er32(STATUS) & E1000_STATUS_FUNC_1)
			adapter->eeprom_wol = 0;
		break;
	case E1000_DEV_ID_82546GB_QUAD_COPPER_KSP3:
		/* if quad port adapter, disable WoL on all but port A */
		if (global_quad_port_a != 0)
			adapter->eeprom_wol = 0;
		else
			adapter->quad_port_a = true;
		/* Reset for multiple quad port adapters */
		if (++global_quad_port_a == 4)
			global_quad_port_a = 0;
		break;
	}

	/* initialize the wol settings based on the eeprom settings */
	adapter->wol = adapter->eeprom_wol;
	device_set_wakeup_enable(&adapter->pdev->dev, adapter->wol);

	/* Auto detect PHY address */
	if (hw->mac_type == e1000_ce4100) {
		for (i = 0; i < 32; i++) {
			hw->phy_addr = i;
			e1000_read_phy_reg(hw, PHY_ID2, &tmp);

			if (tmp != 0 && tmp != 0xFF)
				break;
		}

		/* no PHY answered on any of the 32 addresses */
		if (i >= 32)
			goto err_eeprom;
	}

	/* reset the hardware with the new settings */
	e1000_reset(adapter);

	strcpy(netdev->name, "eth%d");
	err = register_netdev(netdev);
	if (err)
		goto err_register;

	e1000_vlan_filter_on_off(adapter, false);

	/* print bus type/speed/width info */
	e_info(probe, "(PCI%s:%dMHz:%d-bit) %pM\n",
	       ((hw->bus_type == e1000_bus_type_pcix) ? "-X" : ""),
	       ((hw->bus_speed == e1000_bus_speed_133) ? 133 :
		(hw->bus_speed == e1000_bus_speed_120) ? 120 :
		(hw->bus_speed == e1000_bus_speed_100) ? 100 :
		(hw->bus_speed == e1000_bus_speed_66) ? 66 : 33),
	       ((hw->bus_width == e1000_bus_width_64) ? 64 : 32),
	       netdev->dev_addr);

	/* carrier off reporting is important to ethtool even BEFORE open */
	netif_carrier_off(netdev);

	e_info(probe, "Intel(R) PRO/1000 Network Connection\n");

	cards_found++;
	return 0;

err_register:
err_eeprom:
	e1000_phy_hw_reset(hw);

	if (hw->flash_address)
		iounmap(hw->flash_address);
	kfree(adapter->tx_ring);
	kfree(adapter->rx_ring);
err_dma:
err_sw_init:
err_mdio_ioremap:
	iounmap(hw->ce4100_gbe_mdio_base_virt);
	iounmap(hw->hw_addr);
err_ioremap:
	disable_dev = !test_and_set_bit(__E1000_DISABLED, &adapter->flags);
	free_netdev(netdev);
err_alloc_etherdev:
	pci_release_selected_regions(pdev, bars);
err_pci_reg:
	/* adapter is still NULL when we failed before alloc_etherdev() */
	if (!adapter || disable_dev)
		pci_disable_device(pdev);
	return err;
}
其中,函数e1000_is_need_ioport,表示当前网卡是否需要I/O端口(ioport)资源:
/**
 * e1000_is_need_ioport - tell whether an adapter needs I/O-port resources
 * @pdev: PCI device information struct
 *
 * The decision is made purely from pdev->device: the device IDs listed
 * below yield true, every other ID yields false.
 **/
static int e1000_is_need_ioport(struct pci_dev *pdev)
{
	int needs_ioport = false;

	switch (pdev->device) {
	case E1000_DEV_ID_82540EM:
	case E1000_DEV_ID_82540EM_LOM:
	case E1000_DEV_ID_82540EP:
	case E1000_DEV_ID_82540EP_LOM:
	case E1000_DEV_ID_82540EP_LP:
	case E1000_DEV_ID_82541EI:
	case E1000_DEV_ID_82541EI_MOBILE:
	case E1000_DEV_ID_82541ER:
	case E1000_DEV_ID_82541ER_LOM:
	case E1000_DEV_ID_82541GI:
	case E1000_DEV_ID_82541GI_LF:
	case E1000_DEV_ID_82541GI_MOBILE:
	case E1000_DEV_ID_82544EI_COPPER:
	case E1000_DEV_ID_82544EI_FIBER:
	case E1000_DEV_ID_82544GC_COPPER:
	case E1000_DEV_ID_82544GC_LOM:
	case E1000_DEV_ID_82545EM_COPPER:
	case E1000_DEV_ID_82545EM_FIBER:
	case E1000_DEV_ID_82546EB_COPPER:
	case E1000_DEV_ID_82546EB_FIBER:
	case E1000_DEV_ID_82546EB_QUAD_COPPER:
		needs_ioport = true;
		break;
	default:
		break;
	}

	return needs_ioport;
}
/**
 * pci_select_bars - Make BAR mask from the type of resource
 * @dev: the PCI device for which BAR mask is made
 * @flags: resource type mask to be selected
 *
 * This helper routine makes bar mask from the type of resource.
 * Bit i of the returned mask is set when resource i of @dev has at least
 * one of the bits in @flags set.
 *
 * (Author's note: memory-mapping related; the goal here is to trace the
 * call chain down to the underlying assembly.)
 */
int pci_select_bars(struct pci_dev *dev, unsigned long flags)
{
	int i, bars = 0;

	for (i = 0; i < PCI_NUM_RESOURCES; i++)
		if (pci_resource_flags(dev, i) & flags)
			bars |= (1 << i);
	return bars;
}
EXPORT_SYMBOL(pci_select_bars);
具体这里的EXPORT_SYMBOL是啥?
/*
 * NOTE(review): this is a stitched-together excerpt of the kernel's symbol
 * export macros. As shown here the conditional directives do not balance
 * (the #endif below has no visible #if, and the later #else/#endif pair up
 * with #ifdef CONFIG_UNUSED_SYMBOLS), and EXPORT_SYMBOL is defined twice.
 * Read it as an illustration of the expansion chain, not as compilable code:
 *   EXPORT_SYMBOL -> _EXPORT_SYMBOL -> __EXPORT_SYMBOL -> ___EXPORT_SYMBOL
 */
/* Public entry point: export sym with an empty section suffix. */
#define EXPORT_SYMBOL(sym) _EXPORT_SYMBOL(sym, "")
/* Plain forwarder to the three-underscore implementation macro. */
#define __EXPORT_SYMBOL(sym, sec, ns) ___EXPORT_SYMBOL(sym, sec, ns)
#endif /* CONFIG_MODULES */
/*
 * _EXPORT_SYMBOL supplies the namespace argument: the stringified
 * DEFAULT_SYMBOL_NAMESPACE when that macro is defined, otherwise "".
 */
#ifdef DEFAULT_SYMBOL_NAMESPACE
#include <linux/stringify.h>
#define _EXPORT_SYMBOL(sym, sec) __EXPORT_SYMBOL(sym, sec, __stringify(DEFAULT_SYMBOL_NAMESPACE))
#else
#define _EXPORT_SYMBOL(sym, sec) __EXPORT_SYMBOL(sym, sec, "")
#endif
/* Variants differ only in the section suffix and/or namespace they pass on. */
#define EXPORT_SYMBOL(sym) _EXPORT_SYMBOL(sym, "")
#define EXPORT_SYMBOL_GPL(sym) _EXPORT_SYMBOL(sym, "_gpl")
#define EXPORT_SYMBOL_GPL_FUTURE(sym) _EXPORT_SYMBOL(sym, "_gpl_future")
#define EXPORT_SYMBOL_NS(sym, ns) __EXPORT_SYMBOL(sym, "", #ns)
#define EXPORT_SYMBOL_NS_GPL(sym, ns) __EXPORT_SYMBOL(sym, "_gpl", #ns)
#ifdef CONFIG_UNUSED_SYMBOLS
#define EXPORT_UNUSED_SYMBOL(sym) _EXPORT_SYMBOL(sym, "_unused")
#define EXPORT_UNUSED_SYMBOL_GPL(sym) _EXPORT_SYMBOL(sym, "_unused_gpl")
/*
 * For every exported symbol, do the following:
 *
 * - If applicable, place a CRC entry in the __kcrctab section.
 * - Put the name of the symbol and namespace (empty string "" for none) in
 * __ksymtab_strings.
 * - Place a struct kernel_symbol entry in the __ksymtab section.
 *
 * note on .section use: we specify progbits since usage of the "M" (SHF_MERGE)
 * section flag requires it. Use '%progbits' instead of '@progbits' since the
 * former apparently works on all arches according to the binutils source.
 */
#define ___EXPORT_SYMBOL(sym, sec, ns) \
extern typeof(sym) sym; \
extern const char __kstrtab_##sym[]; \
extern const char __kstrtabns_##sym[]; \
__CRC_SYMBOL(sym, sec); \
asm(" .section \"__ksymtab_strings\",\"aMS\",%progbits,1 \n" \
"__kstrtab_" #sym ": \n" \
" .asciz \"" #sym "\" \n" \
"__kstrtabns_" #sym ": \n" \
" .asciz \"" ns "\" \n" \
" .previous \n"); \
__KSYMTAB_ENTRY(sym, sec)
/* CRC entry variant that stores the CRC symbol as an offset relative to the entry ("- ."). */
#define __CRC_SYMBOL(sym, sec) \
asm(" .section \"___kcrctab" sec " " #sym "\", \"a\" \n" \
" .weak __crc_" #sym " \n" \
" .long __crc_" #sym " - . \n" \
" .previous \n")
#else
/* CRC entry variant that stores the CRC symbol value directly. */
#define __CRC_SYMBOL(sym, sec) \
asm(" .section \"___kcrctab" sec " " #sym "\", \"a\" \n" \
" .weak __crc_" #sym " \n" \
" .long __crc_" #sym " \n" \
" .previous \n")
#endif
/*
 * Emit the ksymtab entry as a pair of relative references: this reduces
 * the size by half on 64-bit architectures, and eliminates the need for
 * absolute relocations that require runtime processing on relocatable
 * kernels.
 */
#define __KSYMTAB_ENTRY(sym, sec) \
__ADDRESSABLE(sym) \
asm(" .section \"___ksymtab" sec " " #sym "\", \"a\" \n" \
" .balign 4 \n" \
"__ksymtab_" #sym ": \n" \
" .long " #sym "- . \n" \
" .long __kstrtab_" #sym "- . \n" \
" .long __kstrtabns_" #sym "- . \n" \
" .previous \n")
原来是汇编。
其中,对pci设备初始化的函数也很重要:
函数 pci_enable_device(pdev)
/**
 * pci_enable_device - Initialize device before it's used by a driver.
 * @dev: PCI device to be initialized
 *
 * Thin wrapper: asks pci_enable_device_flags() to enable both the memory
 * and the I/O-port resources of @dev and returns its result unchanged.
 * Beware, this function can fail.
 *
 * Calling it repeatedly does not enable the device several times; the
 * callee only bumps an enable counter after the first call.
 */
#define IORESOURCE_IO 0x00000100 /* PCI/ISA I/O ports */
#define IORESOURCE_MEM 0x00000200
int pci_enable_device(struct pci_dev *dev)
{
	const unsigned long wanted = IORESOURCE_MEM | IORESOURCE_IO;

	return pci_enable_device_flags(dev, wanted);
}
EXPORT_SYMBOL(pci_enable_device);
static int pci_enable_device_flags(struct pci_dev *dev, unsigned long flags)
{
struct pci_dev *bridge;
int err;
int i, bars = 0;
/*
* Power state could be unknown at this point, either due to a fresh
* boot or a device removal call. So get the current power state
* so that things like MSI message writing will behave as expected
* (e.g. if the device really is in D0 at enable time).
*/
if (dev->pm_cap) {
u16 pmcsr;
pci_read_config_word(dev, dev->pm_cap PCI_PM_CTRL, &pmcsr);
dev->current_state = (pmcsr & PCI_PM_CTRL_STATE_MASK);
}
if (atomic_inc_return(&dev->enable_cnt) > 1)
return 0; /* already enabled */
bridge = pci_upstream_bridge(dev);
if (bridge)
pci_enable_bridge(bridge);
/* only skip sriov related */
for (i = 0; i <= PCI_ROM_RESOURCE; i )
if (dev->resource[i].flags & flags)
bars |= (1 << i);
for (i = PCI_BRIDGE_RESOURCES; i < DEVICE_COUNT_RESOURCE; i )
if (dev->resource[i].flags & flags)
bars |= (1 << i);
err = do_pci_enable_device(dev, bars);
if (err < 0)
atomic_dec(&dev->enable_cnt);
return err;
}
最终的执行函数为do_pci_enable_device
#define PCI_D0 ((pci_power_t __force) 0)
#define PCI_INTERRUPT_PIN 0x3d /* 8 bits */
#define PCI_COMMAND 0x04 /* 16 bits */
#define PCI_COMMAND_INTX_DISABLE 0x400 /* INTx Emulation Disable */
/*
 * do_pci_enable_device - do the actual work of enabling a PCI device.
 * @dev:  device to enable
 * @bars: bit mask of the resources to enable
 *
 * Steps, in order: put the device into PCI_D0 (an -EIO result is
 * tolerated), configure ASPM on the upstream bridge if there is one, let
 * pcibios_enable_device() enable the selected resources, run the enable
 * fixups, and finally - unless MSI/MSI-X is already in use - clear the
 * INTx-disable bit in PCI_COMMAND when the device reports an interrupt pin.
 *
 * Returns 0 on success or the negative error of the failing step.
 */
static int do_pci_enable_device(struct pci_dev *dev, int bars)
{
	struct pci_dev *upstream;
	u16 command;
	u8 intx_pin;
	int rc;

	rc = pci_set_power_state(dev, PCI_D0);
	if (rc < 0 && rc != -EIO)
		return rc;

	upstream = pci_upstream_bridge(dev);
	if (upstream)
		pcie_aspm_powersave_config_link(upstream);

	rc = pcibios_enable_device(dev, bars);
	if (rc < 0)
		return rc;
	pci_fixup_device(pci_fixup_enable, dev);

	/* with MSI or MSI-X active there is nothing to do for legacy INTx */
	if (dev->msi_enabled || dev->msix_enabled)
		return 0;

	pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &intx_pin);
	if (!intx_pin)
		return 0;

	pci_read_config_word(dev, PCI_COMMAND, &command);
	if (command & PCI_COMMAND_INTX_DISABLE)
		pci_write_config_word(dev, PCI_COMMAND,
				      command & ~PCI_COMMAND_INTX_DISABLE);

	return 0;
}
/*
 * Default (weak) implementation: simply forwards to pci_enable_resources()
 * with the same device and BAR mask and returns its result.
 */
int __weak pcibios_enable_device(struct pci_dev *dev, int bars)
{
return pci_enable_resources(dev, bars);
}
/*
 * pci_enable_resources - turn on I/O and/or memory decoding for a device.
 * @dev:  PCI device to operate on
 * @mask: bit mask of resource indices to consider (bit i = resource i)
 *
 * Reads PCI_COMMAND, sets PCI_COMMAND_IO / PCI_COMMAND_MEMORY for every
 * selected resource of the matching type, and writes the register back
 * only when its value changed.
 *
 * Returns 0 on success, -EINVAL if a selected resource is still unset
 * (IORESOURCE_UNSET) or has no parent in the resource tree.
 */
int pci_enable_resources(struct pci_dev *dev, int mask)
{
	u16 cmd, old_cmd;
	int i;
	struct resource *r;

	/* key call: fetch the current command register from config space */
	pci_read_config_word(dev, PCI_COMMAND, &cmd);
	old_cmd = cmd;

	for (i = 0; i < PCI_NUM_RESOURCES; i++) {
		if (!(mask & (1 << i)))
			continue;

		r = &dev->resource[i];

		if (!(r->flags & (IORESOURCE_IO | IORESOURCE_MEM)))
			continue;
		/* the expansion ROM only counts when it is flagged as enabled */
		if ((i == PCI_ROM_RESOURCE) &&
				(!(r->flags & IORESOURCE_ROM_ENABLE)))
			continue;

		if (r->flags & IORESOURCE_UNSET) {
			pci_err(dev, "can't enable device: BAR %d %pR not assigned\n",
				i, r);
			return -EINVAL;
		}

		if (!r->parent) {
			pci_err(dev, "can't enable device: BAR %d %pR not claimed\n",
				i, r);
			return -EINVAL;
		}

		if (r->flags & IORESOURCE_IO)
			cmd |= PCI_COMMAND_IO;
		if (r->flags & IORESOURCE_MEM)
			cmd |= PCI_COMMAND_MEMORY;
	}

	if (cmd != old_cmd) {
		pci_info(dev, "enabling device (%04x -> %04x)\n", old_cmd, cmd);
		/* key call: write the updated command register back */
		pci_write_config_word(dev, PCI_COMMAND, cmd);
	}
	return 0;
}
/*
 * pci_read_config_word - read a 16-bit value from a device's config space.
 * @dev:   device to read from
 * @where: offset inside the configuration space
 * @val:   receives the value read
 *
 * For a disconnected device *val is set to all ones and
 * PCIBIOS_DEVICE_NOT_FOUND is returned; otherwise the read is delegated to
 * pci_bus_read_config_word() on the device's bus and devfn.
 */
int pci_read_config_word(const struct pci_dev *dev, int where, u16 *val)
{
	if (!pci_dev_is_disconnected(dev))
		return pci_bus_read_config_word(dev->bus, dev->devfn, where, val);

	*val = ~0;
	return PCIBIOS_DEVICE_NOT_FOUND;
}
顺着代码往深了找,希望能找到读写配置文件的汇编代码
/*
 * Alignment checks used by the accessors below via PCI_##size##_BAD:
 * non-zero means `pos` is not aligned for that access size
 * (bytes are always fine, words need bit 0 clear, dwords bits 1:0 clear).
 */
#define PCI_byte_BAD 0
#define PCI_word_BAD (pos & 1)
#define PCI_dword_BAD (pos & 3)
/*
 * Config-space locking: with CONFIG_PCI_LOCKLESS_CONFIG the lock macros
 * only evaluate their argument; otherwise they take/release pci_lock with
 * interrupts saved/restored.
 */
#ifdef CONFIG_PCI_LOCKLESS_CONFIG
# define pci_lock_config(f) do { (void)(f); } while (0)
# define pci_unlock_config(f) do { (void)(f); } while (0)
#else
# define pci_lock_config(f) raw_spin_lock_irqsave(&pci_lock, f)
# define pci_unlock_config(f) raw_spin_unlock_irqrestore(&pci_lock, f)
#endif
/*
 * Generates pci_bus_read_config_<size>(): rejects a misaligned `pos` with
 * PCIBIOS_BAD_REGISTER_NUMBER, then calls bus->ops->read() for `len` bytes
 * under the config lock and stores the result, cast to `type`, in *value.
 * Returns whatever bus->ops->read() returned.
 */
#define PCI_OP_READ(size, type, len) \
int noinline pci_bus_read_config_##size \
(struct pci_bus *bus, unsigned int devfn, int pos, type *value) \
{ \
int res; \
unsigned long flags; \
u32 data = 0; \
if (PCI_##size##_BAD) return PCIBIOS_BAD_REGISTER_NUMBER; \
pci_lock_config(flags); \
res = bus->ops->read(bus, devfn, pos, len, &data); \
*value = (type)data; \
pci_unlock_config(flags); \
return res; \
}
/*
 * Generates pci_bus_write_config_<size>(): same alignment check and
 * locking as the read variant, but forwards `value` to bus->ops->write().
 */
#define PCI_OP_WRITE(size, type, len) \
int noinline pci_bus_write_config_##size \
(struct pci_bus *bus, unsigned int devfn, int pos, type value) \
{ \
int res; \
unsigned long flags; \
if (PCI_##size##_BAD) return PCIBIOS_BAD_REGISTER_NUMBER; \
pci_lock_config(flags); \
res = bus->ops->write(bus, devfn, pos, len, value); \
pci_unlock_config(flags); \
return res; \
}
/* Instantiate the six accessors: byte/word/dword, read and write. */
PCI_OP_READ(byte, u8, 1)
PCI_OP_READ(word, u16, 2)
PCI_OP_READ(dword, u32, 4)
PCI_OP_WRITE(byte, u8, 1)
PCI_OP_WRITE(word, u16, 2)
PCI_OP_WRITE(dword, u32, 4)
EXPORT_SYMBOL(pci_bus_read_config_byte);
EXPORT_SYMBOL(pci_bus_read_config_word);
EXPORT_SYMBOL(pci_bus_read_config_dword);
EXPORT_SYMBOL(pci_bus_write_config_byte);
EXPORT_SYMBOL(pci_bus_write_config_word);
EXPORT_SYMBOL(pci_bus_write_config_dword);
res = bus->ops->write(bus, devfn, pos, len, value);
bus->ops->write
这个write是不是就是我要找的往IO端口写操作?
/* Low-level architecture-dependent routines */
/*
 * Table of function pointers through which config-space accesses are
 * dispatched; the pci_bus_{read,write}_config_* accessors call
 * bus->ops->read / bus->ops->write.
 */
struct pci_ops {
/* NOTE(review): add_bus/remove_bus/map_bus are not called anywhere in this excerpt; semantics unverified. */
int (*add_bus)(struct pci_bus *bus);
void (*remove_bus)(struct pci_bus *bus);
void __iomem *(*map_bus)(struct pci_bus *bus, unsigned int devfn, int where);
/* Read `size` bytes at config offset `where` of device `devfn` on `bus` into *val. */
int (*read)(struct pci_bus *bus, unsigned int devfn, int where, int size, u32 *val);
/* Write the low `size` bytes of `val` to config offset `where` of device `devfn` on `bus`. */
int (*write)(struct pci_bus *bus, unsigned int devfn, int where, int size, u32 val);
};
pci_ops里都是函数指针了
这种函数指针不太好找源头了,只要把函数的地址给它,它就可以调用那个函数。
所以,要想知道它具体执行的是哪段代码,就不太好找了。我还没找到。
看来,直接读linux操作系统的源码,找到一些非常底层,非常细节的信息还是比较困难的。因为这里有大量的宏定义的使用,导致我们使用字符串查找去跟踪函数调用的方法失败了。
不过,从代码的上下文来猜测的话,应该是读PCI的配置信息的。
其实读PCI的配置信息可以直接从IO端口读。
我们可以从底层出发,直接自己写IO端口的读写程序,写完以后再尝试来参考linux操作系统的相关代码。
直接从IO端口读在内核的:arch/x86/pci/early.c
文件内,有从IO端口读取PCI配置信息的代码
// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/pci.h>
#include <asm/pci-direct.h>
#include <asm/io.h>
#include <asm/pci_x86.h>
/* Direct PCI access. This is used for PCI accesses in early boot before
   the PCI subsystem works. */
/*
 * Read the 32-bit configuration register at `offset` of function
 * bus/slot/func: the address is written to port 0xcf8 and the value is
 * then fetched from port 0xcfc.
 */
u32 read_pci_config(u8 bus, u8 slot, u8 func, u8 offset)
{
	outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8);
	return inl(0xcfc);
}
/*
 * Read one byte of configuration space at `offset` of function
 * bus/slot/func.  The address goes to port 0xcf8; the byte is read from
 * the data window at 0xcfc, with (offset & 3) selecting which of the four
 * data ports holds the requested byte.
 */
u8 read_pci_config_byte(u8 bus, u8 slot, u8 func, u8 offset)
{
	u8 v;
	outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8);
	v = inb(0xcfc + (offset&3));
	return v;
}
/*
 * Read a 16-bit configuration value at `offset` of function
 * bus/slot/func.  The address goes to port 0xcf8; the word is read from
 * 0xcfc or 0xcfe, chosen by (offset & 2).
 */
u16 read_pci_config_16(u8 bus, u8 slot, u8 func, u8 offset)
{
	u16 v;
	outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8);
	v = inw(0xcfc + (offset&2));
	return v;
}
/*
 * Write the 32-bit value `val` to the configuration register at `offset`
 * of function bus/slot/func: address to port 0xcf8, data to port 0xcfc.
 */
void write_pci_config(u8 bus, u8 slot, u8 func, u8 offset,
		      u32 val)
{
	const u32 cfg_addr = 0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset;

	outl(cfg_addr, 0xcf8);
	outl(val, 0xcfc);
}
/*
 * Write one byte `val` to configuration space at `offset` of function
 * bus/slot/func.  The address goes to port 0xcf8; the byte is written to
 * the data port 0xcfc + (offset & 3).
 */
void write_pci_config_byte(u8 bus, u8 slot, u8 func, u8 offset, u8 val)
{
	outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8);
	outb(val, 0xcfc + (offset&3));
}
/*
 * Write the 16-bit value `val` to configuration space at `offset` of
 * function bus/slot/func.  The address goes to port 0xcf8; the word is
 * written to the data port 0xcfc + (offset & 2).
 */
void write_pci_config_16(u8 bus, u8 slot, u8 func, u8 offset, u16 val)
{
	outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8);
	outw(val, 0xcfc + (offset&2));
}
/*
 * Returns 1 when pci_probe has PCI_PROBE_CONF1 set and PCI_PROBE_NOEARLY
 * clear, 0 otherwise.
 */
int early_pci_allowed(void)
{
	if ((pci_probe & (PCI_PROBE_CONF1|PCI_PROBE_NOEARLY)) != PCI_PROBE_CONF1)
		return 0;
	return 1;
}
这个代码我们是可以移植到30天操作系统harios上的,把这个代码中的outl和inl换成harios内的io_out,io_in函数就可以了。
也就是说,我们可以按照以上代码读取,或者写入到配置信息了。
这里的配置信息到底是什么呢?,就是如下图中的4x16=64Bytes的信息
配置信息
参考:https://blog.csdn.net/qq_31799983/article/details/106976145?spm=1001.2101.3001.6650.8&utm_medium=distribute.pc_relevant.none-task-blog-2~default~BlogCommendFromBaidu~default-8-106976145-blog-80163665.pc_relevant_default&depth_1-utm_source=distribute.pc_relevant.none-task-blog-2~default~BlogCommendFromBaidu~default-8-106976145-blog-80163665.pc_relevant_default&utm_relevant_index=11
通过读这个配置信息的class_code字段,我们就知道PCI连接的设备是网卡?还是显卡?还是硬盘?还是声卡?
这个配置信息的Device ID,Vendor ID,表明PCI连接的设备的型号和制造厂商.
这个配置信息的Base Address 0 里存储了PCI连接的设备的地址映射内存中的地址。
总之,只要读到这个配置信息,我们就可以找到网卡,找到网卡,才能控制网卡向外发送信息,接收信息等。
那么具体如何读取这个配置信息呢?通过I/O口0xCF8和0xCFC,通过这两个端口,就可以读取到配置信息了。如下两行代码:
io_out32(0xCF8, addr);// 把配置信息的地址addr输出到I/O端口0xCF8
indata = io_in32(0xCFC); //从I/O端口0xCFC获取到配置信息。
这里面的addr是什么?是配置信息所在的地址,可以这样生成:
unsigned int bus_max=0xff;
unsigned int dev_max=0x1f;
unsigned int func_max=0x07;
// 遍历配置信息
for(bus=0;bus<=bus_max; bus)
{
for(dev=0;dev<=dev_max; dev)
{
for(func=0;func<=func_max; func)
{
// 生成配置信息的地址
unsigned int addr = 0x80000000L | (bus<<16) | (dev<<11) | (func<<8) | (0<<2);
io_out32(0xCF8, addr);
indata = io_in32(0xCFC);
}
}
}
这里生成配置信息的地址:0x80000000 | (bus<<16) | (dev<<11) | (func<<8) | (0<<2);
这是什么意思呢?
首先这是一个8*4=32bits的数,4个bytes,这4个bytes的意义:
最高位31位是使能位,要设置成1,才能表示对PCI的配置信息操作。如果此位为0,对0xCFC端口的访问就不会被当作配置空间的访问,而是被当作普通的I/O端口访问。
30--24是保留位,应当写0。
23--16位是总线号,15--11位是设备号,10--8位是功能号,7--2位是配置信息中某条信息的偏移地址。
所以,
0x80000000 | (bus<<16) | (dev<<11) | (func<<8) | (offset<<2);
的意思是对配置信息中的总线号为bus,设备号为dev,功能号为func所指定的配置信息中的第offset个4字节的信息进行操作。
第0个4字节的信息,其实就是配置信息的第一行,Device ID 和Vendor ID.
那么第三行就是class code,第4行就是header type 了。
所以,我们要访问所有的配置信息,只用改变offset的值,使其为分别为0--15,就可以分别读取到所有的配置信息了。
那么地址中的bus,dev,func是什么意思呢?
跟CPU连接的PCI有很多,用bus, dev,func就把不同的PCI区分开了。那么与当前CPU连接的PCI的bus, dev,func分别是多少呢?
也就是说,bus, dev,func到底填多少合适?到底网卡连接的pci对应的bus,dev,func应该是多少呢?
可以写for循环去搜索,当bus,dev,func取到合适的值时,我们读取到的配置信息的class_code应该是02。
因为class_code是02时,表示PCI连接的设备是网卡。
所以,我们在以上代码中写了for循环,去遍历所有的bus,dev,func的值,看看哪些位置有pci,并且这个pci连接的是网卡。
那么如何知道哪些bus,dev,func的值对应的有pci? 只用看其对应的配置信息的第1行,如果第1行的DeviceID 和 Vendor ID不是0xFFFF,就说明有配置信息。
有配置信息,说明PCI连接的有设备,但不一定是网卡,还可能是显卡,硬盘等。
所以,我们还要进一步看其class_code的值,如果是02,就表示是网卡。
另外,遍历总线号bus时,由于bus的数字是存储于addr的23--16位的,一共8个bit位,所以,其取值范围为0--255,
遍历设备号dev时,由于其存储于addr的15--11位,共5位,所以,其取值范围为0--31.
遍历功能号func时,由于其存储于addr的10-8位,共3位,所以,其取值范围为0-7.
那么最终,我们的代码为:
/*
 * check_pci - scan the PCI configuration space and print what is found.
 * @buf_back: buffer of the sheet the text is drawn into
 * @binfo:    boot information; only binfo->scrnx is read here
 *
 * Tries every bus (0-255), device (0-31) and function (0-7).  For each
 * combination the first configuration dword is read through ports
 * 0xCF8/0xCFC; if the low 16 bits (vendor ID) are not 0xffff and the dword
 * is not 0, a function is present and one text line with bus, dev, func,
 * vendor ID, device ID, header type and class code is drawn.
 *
 * NOTE(review): the printf conversions in the row format had been stripped
 * from the listing; the widths used below (%02d / %04x / %02x) are a
 * reconstruction - adjust them if the original differed.
 */
void check_pci(unsigned char *buf_back,struct BOOTINFO *binfo)
{
	char s[200];
	unsigned int indata;
	int bus,dev,func;
	unsigned int bus_max=0xff;
	unsigned int dev_max=0x1f;
	unsigned int func_max=0x07;
	/* screen position of the first data row; the header goes one row above */
	int start_row=250;
	int start_col=50;
	int row_inc=0;

	sprintf(s, "buf ,dev ,func ,vender_id ,device_id ,header_type,class_code ");
	putfonts8_asc(buf_back, binfo->scrnx, start_col, start_row-1*16, COL8_FFFFFF, s);
	for(bus=0;bus<=bus_max;bus++)
	{
		for(dev=0;dev<=dev_max;dev++)
		{
			for(func=0;func<=func_max;func++)
			{
				/* register 0: device ID (high 16 bits) and vendor ID (low 16 bits) */
				unsigned int addr = 0x80000000L | (bus<<16) | (dev<<11) | (func<<8) | (0<<2);
				io_out32(0xCF8, addr);
				indata = io_in32(0xCFC);
				/* is there a PCI function at this bus/dev/func? */
				if( ((indata & 0xffff) != 0xffff) && (indata !=0))
				{
					/* register 3 (4th dword): header type is bits 23..16 */
					unsigned int addr1 = addr | (3<<2);
					io_out32(0xCF8,addr1);
					unsigned int header_type = (io_in32(0xCFC)&0x00ff0000)>>16;
					/* register 2 (3rd dword): class code is bits 31..24; 0x02 means network controller */
					unsigned int addr2 = addr | (2<<2);
					io_out32(0xCF8,addr2);
					unsigned int class_code = (io_in32(0xCFC)&0xff000000)>>24;
					/* print bus, dev, func, vendor id, device id, header type, class code */
					sprintf(s, "%02d ,%02d ,%02d ,0x%04x ,0x%04x ,0x%02x ,0x%02x ",bus,dev,func,indata&0xffff,(indata&0xffff0000)>>16,header_type,class_code);
					putfonts8_asc(buf_back, binfo->scrnx, start_col, start_row+row_inc*16, COL8_FFFFFF, s);
					row_inc++;
				}
			}
		}
	}
	return;
}
把函数check_pci添加到bootpack.c的for循环之前:
check_pci(buf_back,binfo);//打印pci信息
// 图层刷新
sheet_refresh(sht_back, 0, 0, sht_back->bxsize, sht_back->bysize);
显示效果如下
可以看到,在bus=0,dev=0,func=0时,对应的pci所连接的设备的vender_id是0x8086,表示是Intel的,device_id是0x1237,header_type是0,class_code是0x06,表明不是网卡,0x06具体表示什么?看下表:
表明它是桥设备。
根据这个表,咱们打印出的第3行是0x01, 表示海量存储器
根据这个表,咱们打印出的第4行是0x03, 表示显示控制器,即显卡
根据这个表,咱们打印出的第5行是0x02, 表示网络控制器,即网卡。
那么到这里,我们在bus号为0,dev号为3,func号为0的位置,找到了一张网卡。它的厂商ID是0x10EC(Realtek),设备ID是0x8029。
好了,到这里,今天我们通过I/O端口,读取到了网卡对应的PCI.
下一步就可以通过网卡对应的PCI来控制网卡收发信息。
附录:在30天操作系统上写网卡驱动需要从头开始,是比较琐碎的事,为此,我搜索了一定的源码和资料。
源码就是linux系统的各版本内核的源码,它带有很多网卡的驱动。
资料就是对pci,总线,以及内存映射,网络通信协议等资料。
在后续的驱动编写中,可能要反复的来复习这些资料。
比如:http://www.lab-z.com/2pciaccess/
这里面说了两种方法访问配置信息,第一种就是通过端口的0xCF8,0xCFC;第二种是通过内存映射。
比如直接访问如下的内存区域即可得到:
我还没有试。留着以后参考中。
还有一个细节,汇编对端口的读写,要特别注意。
比如我这里一开始端口的读写程序有bug,所以读取信息一直是错误的。
最后把 端口的读写程序从
; "Before" listing: the buggy version of io_out32 discussed in the text.
; The OUT instruction takes the port number in DX, so the EDX operand
; below is the defect; it is kept on purpose to show what was wrong.
_io_out32: ; void io_out32(int port, int data);
        MOV EDX,[ESP+4] ; port
        MOV EAX,[ESP+8] ; data
        OUT EDX,EAX
        RET
改为
; Corrected io_out32: write the 32-bit value `data` to I/O port `port`.
; The arguments are read from the stack at ESP+4 and ESP+8; the port
; number goes into DX and the data into EAX for OUT.
_io_out32: ; void io_out32(int port, int data);
        MOV DX,[ESP+4] ; port
        MOV EAX,[ESP+8] ; data
        OUT DX,EAX
        RET
后,就正常了,
因为端口的地址0xCF8,不需要EDX,用EDX反而不能表示端口了,就无法给实现往端口上输出信息。
关于端口读写,还有用嵌入汇编的方式,比如:http://blog.chinaunix.net/uid-186409-id-2822610.html
另外,使用端口对PCI配置信息的读取有个例子不错,可以参考:
https://cloud.tencent.com/developer/article/1199972
这里写了一个window上的代码和一个linux上的代码。
比如我用linux上的代码后,输入的结果如下:
运行的时候,需要用sudo。因为这里涉及到IO读取,所以需要root权限。
另外运行这个例子的时候,我就有个疑问:为什么这个例子明明是个应用程序,不是操作系统本身的代码,它却可以访问IO端口?
一般应用程序的代码,由于GDT的定义,应该是不能操作I/O端口的。
后来的代码里发现了
if ( iopl(3) < 0 )
{
printf("iopl set error\n");
return -1;
}
iopl函数用于改变当前进程的I/O特权级,从而获取io端口的访问权限:iopl(3)成功时返回0,失败时返回-1(所以上面的代码判断的是返回值小于0)。调用成功以后,这个进程就可以直接用in/out指令对io端口进行访问了,这也是运行时需要root权限的原因。
总之:应用程序要访问操作系统所占用的那些资源,都得先通过系统调用(系统调用一般通过中断或类似的机制进入内核)向操作系统申请。通过这种方式,我们就可以设计权限,加以控制,保证操作系统代码本身的安全性。
以下是一些样例:
在30天操作系统的代码上,添加读pci配置的程序check_pci的过程并不顺利,一开始写到屏幕上的信息无法刷新出来,只能通过鼠标移动过去,把这些信息“擦”出来。后来发现是使用刷新函数的时候刷新错了对象,应该刷新sht_back,却刷新了sht_win,sht_win表示的是task_a窗口,如下图。
第一次成功
上图中显示了很多0xffff,这是因为一开始程序什么也没有输出,所以我们调试的时候,就把所有的信息都输出到屏幕上了;我把没有pci时的信息显示到左边,有pci时的信息显示到了右边。
更改刷新函数后,可以正常刷新了
用MAC上的qemu打开了30天自制的操作系统,发现多了一个设备0x100e的设备
增加打印PCI所连接设备的header type和class_code信息
在MAC上
增加打印,在mac上的qemu上运行30天操作系统代码
可以看到,这里一共6个设备,比在windows上的qemu多了一个设备,多了一些PCI桥设备。网卡的型号变成100e了。
既然找到网卡了,下一步就是启动网卡,读取网卡的mac地址,命令网卡发送数据等了。
具体怎么实现呢?
1比如重启网卡设备,就是给网卡的某寄存器写入命令,用I/O方法,或者内存映射的方法。具体详细信息可以查看网卡信息的datasheet。
重启网卡设备,则是通过向映射后的网卡的相应寄存器写入命令,其通过映射后的首地址及相应的寄存器偏移量找到该寄存器的位置,然后通过函数writeb()写该寄存器。有关相关寄存器对应的偏移量,一般是通过网卡的相关的datasheet获得。
2.如果要获取网卡的MAC地址,则一般通过函数readb()读取首地址开始的前六个字节(MAC地址共6字节,具体的寄存器偏移量以网卡的datasheet为准)。
后面我们就获取到网卡的MAC地址,然后给网卡的寄存器里写值,来命令网卡收发信息。或者把网卡动作与操作系统的中断绑定起来。
,免责声明:本文仅代表文章作者的个人观点,与本站无关。其原创性、真实性以及文中陈述文字和内容未经本站证实,对本文以及其中全部或者部分内容文字的真实性、完整性和原创性本站不作任何保证或承诺,请读者仅作参考,并自行核实相关内容。文章投诉邮箱:anhduc.ph@yahoo.com