How to write a Linux driver for the GPIO device on an OMAP-L138 board

“TI’s OMAP-L138 processor is a low-power applications processor based on an ARM926EJ-S and a C674x DSP core”. It offers very low power consumption and is widely used in industry. In this article I will show you how to write a Linux driver for the GPIO device on a board based on this processor. We will use the AD5420 DAC (digital-to-analog converter) for this demonstration.

Before writing any code, let’s learn something about the hardware background. As we can see from the datasheet of the AD5420, to make it work, all we need to do is control three special pins (assuming all pins have been connected correctly), labeled LATCH, SCLK, and SDIN. The GPIO peripheral provides general-purpose pins that can be configured as either inputs or outputs. The OMAP-L138 processor has up to 9 banks of GPIO pins, with each bank containing 16 pins. In this project, we used the third bank of pins (bank 2) to connect to the DAC. The pins are GPIO2_10, GPIO2_11, and GPIO2_12.

First of all, we have to configure the PINMUX. The OMAP-L138 development kit already provides an example, so we just need to modify two files. In the file “arch/arm/mach-davinci/include/mach/mux.h”, we add DA850_GPIO2_10, DA850_GPIO2_11, and DA850_GPIO2_12 to the “enum davinci_da850_index” section. And in the file “arch/arm/mach-davinci/da850.c”, we add the following code:

MUX_CFG(DA850, GPIO2_10, 5, 20, 15, 8, false) // connected with SCLK
MUX_CFG(DA850, GPIO2_11, 5, 16, 15, 8, false) // connected with LATCH
MUX_CFG(DA850, GPIO2_12, 5, 12, 15, 8, false) // connected with SDIN

The format of the MUX_CFG is (for a thorough understanding, we can check the processor’s technical reference manual):

DA850 -> SOC name
GPIO2_10 -> Pin desc
5 -> PINMUX5
20 -> Offset value of PINMUX5
15 -> Mask default value
8 -> Mode
false -> for debugging 

We also need to check that the GPIO driver is enabled in the Linux kernel. Then we rebuild the kernel.

Now we can write the actual device driver. Our driver implements three main file operations: “ioctl”, “read”, and “write”. We also have to implement the “init_module” and “cleanup_module” functions, and we should provide the “MODULE_LICENSE”, “MODULE_DESCRIPTION” and “MODULE_AUTHOR” declarations at the end of the source code. The following is the complete source code. Some comments have been added to make it readable.

[pre]



/* 
 *  Linux driver for AD5420
 * 
 *  12-Bit, Serial Input, 4 mA to 20 mA   
 * 
 *  The system contains 6 devices, so we will use the daisy-chain mode
 *
 */

#include 	<linux/module.h>
#include 	<linux/version.h>
#include 	<linux/delay.h>
#include 	<linux/irq.h>
#include 	<linux/interrupt.h>
#include 	<linux/completion.h>
#include    <mach/cputype.h>
#include    <mach/hardware.h>
#include    <mach/mux.h>
#include    <asm/gpio.h>
#include    <asm/uaccess.h>
#include 	<linux/fs.h>

/*
 * Driver-wide constants.
 * DEVICE_NAME and DEVICE_MAJOR are the values passed to register_chrdev()
 * and unregister_chrdev(); the *_PIN values are the GPIO numbers handed to
 * the gpio_* calls (bank * 16 + pin within the bank).
 */
#define  DEVICE_NAME      "/dev/myAD5420"   // the name of the device
#define  DEVICE_MAJOR     253    // major number of the device
#define  MAX_DEVICE_NUM   6      // 6 AD5420 chips in the daisy-chain mode
#define  SCLK_PIN         42     // connected to GPIO 2[10]
#define  LATCH_PIN        43     // connected to GPIO 2[11]
#define  SDIN_PIN         44     // connected to GPIO 2[12]
#define  CMD_BIT_NUM      8      // 8 bits
#define  DATA_BIT_NUM     16     // 16 bits    
#define  TOTAL_PIN_NUM    3      // 3 pins used

// GPIO numbers of all pins the driver claims, in SCLK, LATCH, SDIN order
static int          GPIO_PIN_NUMS[TOTAL_PIN_NUM]  =   {42,43,44};
                                  // 2*16 + 10, 2*16 + 11, 2*16 + 12

/*
 * Argument block copied from user space by ad5420_ioctl(): selects one
 * device in the chain and the value to cache for it. The user-space
 * program must declare this struct with the same layout.
 */
struct  data_unit{
    unsigned char device_index;       // between [0,MAX_DEVICE_NUM-1]
    unsigned int  out_value;          // 12 bits valid
};

static  unsigned int  global_value_array[MAX_DEVICE_NUM];
static int ad5420_ioctl(struct inode *inode, struct file *file,
                        unsigned int command, unsigned long args);
static ssize_t ad5420_read(struct file *, char *, size_t, loff_t *);
static ssize_t ad5420_write(struct file *, char *, size_t, loff_t *);

static struct file_operations fops = {
    ioctl: ad5420_ioctl,
    read:  ad5420_read,
    write: ad5420_write
};

void   InitDevice(void);        
void   WriteUnit(unsigned int out_value);
void   ShiftCommand(unsigned char command);
void   ShiftData(unsigned int data);
void   OutPutData();

int init_module()
{
    int i,status,major;

    // configure the pins
    for(i = 0; i < TOTAL_PIN_NUM; ++i) {
        status = davinci_cfg_reg(GPIO_PIN_NUMS[i]);
        if (status < 0) {
            printk("Pin could not be muxed for GPIO functionality %d\n",
                       GPIO_PIN_NUMS[i]);  // in the driver we use printk
            return status;
        }
    }

    // request GPIO resource
    char gpio_pin_name[100];
    for(i = 0; i < TOTAL_PIN_NUM; ++i) {
        sprintf(gpio_pin_name, "gpio_%d", GPIO_PIN_NUMS[i]);
        status = gpio_request(GPIO_PIN_NUMS[i], gpio_pin_name);
        if (status < 0) {
            printk("Can not open GPIO %d\n", GPIO_PIN_NUMS[i]);
            return status;
        }
    }

    // set GPIO's input/output directions
    for(i = 0; i < TOTAL_PIN_NUM; ++i)
        gpio_direction_output(GPIO_PIN_NUMS[i] ,0);

    // set initial values
    for(i = 0; i < TOTAL_PIN_NUM; ++i)
        gpio_set_value(GPIO_PIN_NUMS[i] ,0);

    InitDevice();

    // register the device
    major = register_chrdev(DEVICE_MAJOR, DEVICE_NAME, &fops);
    if (major < 0) {
        printk ("Registering the character device failed with %d\n", major);
        return major;
    }

    // clean the global value array
    for(i = 0; i < TOTAL_PIN_NUM; ++i)
        global_value_array[i] = 0;

    return 0;
}

/*
 * Module exit point: undo init_module() in reverse order.
 *
 * The character device is unregistered first so that no new file
 * operation can reach the GPIO lines after they have been released.
 */
void cleanup_module(void)
{
    int i;

    // unregister the device
    unregister_chrdev(DEVICE_MAJOR, DEVICE_NAME);

    // release GPIO resource, last claimed first
    for (i = TOTAL_PIN_NUM - 1; i >= 0; --i)
        gpio_free(GPIO_PIN_NUMS[i]);
}

/*
 * read() handler. This driver implements no read-back from the AD5420,
 * so the handler transfers nothing and always returns 0 (end of file).
 * None of the arguments are used.
 */
static ssize_t ad5420_read(struct file *filp, char *buf,
                           size_t count, loff_t *f_ops)
{
    return 0;
}

/*
 * write() handler: shift the cached value of every device into the
 * daisy chain, then pulse LATCH via OutPutData().
 *
 * The user buffer is not read; the data comes from global_value_array,
 * which is filled beforehand through ioctl(). The call therefore reports
 * the whole request as consumed by returning count, so a caller that
 * loops until all bytes are written does not trigger the output again.
 */
static ssize_t ad5420_write(struct file *filp,char *buf,
                            size_t count,loff_t *f_ops)
{
    int i;

    // start the frame with LATCH and SCLK low
    gpio_set_value(LATCH_PIN, 0);
    gpio_set_value(SCLK_PIN, 0);
    ndelay(50000);

    // one command + data word per device, in array order
    for (i = 0; i < MAX_DEVICE_NUM; ++i)
        WriteUnit(global_value_array[i]);

    OutPutData();

    return count;
}

static int ad5420_ioctl(struct inode *inode, struct file *file,
                  unsigned int command, unsigned long args) {
    int     device_index_;
    struct  data_unit  current_unit;

    if (copy_from_user(&current_unit,(const char *)args,sizeof(current_unit)) 
        != 0) {
        printk("copy_from_user failed \n");
        return 0;
    }

    device_index_ =  current_unit.device_index;
    if ((device_index_ < 0) || (device_index_ > (MAX_DEVICE_NUM - 1))) {
        printk("device index should between [0,5] \n");
        return 0;
    }

    global_value_array[device_index_] = current_unit.out_value;

    return 1;
}

// learn the write mode timing diagram from the datasheet
void ShiftCommand(unsigned char command) {
    int i;
    unsigned char command_;
    unsigned char command_flag;

    command_ = command;
    for(i = 0; i < CMD_BIT_NUM; ++i) {
        command_flag = command_ &  0x80;
        if (0 == command_flag)
            gpio_set_value(SDIN_PIN ,0);                 
        else
            gpio_set_value(SDIN_PIN ,1);                 
        ndelay(50000);

        gpio_set_value(SCLK_PIN ,1);    
        ndelay(50000);

        gpio_set_value(SCLK_PIN ,0);    
        ndelay(50000);

        command_   =   command_ << 1;
    }
}

void ShiftData(unsigned int data) {
    int i;
    unsigned int data_;
    unsigned int data_flag;

    data_ = data;
    for(i = 0; i < DATA_BIT_NUM; ++i) {
        data_flag = data_ & 0x8000;
        if (0 == data_flag)
            gpio_set_value(SDIN_PIN ,0);                 
        else
            gpio_set_value(SDIN_PIN ,1);                 
        ndelay(50000);

        gpio_set_value(SCLK_PIN ,1);    
        ndelay(50000);

        gpio_set_value(SCLK_PIN ,0);    
        ndelay(50000);

        data_ = data_ << 1;
    }
}

/*
 * Finish a frame: park SDIN and SCLK low, then pulse LATCH high and back
 * low. Presumably this pulse makes the chips take over the bits shifted
 * in so far -- confirm against the datasheet timing diagram.
 */
void OutPutData() {
    gpio_set_value(SDIN_PIN, 0);   
    gpio_set_value(SCLK_PIN ,0);   
    // rising edge on LATCH
    gpio_set_value(LATCH_PIN,1);   
    // hold LATCH high before releasing it
    ndelay(50000);
    gpio_set_value(LATCH_PIN,0);   
}

/*
 * Send the reset sequence once per device in the chain.
 *
 * Each pass shifts out command 0x56 with data 0x0001 (the reset word,
 * see the chip's datasheet) followed by an all-zero no-op word, pulses
 * LATCH through OutPutData(), and then waits before the next pass.
 */
void   InitDevice(void) {
    int pass;

    for (pass = 0; pass < MAX_DEVICE_NUM; ++pass) {
        // start the frame with LATCH and SCLK low
        gpio_set_value(LATCH_PIN, 0);
        gpio_set_value(SCLK_PIN, 0);
        ndelay(50000);

        // reset word
        ShiftCommand(0x56);
        ShiftData(0x0001);

        // nop word
        ShiftCommand(0x00);
        ShiftData(0x0000);

        OutPutData();

        ndelay(1000000);
    }
}

/*
 * Shift one device's word into the chain: command byte 0x01 followed by
 * the low 16 bits of out_value. LATCH is not touched here; the caller
 * pulses it after all devices have been shifted.
 * NOTE(review): 0x01 is presumably the data-register address -- confirm
 * against the datasheet.
 */
void   WriteUnit(unsigned int out_value) {
    ShiftCommand(0x01);
    ShiftData(out_value);
}

MODULE_LICENSE("GPL v2");
MODULE_DESCRIPTION("AD5420 driver");
MODULE_AUTHOR("hlding");

[/pre]
In order to use our device driver, we should run these two commands:


insmod AD5420.ko
mknod /dev/myAD5420 c 253 0

And here is our tiny test program:

[pre]

#include    <stdio.h>
#include    <fcntl.h>
#include    <unistd.h>
#include    <sys/ioctl.h>
#include 	<linux/types.h>

#define MAX_DEVICE_NUM 6 // 6 AD5420 chips in the daisy-chain mode
#define DEVICE_NAME "/dev/myAD5420" // the name of the device
#define SET_CHANNEL_DATA 1

/* Argument block passed to the driver through ioctl(); it is declared
 * with the same fields, in the same order, as the driver's struct. */
struct data_unit{
    unsigned char device_index; // between [0,MAX_DEVICE_NUM-1]
    unsigned int out_value;     // 12 bits valid
};

unsigned int test_value_array[MAX_DEVICE_NUM]= {0x7f,0x7f,0x7f,0x7f,0x7f,0x7f};

/*
 * Test program: store one value per device through ioctl(), then trigger
 * the output with a zero-length write().
 *
 * Returns 0 on success and -1 if the device cannot be opened or any
 * driver call fails.
 */
int main(int argc, char *argv[])
{
    int i, ret;
    struct data_unit current_unit;
    int device_handle = -1;

    (void)argc;
    (void)argv;

    // open the device handle
    device_handle = open(DEVICE_NAME, O_RDWR);
    if (device_handle < 0) {
        printf("open the device failed!\n");
        return -1;
    }

    // prepare data for every device in the chain, including the last one
    for(i = 0; i < MAX_DEVICE_NUM; ++i) {
        current_unit.device_index = i;
        current_unit.out_value = test_value_array[i];
        ret = ioctl(device_handle, SET_CHANNEL_DATA, &current_unit);
        // the driver returns 1 on success and 0 on failure; the C library
        // returns -1 when the call itself fails
        if (ret <= 0) {
            printf("ioctl failed!\n");
            close(device_handle);
            return -1;
        }
    }

    // output: the driver ignores the buffer, so a zero-length write is enough
    ret = write(device_handle, NULL, 0);
    if (ret < 0) {
        printf("write failed!\n");
        close(device_handle);
        return -1;
    }

    // close the device handle
    close(device_handle);

    return 0;
}

[/pre]
 

References

OMAP-L138 C6000 DSP+ARM Processor Technical Reference Manual

Click to access spruh77b.pdf

AD5410_5420 data sheet

Click to access AD5410_5420.pdf

OMAP-L138 Software Design Guide
http://processors.wiki.ti.com/index.php/OMAP-L138_Software_Design_Guide#Linux_Drivers:

Linux Kernel and Driver Development Training

Click to access linux-kernel-slides.pdf

Avoid memory leak using shared_ptr

Shared_ptr was first designed and implemented in boost, and finally became part of the C++11 Standard as std::shared_ptr. Boost defines it as, “The shared_ptr class template stores a pointer to a dynamically allocated object, typically with a C++ new-expression. The object pointed to is guaranteed to be deleted when the last shared_ptr pointing to it is destroyed or reset.” This auto-delete character makes it possible to avoid memory leak.

It’s not hard to use shared_ptr. We just create an object, pass it to the shared_ptr, use the pointer as a common one, and have no need to care about when to delete it. Here is a simple example:

#include <iostream>
#include <memory>

// Minimal value holder used to demonstrate shared ownership:
// it stores a single int that starts at 0.
class UserDefinedClass{
  public:
    UserDefinedClass();
    int     GetValue();
    void    SetValue(const int value);

  private:
    int value_;  // the stored value
};

// Start with a stored value of 0.
UserDefinedClass::UserDefinedClass() : value_(0) {}

// Return the stored value.
int UserDefinedClass::GetValue() {
    return this->value_;
}

// Replace the stored value.
void UserDefinedClass::SetValue(const int value) {
    this->value_ = value;
}

int main() {
    std::shared_ptr<UserDefinedClass> sp1;
    std::shared_ptr<UserDefinedClass> sp2(new UserDefinedClass);

    sp1 = sp2;
    sp1->SetValue(10);
    sp2->SetValue(20);

     if (sp1) std::cout << "sp1: " << sp1->GetValue() << '\n';
     if (sp2) std::cout << "sp2: " << sp2->GetValue() << '\n';

     return 0;
}

In the above code, we can eliminate the “new” operation by using “make_shared” to get an efficiency benefit by consolidating allocation.

When working with arrays, we can combine vector with shared_ptr. The following example shows how to do it.

#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <ctime>
#include <memory>
#include <mutex>
#include <thread>
#include <vector>
#include <pthread.h>
#include <semaphore.h>

using namespace std;

#define   MAX_CACHE_SIZE    100
#define   MAX_BUFFER_SIZE   (1024*1024)

// One queued item: a shared, heap-allocated byte buffer plus the size
// recorded for it by the producer.
struct MemoryUnit{
    int size;
    // there is a space between the two '>' characters
    shared_ptr<vector<char> > buffer;
};

// Queue shared by the producer and consumer threads.
// sem_empty is initialised to MAX_CACHE_SIZE and sem_full to 0 by the
// SharedPtrDemo constructor; mutex_cache is held around every access to
// cached_vector.
struct MemoryCache{
    pthread_mutex_t  mutex_cache;
    sem_t    sem_empty;
    sem_t    sem_full;
    vector<struct MemoryUnit> cached_vector;
};

class SharedPtrDemo{
  public:
    SharedPtrDemo();
    ~SharedPtrDemo();
    void Start();

  private:
    struct MemoryCache  memory_cache;
    pthread_t  producer_thread_id;
    pthread_t  consumer_thread_id;
    void  Producer();
    void  Consumer();

    static void* ProducerEntrance(void* args)  {
        if (NULL == args) return NULL;

        SharedPtrDemo* instance = static_cast<SharedPtrDemo*>(args);
        if (instance)
            instance->Producer();

        return NULL;
    }

    static void* ConsumerEntrance(void* args) {
        if (NULL == args) return NULL;

        SharedPtrDemo* instance = static_cast<SharedPtrDemo*>(args);
        if (instance)
            instance->Consumer();

        return NULL;
    }
};

// Set up the queue's synchronisation objects: MAX_CACHE_SIZE free slots,
// zero filled slots, and the mutex guarding the vector.
SharedPtrDemo::SharedPtrDemo() {
    struct MemoryCache &cache = memory_cache;

    sem_init(&cache.sem_empty, 0, MAX_CACHE_SIZE);
    sem_init(&cache.sem_full, 0, 0);
    pthread_mutex_init(&cache.mutex_cache, NULL);
}

// Destroy the synchronisation objects created by the constructor.
// The worker threads are neither stopped nor joined here.
SharedPtrDemo::~SharedPtrDemo() {
    struct MemoryCache &cache = memory_cache;

    sem_destroy(&cache.sem_full);
    sem_destroy(&cache.sem_empty);
    pthread_mutex_destroy(&cache.mutex_cache);
}

void SharedPtrDemo::Start() {
    pthread_create(&producer_thread_id,
                   NULL,
                   SharedPtrDemo::ProducerEntrance,
                   (SharedPtrDemo* )this);

    pthread_create(&consumer_thread_id,
                   NULL,
                   SharedPtrDemo::ConsumerEntrance,
                   (SharedPtrDemo* )this);
}

void SharedPtrDemo::Producer() {
    unsigned int  produced_count = 0;

    while(1) {
        ++produced_count;
        struct MemoryUnit produced_unit;
        memset(&produced_unit, 0x00, sizeof(produced_unit));

        srand(time(0)+produced_count);
        int memory_size = rand() % MAX_BUFFER_SIZE + 1;
        produced_unit.buffer = make_shared<vector<char> >(memory_size);

        // put whatever you want to the buffer
        memcpy(&(*produced_unit.buffer)[0], &produced_count,sizeof(produced_count));
        produced_unit.size =  memory_size;

        sem_wait(&memory_cache.sem_empty);
        pthread_mutex_lock(&memory_cache.mutex_cache);
        memory_cache.cached_vector.push_back(produced_unit);
        pthread_mutex_unlock(&memory_cache.mutex_cache);
        sem_post(&memory_cache.sem_full);
    }
}

void SharedPtrDemo:: Consumer() {
    int passed_value = 0;
    unsigned int consumed_count = 0;

    while(1) {
        struct MemoryUnit consumed_unit;
        memset(&consumed_unit, 0x00, sizeof(consumed_unit));

        sem_wait(&memory_cache.sem_full);
        pthread_mutex_lock(&memory_cache.mutex_cache);
        vector<struct MemoryUnit>::iterator vector_it
            = memory_cache.cached_vector.begin();
        consumed_unit = *vector_it;
        memory_cache.cached_vector.erase(vector_it);
        pthread_mutex_unlock(&memory_cache.mutex_cache);
        sem_post(&memory_cache.sem_empty);

        // use the buffer, then it will be destroyed automatically
        memcpy(&passed_value, &(*consumed_unit.buffer)[0], sizeof(passed_value));
        printf("Consumed %d units, and the value = %d\n",
               ++consumed_count, passed_value);
    }
}

// Start the producer and consumer threads and let them run until a
// character is read from standard input.
int main() {
    SharedPtrDemo demo;
    demo.Start();

    // block here while the worker threads run
    getchar();

    // NOTE(review): demo is destroyed on return while the worker threads
    // have not been joined -- confirm this is acceptable for the demo.
    return 0;
}

Convenient as it is, there are some potential dangers when using shared_ptr.

1)Avoid the circular references

The implementation of shared_ptr uses reference counting, and circular references defeat that mechanism: two objects that hold shared_ptrs to each other never see their counts drop to zero. Here is an example:

#include <memory>
#include <iostream>

using namespace std;

class B;
// First half of the deliberate reference cycle: A owns a B through a
// shared_ptr. The constructor and destructor print a trace line so the
// missing destruction is visible in the program output.
class A{
  public:
    shared_ptr<B> b_ptr;
    A() {cout << "A() \n";}
    ~A() {cout << "~A() \n";}
};

// Second half of the deliberate reference cycle: B owns an A through a
// shared_ptr, which closes the loop A -> B -> A.
class B{
  public:
    shared_ptr<A> a_ptr;  // use weak_ptr to break the cycle
    B() {cout << "B() \n";}
    ~B() {cout << "~B() \n";}
};

// Build the cycle on purpose: after the two assignments each object
// holds a shared_ptr to the other one.
int main() {
    shared_ptr<A> a_ = make_shared<A>();
    shared_ptr<B> b_ = make_shared<B>();
    a_->b_ptr = b_;
    b_->a_ptr = a_;

    // a_ and b_ go out of scope here, but the objects still reference
    // each other through their members
    return 0;
}

This program prints only “A()” and “B()”; the destructors of A and B are never called. This is a memory leak, and in a long-lived program such leaks will eventually get the process killed. To break the cycle, we can use weak_ptr.

2)Always use make_shared instead of new

make_shared helps us achieve higher performance and avoid memory leaks in some situations. For example, code like F(std::shared_ptr<T>(new T), g()) might cause a memory leak if g throws an exception, because g() may be called after new T and before the constructor of shared_ptr<T>. This cannot happen in F(std::make_shared<T>(), g()), since two function calls are never interleaved.

3) Avoid using the raw pointer

Calling the get() function to get the raw pointer or passing a reference of a raw pointer to a shared_ptr would be dangerous, because the internal count won’t increase.

We have looked at the memory fragmentation problem before. Will there be the same problem with shared_ptr? Well, shared_ptr uses new to allocate memory on the heap, and some memory usage patterns, such as those with many long-lived small objects, tend to cause memory fragmentation. We can use third-party memory libraries such as jemalloc and tcmalloc to avoid it.

Reference

1)Official definition of shared_ptr class template

http://www.boost.org/doc/libs/1_61_0/libs/smart_ptr/shared_ptr.htm

2)Potential dangers when using boost::shared_ptr

http://stackoverflow.com/questions/701456/what-are-potential-dangers-when-using-boostshared-ptr

3)Why should we almost always use make_shared

GotW #89 Solution: Smart Pointers

4)Performance compared with make_shared and new

http://tech-foo.blogspot.jp/2012/04/experimenting-with-c-stdmakeshared.html

Fix the Memory Fragmentation of Embedded Systems with TCMalloc

While developing a video processing device, we ran into a strange phenomenon: the amount of memory used by our program kept growing. After several weeks of running, the program triggered the OOM (Out of Memory) condition and was killed by the Linux operating system. At first I suspected a memory leak, so I checked every call to malloc() and free(), but found no problem. I then used detection tools such as valgrind, but got the same result. So I decided to dig into the GNU C library’s (glibc’s) malloc implementation, to find out whether we were using it improperly.

In the source code I read, “Also, in practice, programs tend to have runs of either small or large requests, but less often mixtures, so consolidation is not invoked all that often in most programs. And the programs that it is called frequently in otherwise tend to fragment. ” It also mentioned that for long-lived programs, special attention should be paid to the trim threshold and the mmap control parameters.

The memory usage pattern of our program didn’t conform to those rules. It mixed small and large requests, and created many objects that were kept for a long time. After some searching, I found that many people had met a similar problem, and they used other libraries to deal with it.

There are some third-party memory management libraries, such as tcmalloc, jemalloc, hoard, lockless, etc. Two of them are very famous because Google uses tcmalloc and Facebook uses jemalloc. I chose tcmalloc for our project, because it is described as “The fastest malloc we’ve seen; works particularly well with threads and STL”. However, we still saw the memory growing. After some deeper study, I found that we can call “MallocExtension::instance()->ReleaseFreeMemory()” frequently to keep the memory usage stable. To this day, our devices have been running continuously for over two years.

Third-party implementations emphasize fragmentation avoidance, and also have higher performance. The following program gives a good example. It takes 1177 seconds with glibc 2.15 and 308 seconds with tcmalloc 1.8.2, respectively.

#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
#include <string.h>
#include <time.h>

#define MAX_OBJECT_NUMBER       (1024)
#define MAX_MEMORY_SIZE         (1024*100)

// One slot of the benchmark table: a malloc'ed block and its size.
// A NULL data pointer marks the slot as free.
struct BufferUnit{
   int   size;
   char* data;
};

struct BufferUnit   buffer_units[MAX_OBJECT_NUMBER];

void MallocBuffer(int buffer_size) {

for(int i=0; i<MAX_OBJECT_NUMBER; ++i)  {
    if (NULL != buffer_units[i].data)   continue;

    buffer_units[i].data = (char*)malloc(buffer_size);
    if (NULL == buffer_units[i].data)  continue;

    memset(buffer_units[i].data, 0x01, buffer_size);
    buffer_units[i].size = buffer_size;
    }
}

// Free one half of buffer_units and mark its slots as free.
//   left_half_flag == true  -> slots [0, MAX_OBJECT_NUMBER/2)
//   left_half_flag == false -> slots [MAX_OBJECT_NUMBER/2, MAX_OBJECT_NUMBER)
// The two halves do not overlap: the middle slot belongs to the right
// half only.
void FreeHalfBuffer(bool left_half_flag) {
    const int half_index = MAX_OBJECT_NUMBER / 2;
    const int first = left_half_flag ? 0 : half_index;
    const int last  = left_half_flag ? half_index : MAX_OBJECT_NUMBER;  // exclusive

    for(int i=first; i<last; ++i) {
        if (NULL == buffer_units[i].data) continue;

        free(buffer_units[i].data);
        buffer_units[i].data =  NULL;
        buffer_units[i].size = 0;
    }
}

int main() {
    memset(&buffer_units, 0x00, sizeof(buffer_units));
    int decrease_buffer_size = MAX_MEMORY_SIZE;
    bool left_half_flag   =   false;
    time_t  start_time = time(0);
    while(1)  {
        MallocBuffer(decrease_buffer_size);
        FreeHalfBuffer(left_half_flag);
        left_half_flag = !left_half_flag;
        --decrease_buffer_size;
        if (0 == decrease_buffer_size) break;
    }
    FreeHalfBuffer(left_half_flag);
    time_t end_time = time(0);
    long elapsed_time = difftime(end_time, start_time);

    printf("Used %ld seconds. \n", elapsed_time);
    return 1;
}

References

Fragmentation of memory:
https://en.wikipedia.org/wiki/Fragmentation_of_memory

How Firefox fix memory fragmentation problem:
Memory fragmentation

Malloc Internals:
https://sourceware.org/glibc/wiki/MallocInternals